1,352 changes: 1,352 additions & 0 deletions 3_distributed_training/function-calling-sft-dpo/run_training_job.ipynb

Large diffs are not rendered by default.

@@ -0,0 +1,20 @@
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
  deepspeed_multinode_launcher: standard
  offload_optimizer_device: none
  offload_param_device: none
  zero_stage: 1
distributed_type: DEEPSPEED
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
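
# Launch sketch (file and script names below are assumptions; adapt them to this
# repo's layout):
#   accelerate launch --config_file deepspeed_zero1.yaml train.py --config dpo_config.yaml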
@@ -0,0 +1,22 @@
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
  deepspeed_multinode_launcher: standard
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: true
  zero3_save_16bit_model: true
  zero_stage: 3
distributed_type: DEEPSPEED
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
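
# Relative to the ZeRO-1 config above, stage 3 additionally shards gradients and
# model parameters across the 8 processes; zero3_init_flag instantiates the model
# directly in sharded form, and zero3_save_16bit_model gathers the full bf16
# weights when saving.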
@@ -0,0 +1,207 @@
from dataclasses import dataclass, field
import logging
import re
import time
from typing import cast

import torch
from datasets import Dataset, load_dataset
from tqdm.auto import tqdm
from trl import TrlParser
from vllm import LLM, SamplingParams

logger = logging.getLogger(__name__)
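
# Example usage (the script name and argument values are illustrative placeholders):
# python scripts/create_candidates.py \
#     --generation_model_name_or_path <sft-model-id> \
#     --dataset_id <math-dataset-id> \
#     --num_solutions 5 --sample_size 1000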

@dataclass
class CandidateArguments:
    generation_model_name_or_path: str = field(
        default=None,
        metadata={
            'help': 'Hugging Face model name or path to a model directory for the model used for generation; defaults to the SFT model or the previous iteration model.'
        },
    )
    dataset_id: str = field(
        default=None,
        metadata={
            'help': 'Path to the input dataset used to generate candidates; defaults to the previous iteration output dataset.'
        },
    )
    sample_size: int = field(
        default=None,
        metadata={
            'help': 'Number of preference pairs to generate; defaults to as many as possible.'
        },
    )
    prompt_column: str = field(
        default='question',
        metadata={'help': 'Column name in the input dataset that contains the prompt.'},
    )
    answer_column: str = field(
        default='answer',
        metadata={'help': 'Column name in the input dataset that contains the answer.'},
    )
    system_prompt: str = field(
        default="""Solve the given high school math problem by providing a clear explanation of each step leading to the final solution.

Provide a detailed breakdown of your calculations, beginning with an explanation of the problem and describing how you derive each formula, value, or conclusion. Use logical steps that build upon one another, to arrive at the final answer in a systematic manner.

# Steps

1. **Understand the Problem**: Restate the given math problem and clearly identify the main question and any important given values.
2. **Set Up**: Identify the key formulas or concepts that could help solve the problem (e.g., algebraic manipulation, geometry formulas, trigonometric identities).
3. **Solve Step-by-Step**: Iteratively progress through each step of the math problem, justifying why each consecutive operation brings you closer to the solution.
4. **Double Check**: If applicable, double check the work for accuracy and sense, and mention potential alternative approaches if any.
5. **Final Answer**: Provide the numerical or algebraic solution clearly, accompanied by appropriate units if relevant.

# Notes

- Always clearly define any variable or term used.
- Wherever applicable, include unit conversions or context to explain why each formula or step has been chosen.
- Assume the level of mathematics is suitable for high school, and avoid overly advanced math techniques unless they are common at that level.
""",
        metadata={'help': 'System prompt to use for generation.'},
    )
    num_solutions: int = field(
        default=5,
        metadata={'help': 'Number of solutions to generate for each input.'},
    )
    batch_size: int = field(
        default=1,
        metadata={'help': 'Batch size for generation.'},
    )
    max_new_tokens: int = field(
        default=2048,
        metadata={'help': 'Maximum number of new tokens to generate.'},
    )
    temperature: float = field(
        default=0.7,
        metadata={'help': 'Temperature for generation.'},
    )
    top_p: float = field(
        default=1.0,
        metadata={'help': 'Top-p for generation.'},
    )

def score_solutions(
    candidate_result: str,
    ground_truth_result: str,
) -> bool:
    # Extract the last standalone integer from the candidate solution and
    # compare it against the ground-truth answer
    regex_pattern = r'\b\d+\b'
    match = re.findall(regex_pattern, candidate_result)

    if match:
        return match[-1] == ground_truth_result
    else:
        return False
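
# Behavior sketch for score_solutions (illustrative, not executed by the pipeline):
#   score_solutions('The final answer is 42.', '42')  -> True   (last integer is '42')
#   score_solutions('The answer is 7.5', '5')         -> True   ('7.5' splits into '7' and '5')
#   score_solutions('No numeric answer here.', '42')  -> False  (no integer found)
# The regex only matches standalone integers, so decimal or algebraic answers
# are not scored reliably.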


def vllm_create_candidates(
    dataset: Dataset,
    model_name_or_path: str,
    num_solutions: int,
    max_new_tokens: int,
    batch_size: int = 1,
    prompt_column: str = 'prompt',
    system_prompt: str = None,
    answer_column: str = 'answer',
    sample_size: int = None,
    **kwargs,
) -> Dataset:
    # Load the model on all available GPUs with vLLM
    llm = LLM(
        model=model_name_or_path,
        tokenizer=model_name_or_path,
        tensor_parallel_size=torch.cuda.device_count(),
        max_model_len=4096,
    )

    # Format each prompt with the system prompt via the model's chat template
    tokenizer = llm.get_tokenizer()

    def format_prompt(s):
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": s[prompt_column]},
        ]
        return {
            "prompt": tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True),
            "messages": messages,
        }

    dataset = dataset.map(format_prompt)
    # print the first prompt
    print('First prompt:', dataset['prompt'][0])

    # Sampling parameters: generate `num_solutions` candidates per prompt
    sampling_params = SamplingParams(
        max_tokens=max_new_tokens,
        n=num_solutions,
        temperature=kwargs.get('temperature', 1.0),
        top_p=kwargs.get('top_p', 1.0),
    )

    # Iterate over the dataset in batches, generate candidates, and build
    # preference pairs by scoring each candidate against the ground truth
    preference_dataset = []
    for i in tqdm(range(0, len(dataset), batch_size), desc='Generating solutions'):
        batch = dataset[i : i + batch_size]
        # Generate `num_solutions` candidates per prompt in the batch
        result = llm.generate(batch['prompt'], sampling_params, use_tqdm=False)
        for j in range(len(batch['prompt'])):
            preference_pair = {
                "system_prompt": system_prompt,
                "prompt": batch[prompt_column][j],
                "ground_truth": batch[answer_column][j],
            }
            # The first correct candidate becomes "chosen", the first incorrect
            # one becomes "rejected"
            for cand in result[j].outputs:
                cand_score = score_solutions(candidate_result=cand.text, ground_truth_result=batch[answer_column][j])
                if cand_score and preference_pair.get('chosen') is None:
                    preference_pair['chosen'] = cand.text
                elif not cand_score and preference_pair.get('rejected') is None:
                    preference_pair['rejected'] = cand.text
                # stop once the pair is complete to prevent overwriting
                if preference_pair.get('chosen') and preference_pair.get('rejected'):
                    break

            # only keep complete preference pairs
            if preference_pair.get('chosen') and preference_pair.get('rejected'):
                print('Found preference pair, adding to dataset.')
                preference_dataset.append(preference_pair)

        print(f'Generated {len(preference_dataset)} preference pairs')
        if len(preference_dataset) >= sample_size:
            break
    return Dataset.from_list(preference_dataset)
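
# Each resulting row pairs one correct ("chosen") and one incorrect ("rejected")
# solution for the same prompt, e.g. (illustrative values):
#   {"system_prompt": "...", "prompt": "What is 6 * 7?", "ground_truth": "42",
#    "chosen": "... the final answer is 42.", "rejected": "... the final answer is 36."}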


def main():
    parser = TrlParser(CandidateArguments)
    script_args = parser.parse_args_and_config()[0]
    script_args = cast(CandidateArguments, script_args)

    # load the source dataset
    dataset = load_dataset(script_args.dataset_id, split='train')
    print(f'Generating {script_args.num_solutions} solutions for {len(dataset)} prompts...')

    start_time = time.time()
    candidates_ds = vllm_create_candidates(
        dataset,
        model_name_or_path=script_args.generation_model_name_or_path,
        num_solutions=script_args.num_solutions,
        max_new_tokens=script_args.max_new_tokens,
        batch_size=script_args.batch_size,
        prompt_column=script_args.prompt_column,
        answer_column=script_args.answer_column,
        system_prompt=script_args.system_prompt,
        temperature=script_args.temperature,
        top_p=script_args.top_p,
        sample_size=script_args.sample_size if script_args.sample_size is not None else len(dataset),
    )
    print(f'Created {len(candidates_ds)} preference pairs in {time.time() - start_time:.2f} seconds.')

    save_dataset_id = f"{script_args.generation_model_name_or_path.replace('/', '-')[:40]}-{script_args.dataset_id.replace('/', '-')[:40]}-candidates"
    candidates_ds.push_to_hub(save_dataset_id)


if __name__ == '__main__':
    main()
@@ -0,0 +1,50 @@
from dataclasses import dataclass, field
import tempfile
from typing import Optional
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, HfArgumentParser
from huggingface_hub import HfApi

# Example usage:
# python scripts/merge_adapter_weights.py --peft_model_id falcon-180b-lora-fa --output_dir merged-weights --save_tokenizer True

def save_model(model_path_or_id, save_dir, save_tokenizer=True):
    model = AutoPeftModelForCausalLM.from_pretrained(
        model_path_or_id,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
    )
    # Merge the LoRA adapter into the base model and save the result
    model = model.merge_and_unload()
    model.save_pretrained(save_dir, safe_serialization=True, max_shard_size="3GB")

    # save the tokenizer alongside the merged weights
    if save_tokenizer:
        tokenizer = AutoTokenizer.from_pretrained(model_path_or_id)
        tokenizer.save_pretrained(save_dir)


@dataclass
class ScriptArguments:
    peft_model_id: str = field(metadata={"help": "model id or path to model"})
    output_dir: Optional[str] = field(default="merged-weights", metadata={"help": "where the merged model should be saved"})
    save_tokenizer: Optional[bool] = field(default=True, metadata={"help": "whether to save the tokenizer"})
    push_to_hub: Optional[bool] = field(default=False, metadata={"help": "whether to push the model to the hub"})
    repository_id: Optional[str] = field(default=None, metadata={"help": "the model name"})


parser = HfArgumentParser(ScriptArguments)
args = parser.parse_args_into_dataclasses()[0]
api = HfApi()

if args.push_to_hub:
    repo_id = args.repository_id if args.repository_id else args.peft_model_id.split('/')[-1]
    # merge into a temporary directory, then upload it before the directory is cleaned up
    with tempfile.TemporaryDirectory() as temp_dir:
        save_model(args.peft_model_id, temp_dir, args.save_tokenizer)
        api.upload_large_folder(
            folder_path=temp_dir,
            repo_id=repo_id,
            repo_type="model",
        )
else:
    save_model(args.peft_model_id, args.output_dir, args.save_tokenizer)
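
# Quick sanity check after merging (a sketch; "merged-weights" is the default
# output_dir above):
#   from transformers import AutoModelForCausalLM
#   model = AutoModelForCausalLM.from_pretrained("merged-weights", torch_dtype="auto")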
@@ -0,0 +1,41 @@
# Model arguments
model_name_or_path: Qwen/Qwen3-0.6B
tokenizer_name_or_path: Qwen/Qwen3-0.6B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
bf16: true
tf32: true

model_download_location: /opt/ml/input/model
dataset_local_location: /opt/ml/input/data/training_dataset/
output_dir: /opt/ml/model/ #/opt/ml/output

# Dataset arguments
dataset_id_or_path: /opt/ml/input/data/training_dataset/
packing: true

# Training arguments
beta: 0.1
max_length: 1536
max_prompt_length: 768
loss_type: sigmoid # default loss, alternatives: https://huggingface.co/docs/trl/dpo_trainer#loss-functions
num_train_epochs: 10
per_device_train_batch_size: 8
gradient_accumulation_steps: 2
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: true
learning_rate: 5.0e-7
lr_scheduler_type: constant
warmup_ratio: 0.03
#weight_decay: 0.01

# Logging arguments
logging_strategy: steps
logging_steps: 5
report_to:
- none
save_strategy: "no"
seed: 42
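
# Consumption sketch (an assumed pattern; the actual entry point is defined in
# the notebook above): a TRL DPO script typically loads this file via
#   TrlParser((ScriptArguments, DPOConfig, ModelConfig)).parse_args_and_config()
# so keys like beta, loss_type, max_length, and max_prompt_length map onto
# trl.DPOConfig fields.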
@@ -0,0 +1,41 @@
# Model arguments
model_name_or_path: Qwen/Qwen3-1.7B
tokenizer_name_or_path: Qwen/Qwen3-1.7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
bf16: true
tf32: true

model_download_location: /opt/ml/input/model
dataset_local_location: /opt/ml/input/data/training_dataset/
output_dir: /opt/ml/model/ #/opt/ml/output

# Dataset arguments
dataset_id_or_path: /opt/ml/input/data/training_dataset/
packing: true

# Training arguments
beta: 0.1
max_length: 1536
max_prompt_length: 768
loss_type: sigmoid # default loss, alternatives: https://huggingface.co/docs/trl/dpo_trainer#loss-functions
num_train_epochs: 10
per_device_train_batch_size: 8
gradient_accumulation_steps: 2
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: true
learning_rate: 5.0e-7
lr_scheduler_type: constant
warmup_ratio: 0.03
#weight_decay: 0.01

# Logging arguments
logging_strategy: steps
logging_steps: 5
report_to:
- none
save_strategy: "no"
seed: 42