diff --git a/PR_DESCRIPTION.md b/PR_DESCRIPTION.md new file mode 100644 index 0000000..3ac22b2 --- /dev/null +++ b/PR_DESCRIPTION.md @@ -0,0 +1,112 @@ +# Add Supervisor Process Management Module + +This introduces a **supervisor module** that wraps ML frameworks with supervisord for automatic crash recovery and robust process management. It can be integrated into any Dockerfile easily. + +## Integration + +Install and use with these commands: + +```bash +pip install model-hosting-container-standards +standard-supervisor vllm serve model --host 0.0.0.0 --port 8080 +``` + +Or in a Dockerfile: +```dockerfile +COPY model_hosting_container_standards-0.1.2-py3-none-any.whl /tmp/ +RUN pip install supervisor +RUN pip install /tmp/model_hosting_container_standards-0.1.2-py3-none-any.whl + +# Use supervisor entrypoint for SageMaker +ENV ENGINE_AUTO_RECOVERY=true +ENV ENGINE_MAX_RECOVERY_ATTEMPTS=3 +ENTRYPOINT ["standard-supervisor", "./sagemaker-entrypoint.sh"] +``` + +## Workflow + +1. **Parse command and environment** → Read ML framework command and supervisor configuration +2. **Generate supervisord config** → Create robust configuration with configparser +3. **Start supervisord** → Launch supervisor daemon with your framework as managed process +4. **Monitor and restart** → Supervisor detects crashes and restarts automatically with configurable limits +5. **Handle failures** → After max retries, container exits gracefully with proper error codes + +### **Key Components** + +**Core Modules:** +- `models.py` - Configuration data models with comprehensive validation and environment variable parsing +- `generator.py` - Robust supervisord configuration generation using configparser + +**CLI Tools & Scripts:** +- `scripts/standard_supervisor.py` - Main CLI tool for running ML frameworks under supervisor (`standard-supervisor`) +- `scripts/generate_supervisor_config.py` - Standalone configuration generator CLI + +**Documentation & Tests:** +- `README.md` - Comprehensive setup guide with examples +- `tests/integration/test_supervisor_cli_integration.py` - **Real behavior integration tests** that verify actual restart and retry behavior +- `tests/supervisor/` - Comprehensive unit tests for all components + +## Usage Examples + +### Simple CLI Usage +```bash +# Direct command execution with supervisor +standard-supervisor vllm serve model --host 0.0.0.0 --port 8080 + +# With custom configuration +PROCESS_MAX_START_RETRIES=5 SUPERVISOR_PROGRAM__APP_STARTSECS=30 \ +standard-supervisor python -m tensorrt_llm.hlapi.llm_api +``` + +### Dockerfile Integration +```dockerfile +FROM vllm/vllm-openai:latest + +# Install with supervisor support +RUN pip install model-hosting-container-standards + +# Configure your ML framework with supervisor settings +ENV PROCESS_MAX_START_RETRIES=3 +ENV SUPERVISOR_PROGRAM__APP_STARTSECS=30 +ENV SUPERVISOR_PROGRAM__APP_STOPWAITSECS=60 +ENV LOG_LEVEL=info + +# Use supervisor for process management +ENTRYPOINT ["python", "-m", "model_hosting_container_standards.supervisor.scripts.standard_supervisor"] +CMD ["vllm", "serve", "model", "--host", "0.0.0.0", "--port", "8080"] +``` + +## Configuration Options + +**Basic Configuration:** +- Command line arguments become the supervised process command +- `PROCESS_MAX_START_RETRIES=3` - Maximum startup attempts before giving up (0-100) +- `LOG_LEVEL=info` - Logging level (debug, info, warn, error, critical) + +**Advanced Supervisor Settings:** +- `SUPERVISOR_PROGRAM__APP_STARTSECS=30` - Time process must run to be considered "started" +- 
`SUPERVISOR_PROGRAM__APP_STOPWAITSECS=60` - Time to wait for graceful shutdown +- `SUPERVISOR_PROGRAM__APP_AUTORESTART=true` - Enable automatic restart on failure +- `SUPERVISOR_PROGRAM__APP_STARTRETRIES=3` - Startup retry attempts +- `SUPERVISOR_CONFIG_PATH=/tmp/supervisord.conf` - Custom config file location + +**Custom Sections:** +- `SUPERVISOR_SUPERVISORD_LOGLEVEL=debug` - Supervisord daemon log level +- `SUPERVISOR_EVENTLISTENER__MEMMON_COMMAND=memmon -a 200MB` - Add custom event listeners + +## Testing & Validation + +**Comprehensive Test Suite:** +- **Integration Tests** - Actual supervisor processes that verify continuous restart and retry limit behavior +**Test Coverage:** +- **Continuous restart behavior** - Verifies supervisor actually restarts failed processes +- **Startup retry limits** - Confirms supervisor respects retry limits and gives up appropriately +- **Signal handling** - Tests graceful shutdown with SIGTERM +- **ML framework integration** - Tests with realistic ML framework startup patterns +- **Configuration generation** - Validates all supervisor configuration options +- **Error handling** - Tests invalid configurations and edge cases + +**Manual Testing:** +- Tested with vLLM dockerfile build +- Verified with `docker exec` process killing to confirm restart behavior +- Validated in production-like container environments diff --git a/python/MANIFEST.in b/python/MANIFEST.in new file mode 100644 index 0000000..e20df56 --- /dev/null +++ b/python/MANIFEST.in @@ -0,0 +1,16 @@ +# Include supervisor scripts +recursive-include model_hosting_container_standards/supervisor/scripts * + +# Include documentation +include README.md +include LICENSE + +# Include configuration files +include pyproject.toml + +# Exclude development files +exclude .gitignore +exclude .pre-commit-config.yaml +recursive-exclude * __pycache__ +recursive-exclude * *.py[co] +recursive-exclude * .DS_Store diff --git a/python/model_hosting_container_standards/supervisor/README.md b/python/model_hosting_container_standards/supervisor/README.md new file mode 100644 index 0000000..0b5fdcc --- /dev/null +++ b/python/model_hosting_container_standards/supervisor/README.md @@ -0,0 +1,299 @@ +# Supervisor Process Management + +Provides supervisord-based process management for ML frameworks with automatic recovery and container-friendly logging. + +## Overview + +This module wraps your ML framework (vLLM, TensorRT-LLM, etc.) with supervisord to provide: + +- **Automatic Process Monitoring**: Detects when your service crashes or exits unexpectedly +- **Auto-Recovery**: Automatically restarts failed processes with configurable retry limits +- **Container-Friendly**: Exits with code 1 after max retries so orchestrators (Docker, Kubernetes) can detect failures +- **Production Ready**: Structured logging, configurable behavior, and battle-tested supervisord underneath + +**Use Case**: Deploy ML frameworks on SageMaker or any container platform with automatic crash recovery and proper failure signaling. + +## Quick Setup (Simplified CLI Approach) + +### 1. Install the Package +```bash +pip install model-hosting-container-standards +``` + +### 2. Use standard-supervisor with Your Command +Simply prepend `standard-supervisor` to your existing framework command: + +```dockerfile +# Basic usage - just add standard-supervisor before your command +CMD ["standard-supervisor", "vllm", "serve", "model", "--host", "0.0.0.0", "--port", "8080"] +``` + +### 3. 
Alternative: Entrypoint Style +```dockerfile +# Use as entrypoint for more flexibility +ENTRYPOINT ["standard-supervisor"] +CMD ["vllm", "serve", "model", "--host", "0.0.0.0", "--port", "8080"] +``` + +That's it! No complex setup, no script extraction, no custom entrypoints needed. + +## Configuration + +Configure supervisor behavior using the unified `SUPERVISOR_*` environment variable pattern. These can be set in your Dockerfile with `ENV` or overridden at container runtime. + +### Default Behavior +- **Config file**: `/tmp/supervisord.conf` (generated automatically) +- **Auto-recovery**: Enabled by default +- **Max retries**: 3 attempts +- **Log level**: info + +### Configuration Options + +#### Application-Level Configuration (Recommended) +Use these simple environment variables for common settings: + +```bash +# Basic application behavior +export PROCESS_AUTO_RECOVERY=true # Auto-restart on failure (default: true) +export PROCESS_MAX_START_RETRIES=3 # Max restart attempts (default: 3) +export LOG_LEVEL=info # Log level (default: info, options: debug, info, warn, error, critical) +``` + +#### Advanced SUPERVISOR_* Configuration +Use the pattern `SUPERVISOR_{SECTION}_{KEY}=VALUE` for advanced supervisord customization: + +**Important**: +- The default program name is `app` +- To target specific programs, use double underscores `__` to represent colons in section names +- Program names in environment variables use the same format (e.g., `APP` for `app`) + +```bash +# Program section overrides (for default program "app") +export SUPERVISOR_PROGRAM__APP_STARTSECS=10 # Seconds to wait before considering started (default: 1) +export SUPERVISOR_PROGRAM__APP_STOPWAITSECS=30 # Seconds to wait for graceful shutdown (default: 10) +export SUPERVISOR_PROGRAM__APP_AUTORESTART=unexpected # Advanced restart control (true/false/unexpected) + +# For program-specific overrides, use the program name (default: "app") +# Or use application-level variables like PROCESS_MAX_START_RETRIES for simpler configuration + +# Supervisord daemon configuration +export SUPERVISOR_SUPERVISORD_LOGLEVEL=debug # Daemon log level (can differ from application LOG_LEVEL) +export SUPERVISOR_SUPERVISORD_LOGFILE=/tmp/supervisord.log # Log file location + +# Unix HTTP server configuration +export SUPERVISOR_UNIX_HTTP_SERVER_FILE=/tmp/supervisor.sock # Socket file location +``` + +### Common Configuration Examples + +```bash +# High availability setup with more retries (recommended approach) +export PROCESS_MAX_START_RETRIES=10 +export SUPERVISOR_PROGRAM__APP_STARTSECS=30 +export SUPERVISOR_PROGRAM__APP_STARTRETRIES=10 + +# Debug mode with verbose logging +export LOG_LEVEL=debug +export SUPERVISOR_SUPERVISORD_LOGLEVEL=debug + +# Quick restart for development +export SUPERVISOR_PROGRAM__APP_STARTSECS=1 +export SUPERVISOR_PROGRAM__APP_STOPWAITSECS=5 +export SUPERVISOR_PROGRAM__APP_STARTRETRIES=1 + +# Disable auto-recovery for debugging +export PROCESS_AUTO_RECOVERY=false +export PROCESS_MAX_START_RETRIES=1 +``` + +### Runtime Override Examples + +Environment variables set in the Dockerfile can be overridden when launching the container: + +```bash +# Override max retries at runtime (recommended) +docker run -e PROCESS_MAX_START_RETRIES=5 my-image + +# Disable auto-recovery at runtime (recommended) +docker run -e PROCESS_AUTO_RECOVERY=false my-image + +# Change log level for debugging (recommended) +docker run -e LOG_LEVEL=debug my-image + +# Override multiple settings (recommended approach) +docker run \ + -e 
PROCESS_MAX_START_RETRIES=10 \ + -e PROCESS_AUTO_RECOVERY=true \ + -e LOG_LEVEL=debug \ + my-image + +# Advanced: Direct supervisord configuration override +docker run \ + -e SUPERVISOR_PROGRAM__APP_STARTSECS=30 \ + -e SUPERVISOR_PROGRAM__APP_STARTRETRIES=5 \ + -e SUPERVISOR_SUPERVISORD_LOGLEVEL=debug \ + my-image +``` + +## Complete Examples + +### Basic vLLM Example +```dockerfile +FROM vllm/vllm-openai:latest + +# Install model hosting container standards (includes supervisor) +RUN pip install model-hosting-container-standards + +# Use standard-supervisor with your vLLM command +CMD ["standard-supervisor", "vllm", "serve", "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "--host", "0.0.0.0", "--port", "8080", "--dtype", "auto"] +``` + +### TensorRT-LLM Example +```dockerfile +FROM nvcr.io/nvidia/tensorrt:23.08-py3 + +# Install dependencies and model hosting container standards +RUN pip install tensorrt-llm model-hosting-container-standards + +# Use standard-supervisor with TensorRT-LLM +CMD ["standard-supervisor", "python", "-m", "tensorrt_llm.hlapi.llm_api", "--host", "0.0.0.0", "--port", "8080"] +``` + +### Advanced Configuration Example +```dockerfile +FROM vllm/vllm-openai:latest + +# Install model hosting container standards +RUN pip install model-hosting-container-standards + +# Configure supervisor behavior (recommended approach) +ENV PROCESS_MAX_START_RETRIES=5 +ENV LOG_LEVEL=debug +ENV SUPERVISOR_PROGRAM__APP_STARTSECS=30 +ENV SUPERVISOR_PROGRAM__APP_STARTRETRIES=5 + +# Use standard-supervisor with custom configuration +CMD ["standard-supervisor", "vllm", "serve", "model", "--host", "0.0.0.0", "--port", "8080"] +``` + +### SageMaker Integration with Custom Script +```dockerfile +FROM vllm/vllm-openai:latest + +# Install model hosting container standards +RUN pip install model-hosting-container-standards + +# Copy your custom startup script +COPY sagemaker-entrypoint.sh . +RUN chmod +x sagemaker-entrypoint.sh + +# Configure supervisor for production (recommended approach) +ENV PROCESS_MAX_START_RETRIES=3 +ENV PROCESS_AUTO_RECOVERY=true + +# Use standard-supervisor with your custom script +CMD ["standard-supervisor", "./sagemaker-entrypoint.sh"] +``` + +### Entrypoint Style for Flexibility +```dockerfile +FROM vllm/vllm-openai:latest + +# Install model hosting container standards +RUN pip install model-hosting-container-standards + +# Optional: Configure supervisor (recommended approach) +ENV PROCESS_MAX_START_RETRIES=5 +ENV LOG_LEVEL=info + +# Use as entrypoint for runtime flexibility +ENTRYPOINT ["standard-supervisor"] +CMD ["vllm", "serve", "model", "--host", "0.0.0.0", "--port", "8080"] +``` + +### Service Monitoring Behavior + +**Expected Behavior**: LLM services should run indefinitely. Any exit is treated as an error. + +**Restart Logic**: +1. If your service exits for any reason (crash, OOM, etc.), it will be automatically restarted +2. Maximum restart attempts: `PROCESS_MAX_START_RETRIES` (default: 3) +3. If restart limit is exceeded, the container exits with code 1 +4. This signals to container orchestrators (Docker, Kubernetes) that the service failed + +**Why This Matters**: Container orchestrators can detect the failure and take appropriate action (restart container, alert operators, etc.) 
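+
+A quick way to observe the restart behavior locally is to kill the supervised
+process inside a running container and watch the logs. This is an illustrative
+sketch: `my-image` and the `vllm serve` process name are placeholders for your
+own image and launch command, and `pkill` must be available inside the image.
+
+```bash
+# Start a container built with the CMD-based pattern shown above
+docker run -d --name supervisor-demo my-image
+
+# Kill the supervised process (the default program name is "app")
+docker exec supervisor-demo pkill -f "vllm serve"
+
+# supervisord logs the unexpected exit and starts the program again
+docker logs -f supervisor-demo
+```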
+ + +## Troubleshooting + +### Common Errors + +**"No command provided"** +```bash +# Fix: Provide a command after standard-supervisor +standard-supervisor vllm serve model --host 0.0.0.0 --port 8080 +``` + +**"supervisord command not found"** +```bash +# Fix: Install supervisor (usually included with model-hosting-container-standards) +pip install supervisor +``` + +**Process keeps restarting** +```bash +# Fix: Disable auto-recovery to see the actual error (recommended) +export PROCESS_AUTO_RECOVERY=false +export PROCESS_MAX_START_RETRIES=1 +``` + +**Configuration not taking effect** +```bash +# Fix: Use recommended application-level variables first +# Recommended: PROCESS_MAX_START_RETRIES=5 +# Advanced (specific program): SUPERVISOR_PROGRAM__APP_STARTRETRIES=5 +``` + +## Framework-Specific Examples + +### vLLM Examples +```bash +# Basic vLLM server +standard-supervisor vllm serve model --host 0.0.0.0 --port 8080 + +# vLLM with specific model and parameters +standard-supervisor vllm serve microsoft/DialoGPT-medium --host 0.0.0.0 --port 8080 --dtype auto --max-model-len 2048 + +# vLLM with OpenAI-compatible API +standard-supervisor python -m vllm.entrypoints.openai.api_server --model model --host 0.0.0.0 --port 8080 +``` + +### TensorRT-LLM Examples +```bash +# TensorRT-LLM API server +standard-supervisor python -m tensorrt_llm.hlapi.llm_api --host 0.0.0.0 --port 8080 + +# TensorRT-LLM with custom model path +standard-supervisor python -m tensorrt_llm.hlapi.llm_api --model-dir /opt/model --host 0.0.0.0 --port 8080 +``` + +### Custom Python Scripts +```bash +# Your custom ML serving script +standard-supervisor python my_model_server.py --port 8080 + +# FastAPI application +standard-supervisor uvicorn app:app --host 0.0.0.0 --port 8080 + +# Any other command +standard-supervisor ./my-custom-entrypoint.sh +``` + + + +## Key Files + +- `scripts/standard_supervisor.py` - Main CLI entry point (`standard-supervisor` command) +- `scripts/generate_supervisor_config.py` - Configuration generator (used internally) + +That's all you need! The supervisor system handles the rest automatically. diff --git a/python/model_hosting_container_standards/supervisor/__init__.py b/python/model_hosting_container_standards/supervisor/__init__.py new file mode 100644 index 0000000..e4c8c2f --- /dev/null +++ b/python/model_hosting_container_standards/supervisor/__init__.py @@ -0,0 +1,17 @@ +""" +Supervisor process management module for ML frameworks. + +This module provides supervisord-based process management capabilities +for containerized ML frameworks, enabling automatic process recovery +and self-contained resilience. +""" + +from .generator import generate_supervisord_config, write_supervisord_config +from .models import ConfigurationError, SupervisorConfig + +__all__ = [ + "SupervisorConfig", + "ConfigurationError", + "generate_supervisord_config", + "write_supervisord_config", +] diff --git a/python/model_hosting_container_standards/supervisor/generator.py b/python/model_hosting_container_standards/supervisor/generator.py new file mode 100644 index 0000000..c7ab555 --- /dev/null +++ b/python/model_hosting_container_standards/supervisor/generator.py @@ -0,0 +1,239 @@ +""" +Supervisord configuration generation for ML framework process management. + +This module provides functionality to generate supervisord configuration files +based on environment variables and framework-specific settings. 
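+
+Illustrative sketch of typical usage (the launch command is a placeholder):
+
+    from model_hosting_container_standards.supervisor import (
+        SupervisorConfig,
+        generate_supervisord_config,
+    )
+
+    config = SupervisorConfig()  # defaults: auto_recovery=True, max_start_retries=3
+    ini_text = generate_supervisord_config(config, "vllm serve model", program_name="app")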
+""" + +from pathlib import Path + +from ..logging_config import get_logger +from .models import ConfigurationError, SupervisorConfig + +logger = get_logger(__name__) + + +# Supervisord configuration template for LLM service monitoring +# +# Key behavior: LLM services are expected to run indefinitely. Any exit is considered an error. +# - exitcodes=255: Only exit code 255 is "expected" - all other exits (0,1,2...) trigger restart +# - startsecs=1: Process must run at least 1 second to be considered successfully started +# - autorestart=unexpected: Only restart on unexpected exit codes (not 255) +# When ENGINE_AUTO_RECOVERY=false, autorestart=false to disable all restarts +# - startretries=N: Maximum restart attempts before entering FATAL state +# +# FATAL state examples (supervisorctl status output): +# app FATAL Exited too quickly (process log may have details) +# app FATAL can't find command '/path/to/missing/binary' +# app FATAL spawn error +# +# When a program enters FATAL state (too many restart failures), the entrypoint script +# will detect this and exit with code 1 to signal container failure. +def get_base_config_template( + program_name: str, + log_level: str, + framework_command: str, + auto_restart: str, + max_start_retries: int, +) -> dict: + """Get base supervisord configuration as dictionary structure. + + Note: We don't use supervisorctl for process management, but supervisord + still needs minimal RPC configuration for its internal operations. + """ + return { + "supervisord": { + "nodaemon": "true", + "loglevel": log_level, + "logfile": f"/tmp/supervisord-{program_name}.log", + "logfile_maxbytes": "50MB", + "logfile_backups": "3", + "pidfile": f"/tmp/supervisord-{program_name}.pid", + }, + f"program:{program_name}": { + "command": framework_command, + "autostart": "true", + "autorestart": auto_restart, + "startretries": str(max_start_retries), + "stdout_logfile": "/dev/stdout", + "stdout_logfile_maxbytes": "0", + "stderr_logfile": "/dev/stderr", + "stderr_logfile_maxbytes": "0", + "exitcodes": "255", + "startsecs": "1", + "stopsignal": "TERM", + "stopwaitsecs": "30", + "stopasgroup": "true", + "killasgroup": "true", + }, + } + + +def generate_supervisord_config( + config: SupervisorConfig, + launch_command: str, + program_name: str = "app", +) -> str: + """Generate supervisord configuration content with validation and logging. + + Creates a supervisord configuration file content based on the provided + configuration and launch command. Merges custom SUPERVISOR_* configuration + with the base template. + + Args: + config: SupervisorConfig instance with supervisor settings. 
+ launch_command: Command to execute in the supervised program + program_name: Name for the supervisord program section + + Returns: + str: Complete supervisord configuration file content + + Raises: + ConfigurationError: If configuration generation fails + ValueError: If required parameters are invalid + """ + # Validate required parameters + if not program_name or not program_name.strip(): + error_msg = "Program name cannot be empty" + logger.error(error_msg) + raise ValueError(error_msg) + + # Validate launch command parameter + if not launch_command or not launch_command.strip(): + error_msg = "Launch command cannot be empty" + logger.error(error_msg) + raise ValueError(error_msg) + + # Convert boolean auto_recovery to supervisord format + # Use "true" to always restart (except for exitcodes=255 which is "expected") + auto_restart = "true" if config.auto_recovery else "false" + + try: + # Get base configuration as dictionary + base_config = get_base_config_template( + program_name=program_name, + log_level=config.log_level, + framework_command=launch_command, + auto_restart=auto_restart, + max_start_retries=config.max_start_retries, + ) + + # Merge custom configuration sections + merged_config = _merge_custom_sections(base_config, config.custom_sections) + + # Convert to INI format string + return _dict_to_ini_string(merged_config) + + except Exception as e: + error_msg = f"Failed to generate supervisord configuration: {str(e)}" + logger.error(error_msg) + raise ConfigurationError(error_msg) from e + + +def write_supervisord_config( + config_path: str, + config: SupervisorConfig, + launch_command: str, + program_name: str = "app", +) -> None: + """Write supervisord configuration to file with comprehensive error handling. + + Generates supervisord configuration content and writes it to the + specified file path. Creates parent directories if they don't exist. + + Args: + config_path: Path where the configuration file should be written + config: SupervisorConfig instance with supervisor settings. + launch_command: Command to execute in the supervised program + program_name: Name for the supervisord program section + + Raises: + ConfigurationError: If configuration generation or validation fails + OSError: If the configuration file cannot be written + ValueError: If required parameters are invalid + """ + try: + # Generate configuration content + config_content = generate_supervisord_config( + config, launch_command, program_name + ) + + # Create parent directories if they don't exist + Path(config_path).parent.mkdir(parents=True, exist_ok=True, mode=0o755) + + # Write configuration to file + with open(config_path, "w", encoding="utf-8") as f: + f.write(config_content) + + logger.info(f"Successfully wrote supervisord configuration to '{config_path}'") + + except (OSError, IOError) as e: + error_msg = f"Failed to write configuration file '{config_path}': {str(e)}" + logger.error(error_msg) + raise OSError(error_msg) from e + except Exception as e: + error_msg = f"Unexpected error writing configuration: {str(e)}" + logger.error(error_msg) + raise ConfigurationError(error_msg) from e + + +def _merge_custom_sections(base_config: dict, custom_sections: dict) -> dict: + """Merge custom configuration sections with base configuration. 
+ + Args: + base_config: Base configuration dictionary + custom_sections: Custom configuration sections to merge + + Returns: + dict: Merged configuration dictionary + """ + if not custom_sections: + return base_config + + # Merge custom sections directly into base config + for section_name, custom_config in custom_sections.items(): + if section_name in base_config: + # Update existing section + for key, value in custom_config.items(): + if key in base_config[section_name]: + logger.info(f"Overrode setting in [{section_name}]: {key}={value}") + else: + logger.info( + f"Added custom setting to [{section_name}]: {key}={value}" + ) + base_config[section_name][key] = value + else: + # Add new section + base_config[section_name] = custom_config.copy() + logger.info( + f"Added new custom section [{section_name}] with {len(custom_config)} settings" + ) + + return base_config + + +def _dict_to_ini_string(config_dict: dict) -> str: + """Convert configuration dictionary to INI format string using configparser. + + Args: + config_dict: Configuration dictionary + + Returns: + str: INI format configuration string + """ + import configparser + from io import StringIO + + config = configparser.ConfigParser() + + # Add sections and their key-value pairs + for section_name, section_config in config_dict.items(): + config.add_section(section_name) + for key, value in section_config.items(): + config.set(section_name, key, str(value)) + + # Write to string buffer + output = StringIO() + config.write(output) + + return output.getvalue() diff --git a/python/model_hosting_container_standards/supervisor/models.py b/python/model_hosting_container_standards/supervisor/models.py new file mode 100644 index 0000000..896a5f5 --- /dev/null +++ b/python/model_hosting_container_standards/supervisor/models.py @@ -0,0 +1,167 @@ +"""Configuration management for supervisor process management.""" + +import os +from dataclasses import dataclass, field +from typing import Dict, Optional + +from ..logging_config import get_logger + +logger = get_logger(__name__) + +# Environment variable constants +PROCESS_AUTO_RECOVERY = "PROCESS_AUTO_RECOVERY" +PROCESS_MAX_START_RETRIES = "PROCESS_MAX_START_RETRIES" +LOG_LEVEL = "LOG_LEVEL" +SUPERVISOR_CONFIG_PATH = "SUPERVISOR_CONFIG_PATH" + + +class ConfigurationError(Exception): + """Exception raised for configuration validation errors.""" + + pass + + +@dataclass +class SupervisorConfig: + """Configuration for supervisor process management system. + + Hybrid Environment Variable Design: + - Application config: PROCESS_ prefixed names (PROCESS_AUTO_RECOVERY, PROCESS_MAX_START_RETRIES, LOG_LEVEL) + - Supervisord config: SUPERVISOR_{SECTION}_{KEY} pattern for custom overrides + - Section names with colons: Use double underscore __ to represent colon : + + Examples: + - PROCESS_AUTO_RECOVERY=false (application behavior) + - PROCESS_MAX_START_RETRIES=5 (application behavior) + - LOG_LEVEL=debug (application behavior) + - SUPERVISOR_PROGRAM_STARTSECS=10 (supervisord [program] section override) + - SUPERVISOR_SUPERVISORD_LOGLEVEL=debug (supervisord [supervisord] section override) + - SUPERVISOR_PROGRAM__WEB_COMMAND="gunicorn app:app" (supervisord [program:web] section) + - SUPERVISOR_RPCINTERFACE__SUPERVISOR_FACTORY=... 
(supervisord [rpcinterface:supervisor] section) + """ + + auto_recovery: bool = True + max_start_retries: int = 3 + config_path: str = "/tmp/supervisord.conf" + log_level: str = "info" + custom_sections: Dict[str, Dict[str, str]] = field(default_factory=dict) + + +def _parse_bool(value: str) -> bool: + """Parse boolean from string.""" + return value.lower() in ("true", "1", "yes", "on") + + +def _get_env_int(name: str, default: int, min_val: int = 0, max_val: int = 100) -> int: + """Get integer from environment with validation.""" + value = os.getenv(name) + if not value: + return default + + try: + parsed = int(value) + if not (min_val <= parsed <= max_val): + raise ConfigurationError( + f"{name} must be between {min_val} and {max_val}, got {parsed}" + ) + return parsed + except ValueError: + raise ConfigurationError(f"{name} must be an integer, got '{value}'") + + +def _get_env_str(name: str, default: str, allowed: Optional[list] = None) -> str: + """Get string from environment with validation.""" + value = os.getenv(name, default).strip() + if allowed and value.lower() not in allowed: + raise ConfigurationError(f"{name} must be one of {allowed}, got '{value}'") + return value + + +def parse_environment_variables() -> SupervisorConfig: + """Parse environment variables and return SupervisorConfig instance.""" + try: + # Parse custom SUPERVISOR_* configuration sections + custom_sections = _parse_supervisor_custom_sections() + + return SupervisorConfig( + auto_recovery=_parse_bool(os.getenv(PROCESS_AUTO_RECOVERY, "true")), + max_start_retries=_get_env_int(PROCESS_MAX_START_RETRIES, 3), + config_path=_get_env_str(SUPERVISOR_CONFIG_PATH, "/tmp/supervisord.conf"), + log_level=_get_env_str( + LOG_LEVEL, + "info", + ["debug", "info", "warn", "error", "critical"], + ), + custom_sections=custom_sections, + ) + except ConfigurationError as e: + logger.error(f"Configuration validation failed: {e}") + raise + + +def _parse_supervisor_custom_sections() -> Dict[str, Dict[str, str]]: + """ + Parse SUPERVISOR_{SECTION}_{KEY}=VALUE environment variables for supervisord configuration. + + Pattern: SUPERVISOR_SECTION_KEY -> [section] key=value + Special handling for section names with colons: + - Double underscore __ in section name becomes colon : + + Examples: + - SUPERVISOR_PROGRAM_STARTSECS=10 -> [program] startsecs=10 + - SUPERVISOR_SUPERVISORD_LOGLEVEL=debug -> [supervisord] loglevel=debug + - SUPERVISOR_PROGRAM__WEB_COMMAND="gunicorn app:app" -> [program:web] command=gunicorn app:app + - SUPERVISOR_RPCINTERFACE__SUPERVISOR_FACTORY=... -> [rpcinterface:supervisor] factory=... + + Skips SUPERVISOR_CONFIG_PATH (used for file path, not supervisord config). + + Returns: + Dictionary mapping section names to their key-value configurations + """ + import re + + # Pattern matches SUPERVISOR_SECTION_KEY where: + # - SECTION: alphanumeric, may contain __ (for colons) or _ (internal), no leading/trailing _ + # - KEY: alphanumeric, may contain _ (internal), no leading/trailing _ + pattern = re.compile( + r"^SUPERVISOR_" + r"(?P
[A-Z0-9]+(?:__[A-Z0-9]+|_[A-Z0-9]+)*)" # SECTION (__ for colons) + r"_(?P[A-Z0-9]+(?:_[A-Z0-9]+)*)$" # KEY (no leading/trailing _) + ) + + custom_sections: Dict[str, Dict[str, str]] = {} + + for env_var, value in os.environ.items(): + # Skip non-SUPERVISOR_ variables and the config path variable + if env_var == "SUPERVISOR_CONFIG_PATH": + continue + + match = pattern.match(env_var) + if not match: + # Only warn if it starts with SUPERVISOR_ but doesn't match pattern + if env_var.startswith("SUPERVISOR_"): + logger.warning( + f"Invalid SUPERVISOR_ environment variable format: '{env_var}'. " + f"Expected format: SUPERVISOR_SECTION_KEY=value (alphanumeric with underscores, " + f"no leading/trailing underscores, use __ for section colons)" + ) + continue + + # Extract section and key from regex groups + section_part = match.group("section") + key_name = match.group("key").lower() + + # Convert double underscores to colons in section name + section_name = section_part.replace("__", ":").lower() + + # Initialize section if it doesn't exist + if section_name not in custom_sections: + custom_sections[section_name] = {} + + # Store the custom configuration + custom_sections[section_name][key_name] = value.strip() + logger.debug( + f"Found custom supervisor configuration: [{section_name}] {key_name}={value}" + ) + + return custom_sections diff --git a/python/model_hosting_container_standards/supervisor/scripts/generate_supervisor_config.py b/python/model_hosting_container_standards/supervisor/scripts/generate_supervisor_config.py new file mode 100644 index 0000000..57e37b6 --- /dev/null +++ b/python/model_hosting_container_standards/supervisor/scripts/generate_supervisor_config.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 +""" +Supervisor Configuration Generator Script + +Simple script to generate supervisord configuration files for ML frameworks. 
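+
+Example invocation (illustrative; the output path and command are placeholders):
+
+    generate-supervisor-config -o /tmp/supervisord.conf -p app echo hello
+
+Supervisor settings are read from the PROCESS_* and SUPERVISOR_* environment
+variables described in the module README.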
+""" + +import argparse +import logging +import sys + +from model_hosting_container_standards.logging_config import get_logger +from model_hosting_container_standards.supervisor.generator import ( + write_supervisord_config, +) +from model_hosting_container_standards.supervisor.models import ( + ConfigurationError, + parse_environment_variables, +) + + +def main() -> int: + """Main entry point with comprehensive error handling and logging.""" + parser = argparse.ArgumentParser(description="Generate supervisord configuration") + + parser.add_argument( + "-o", "--output", required=True, help="Output path for config file" + ) + + parser.add_argument("-p", "--program-name", default="app", help="Program name") + parser.add_argument( + "--log-level", + choices=["ERROR", "INFO", "DEBUG"], + default="ERROR", + help="Log level", + ) + parser.add_argument("command", nargs="+", help="Launch command and arguments") + + args = parser.parse_args() + + # Set up logging based on command line argument + logger = get_logger(__name__) + if args.log_level == "DEBUG": + logger.setLevel(logging.DEBUG) + elif args.log_level == "INFO": + logger.setLevel(logging.INFO) + else: + logger.setLevel(logging.ERROR) + + try: + # Parse configuration from environment + config = parse_environment_variables() + + # Get launch command from CLI arguments + launch_command = " ".join(args.command) + + # Generate and write configuration + write_supervisord_config(args.output, config, launch_command, args.program_name) + + if args.log_level != "ERROR": + print(f"Configuration written to: {args.output}") + + return 0 + + except ConfigurationError as e: + logger.error(f"Configuration error: {str(e)}") + print(f"ERROR: Configuration error: {e}", file=sys.stderr) + return 1 + except (OSError, IOError) as e: + logger.error(f"File I/O error: {str(e)}") + print(f"ERROR: File I/O error: {e}", file=sys.stderr) + return 1 + except Exception as e: + logger.error(f"Unexpected error: {str(e)}", exc_info=True) + print(f"ERROR: Unexpected error: {e}", file=sys.stderr) + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/python/model_hosting_container_standards/supervisor/scripts/standard_supervisor.py b/python/model_hosting_container_standards/supervisor/scripts/standard_supervisor.py new file mode 100644 index 0000000..e5fcffa --- /dev/null +++ b/python/model_hosting_container_standards/supervisor/scripts/standard_supervisor.py @@ -0,0 +1,214 @@ +#!/usr/bin/env python3 +""" +Standard Supervisor CLI Script + +Simplified CLI command that wraps and manages user launch processes under supervision. +Users can prepend 'standard-supervisor' to their existing launch commands. + +Usage: + standard-supervisor [args...] 
+ +Example: + standard-supervisor vllm serve model --host 0.0.0.0 --port 8080 +""" + +import logging +import os +import shutil +import signal +import subprocess +import sys +import time +from typing import Any, Dict, List, Optional + +from model_hosting_container_standards.logging_config import get_logger +from model_hosting_container_standards.supervisor.generator import ( + write_supervisord_config, +) +from model_hosting_container_standards.supervisor.models import ( + ConfigurationError, + parse_environment_variables, +) + + +class ProcessManager: + """Manages supervisord process lifecycle without supervisorctl dependency.""" + + def __init__(self, logger: logging.Logger): + self.logger = logger + self.process: Optional[subprocess.Popen] = None + + def check_tools_available(self) -> tuple[bool, str]: + """Check if supervisord is available.""" + if not shutil.which("supervisord"): + return False, "supervisord" + return True, "" + + def start(self, config_path: str) -> subprocess.Popen: + """Start supervisord process with the given configuration.""" + self.logger.info("Starting supervisord...") + + self.process = subprocess.Popen(["supervisord", "-c", config_path]) + time.sleep(1.0) # Give supervisord time to start + + if self.process.poll() is not None: + error_msg = ( + f"Supervisord failed to start. Exit code: {self.process.returncode}" + ) + self.logger.error(error_msg) + raise RuntimeError(error_msg) + + self.logger.info(f"Supervisord started with PID: {self.process.pid}") + return self.process + + def terminate(self) -> None: + """Terminate the supervisord process.""" + if not self.process: + return + + try: + self.process.terminate() + self.process.wait(timeout=5) + self.logger.info("Supervisord terminated") + except subprocess.TimeoutExpired: + self.logger.warning("Termination timed out, force killing...") + self.process.kill() + self.process.wait() + self.logger.info("Supervisord force killed") + except Exception as e: + self.logger.error(f"Error during shutdown: {e}") + + +class SignalHandler: + """Handles process signals for graceful shutdown.""" + + def __init__(self, process_manager: ProcessManager, logger: logging.Logger): + self.process_manager = process_manager + self.logger = logger + self._original_handlers: Dict[int, Any] = {} + + def setup(self) -> None: + """Set up signal handlers.""" + + def signal_handler(signum: int, frame) -> None: + self.logger.info(f"Received signal {signum}, shutting down...") + self._restore_default_handlers() + self.process_manager.terminate() + sys.exit(0) + + # Store original handlers and set new ones + self._original_handlers[signal.SIGTERM] = signal.signal( + signal.SIGTERM, signal_handler + ) + self._original_handlers[signal.SIGINT] = signal.signal( + signal.SIGINT, signal_handler + ) + + def _restore_default_handlers(self) -> None: + """Restore default signal handlers to prevent recursive calls.""" + signal.signal(signal.SIGTERM, signal.SIG_DFL) + signal.signal(signal.SIGINT, signal.SIG_DFL) + + +class StandardSupervisor: + """Main supervisor orchestrator.""" + + def __init__(self): + self.logger = get_logger(__name__) + self._setup_logging() + + self.process_manager = ProcessManager(self.logger) + self.signal_handler = SignalHandler(self.process_manager, self.logger) + + def _setup_logging(self) -> None: + """Configure logging based on environment.""" + log_level = os.getenv("LOG_LEVEL", "INFO").upper() + self.logger.setLevel(getattr(logging, log_level, logging.INFO)) + + def parse_arguments(self) -> List[str]: + """Parse command-line 
arguments to extract launch command.""" + launch_command = sys.argv[1:] + + if not launch_command: + print("ERROR: No launch command provided", file=sys.stderr) + print( + "Usage: standard-supervisor [args...]", file=sys.stderr + ) + print( + "Example: standard-supervisor vllm serve model --host 0.0.0.0 --port 8080", + file=sys.stderr, + ) + sys.exit(1) + + return launch_command + + def run(self) -> int: + """Main execution method.""" + launch_command = self.parse_arguments() + self.logger.info(f"Starting: {' '.join(launch_command)}") + + # Check system requirements + tools_available, missing_tool = self.process_manager.check_tools_available() + if not tools_available: + self.logger.error(f"{missing_tool} not found. Install supervisor package.") + return 1 + + # Parse configuration + try: + config = parse_environment_variables() + except ConfigurationError as e: + self.logger.error(f"Configuration error: {e}") + return 1 + + config_path = config.config_path + program_name = "app" + + try: + # Generate and start supervisor + self.logger.info("Generating supervisor configuration...") + write_supervisord_config( + config_path=config_path, + config=config, + launch_command=" ".join(launch_command), + program_name=program_name, + ) + + supervisord_process = self.process_manager.start(config_path) + self.signal_handler.setup() + + # Wait for supervisord to exit using poll loop + # This allows signal handlers to interrupt and respond quickly + self.logger.info("Supervisord running, waiting for completion...") + while supervisord_process.poll() is None: + time.sleep(0.5) # Check twice per second + + exit_code = supervisord_process.returncode + self.logger.info(f"Supervisord exited with code: {exit_code}") + return exit_code + + except Exception as e: + self.logger.error(f"Unexpected error: {e}") + return 1 + finally: + # Cleanup - only delete auto-generated temp files, not user-specified configs + user_specified_config = os.getenv("SUPERVISOR_CONFIG_PATH") + should_cleanup = ( + config_path.startswith("/tmp/") + and os.path.exists(config_path) + and not user_specified_config + ) + if should_cleanup: + try: + os.unlink(config_path) + except OSError as e: + self.logger.warning(f"Failed to clean up config file: {e}") + + +def main() -> int: + """Main entry point for standard-supervisor CLI.""" + supervisor = StandardSupervisor() + return supervisor.run() + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/python/poetry.lock b/python/poetry.lock index 250fa06..f2e288e 100644 --- a/python/poetry.lock +++ b/python/poetry.lock @@ -885,6 +885,27 @@ files = [ {file = "pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f"}, ] +[[package]] +name = "setuptools" +version = "80.9.0" +description = "Easily download, build, install, upgrade, and uninstall Python packages" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "setuptools-80.9.0-py3-none-any.whl", hash = "sha256:062d34222ad13e0cc312a4c02d73f059e86a4acbfbdea8f8f76b28c99f306922"}, + {file = "setuptools-80.9.0.tar.gz", hash = "sha256:f36b47402ecde768dbfafc46e8e4207b4360c654f1f3bb84475f0a28628fb19c"}, +] + +[package.extras] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "ruff (>=0.8.0) ; sys_platform != \"cygwin\""] +core = ["importlib_metadata (>=6) ; python_version < \"3.10\"", "jaraco.functools (>=4)", "jaraco.text (>=3.7)", "more_itertools", "more_itertools (>=8.8)", "packaging (>=24.2)", "platformdirs (>=4.2.2)", 
"tomli (>=2.0.1) ; python_version < \"3.11\"", "wheel (>=0.43.0)"] +cover = ["pytest-cov"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier", "towncrier (<24.7)"] +enabler = ["pytest-enabler (>=2.2)"] +test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.7.2)", "jaraco.test (>=5.5)", "packaging (>=24.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] +type = ["importlib_metadata (>=7.0.2) ; python_version < \"3.10\"", "jaraco.develop (>=7.21) ; sys_platform != \"cygwin\"", "mypy (==1.14.*)", "pytest-mypy"] + [[package]] name = "sniffio" version = "1.3.1" @@ -916,6 +937,21 @@ typing-extensions = {version = ">=4.10.0", markers = "python_version < \"3.13\"" [package.extras] full = ["httpx (>=0.27.0,<0.29.0)", "itsdangerous", "jinja2", "python-multipart (>=0.0.18)", "pyyaml"] +[[package]] +name = "supervisor" +version = "4.3.0" +description = "A system for controlling process state under UNIX" +optional = false +python-versions = "*" +groups = ["main"] +files = [ + {file = "supervisor-4.3.0-py2.py3-none-any.whl", hash = "sha256:0bcb763fddafba410f35cbde226aa7f8514b9fb82eb05a0c85f6588d1c13f8db"}, + {file = "supervisor-4.3.0.tar.gz", hash = "sha256:4a2bf149adf42997e1bb44b70c43b613275ec9852c3edacca86a9166b27e945e"}, +] + +[package.extras] +test = ["pytest", "pytest-cov"] + [[package]] name = "tomli" version = "2.3.0" @@ -1021,4 +1057,4 @@ test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess [metadata] lock-version = "2.1" python-versions = ">=3.10" -content-hash = "67645431a7969e2d9a337dc15611543552cc3636cf1b34555d137c0a632291dd" +content-hash = "06462368f46834a041e4fb294599d5f2c6c6f7485c72bfb3cc1faca6af5504e8" diff --git a/python/pyproject.toml b/python/pyproject.toml index 551f656..2f89a7a 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -14,11 +14,23 @@ dependencies = [ "pydantic", "jmespath", "httpx", + "setuptools", + "supervisor>=4.2.0", ] [tool.poetry] packages = [{include = "model_hosting_container_standards"}] +# Include supervisor scripts in the package +include = [ + "model_hosting_container_standards/supervisor/scripts/*", +] + +# Console scripts for easy access +[tool.poetry.scripts] +generate-supervisor-config = "model_hosting_container_standards.supervisor.scripts.generate_supervisor_config:main" +standard-supervisor = "model_hosting_container_standards.supervisor.scripts.standard_supervisor:main" + [build-system] requires = ["poetry-core>=2.0.0,<3.0.0"] build-backend = "poetry.core.masonry.api" diff --git a/python/tests/integration/test_supervisor_cli_integration.py b/python/tests/integration/test_supervisor_cli_integration.py new file mode 100644 index 0000000..c835d08 --- /dev/null +++ b/python/tests/integration/test_supervisor_cli_integration.py @@ -0,0 +1,464 @@ +""" +Integration tests for standard-supervisor CLI functionality. + +Tests verify: +1. Configuration file generation and validation +2. 
Process supervision and restart behavior +3. Startup retry limits +4. Signal handling and graceful shutdown +""" + +import configparser +import os +import signal +import subprocess +import sys +import tempfile +import time +from pathlib import Path + +import pytest + + +def get_python_cwd(): + """Get the correct working directory for python module execution.""" + current_dir = Path(__file__).parent.parent.parent.absolute() + return str(current_dir) + + +def parse_supervisor_config(config_path): + """Parse supervisor configuration file and return configparser object.""" + config = configparser.ConfigParser() + config.read(config_path) + return config + + +class TestSupervisorCLIIntegration: + """Integration tests for the standard-supervisor CLI.""" + + @pytest.fixture + def clean_env(self): + """Provide clean environment for testing.""" + original_env = dict(os.environ) + + # Clear supervisor-related variables + for key in list(os.environ.keys()): + if key.startswith("SUPERVISOR_") or key in [ + "PROCESS_AUTO_RECOVERY", + "PROCESS_MAX_START_RETRIES", + "LOG_LEVEL", + ]: + del os.environ[key] + + yield + + # Restore original environment + os.environ.clear() + os.environ.update(original_env) + + def test_basic_cli_execution_and_config_generation(self, clean_env): + """Test basic CLI execution with configuration generation and validation.""" + env = { + "PROCESS_MAX_START_RETRIES": "2", + "SUPERVISOR_PROGRAM__APP_STARTSECS": "2", + "SUPERVISOR_PROGRAM__APP_STOPWAITSECS": "5", + "SUPERVISOR_PROGRAM__APP_AUTORESTART": "true", + "LOG_LEVEL": "info", + } + + with tempfile.TemporaryDirectory() as temp_dir: + config_path = os.path.join(temp_dir, "supervisord.conf") + env["SUPERVISOR_CONFIG_PATH"] = config_path + + # Start supervisor with a long-running server + process = subprocess.Popen( + [ + sys.executable, + "-m", + "model_hosting_container_standards.supervisor.scripts.standard_supervisor", + sys.executable, + "-c", + "import time; print('Server started', flush=True); time.sleep(30)", + ], + env={**os.environ, **env}, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + cwd=get_python_cwd(), + ) + + try: + # Give it time to start and generate config + time.sleep(3) + + # Verify config file was generated + assert os.path.exists( + config_path + ), f"Config file not found at {config_path}" + + config = parse_supervisor_config(config_path) + + # Check main sections exist + assert "supervisord" in config.sections() + assert "program:app" in config.sections() + + # Verify program configuration + program_section = config["program:app"] + assert "python" in program_section["command"] + assert program_section["startsecs"] == "2" + assert program_section["stopwaitsecs"] == "5" + assert program_section["autostart"] == "true" + assert program_section["autorestart"] == "true" + assert program_section["stdout_logfile"] == "/dev/stdout" + assert program_section["stderr_logfile"] == "/dev/stderr" + + finally: + # Clean up + if process.poll() is None: + process.terminate() + try: + process.communicate(timeout=5) + except subprocess.TimeoutExpired: + process.kill() + process.communicate() + + def test_ml_framework_configuration(self, clean_env): + """Test supervisor configuration for ML framework scenarios.""" + env = { + "PROCESS_MAX_START_RETRIES": "3", + "SUPERVISOR_PROGRAM__APP_STARTSECS": "30", # ML models need longer startup + "SUPERVISOR_PROGRAM__APP_STOPWAITSECS": "60", # Graceful shutdown time + "SUPERVISOR_PROGRAM__APP_STARTRETRIES": "3", + "SUPERVISOR_PROGRAM__APP_AUTORESTART": "true", + 
"LOG_LEVEL": "info", + } + + with tempfile.TemporaryDirectory() as temp_dir: + config_path = os.path.join(temp_dir, "supervisord.conf") + env["SUPERVISOR_CONFIG_PATH"] = config_path + + # Simulate ML framework server + process = subprocess.Popen( + [ + sys.executable, + "-m", + "model_hosting_container_standards.supervisor.scripts.standard_supervisor", + sys.executable, + "-c", + "print('ML model server starting...', flush=True); import time; time.sleep(30); print('Ready')", + ], + env={**os.environ, **env}, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + cwd=get_python_cwd(), + ) + + try: + # Give it time to start and generate config + time.sleep(3) + + # Verify ML-specific configuration + assert os.path.exists( + config_path + ), f"Config file not found at {config_path}" + + config = parse_supervisor_config(config_path) + program_section = config["program:app"] + + # ML frameworks need longer startup and shutdown times + assert program_section["startsecs"] == "30" + assert program_section["stopwaitsecs"] == "60" + assert program_section["startretries"] == "3" + assert program_section["autorestart"] == "true" + + # Verify process management settings for ML workloads + assert program_section["stopasgroup"] == "true" + assert program_section["killasgroup"] == "true" + assert program_section["stopsignal"] == "TERM" + + finally: + # Clean up + if process.poll() is None: + process.terminate() + try: + process.communicate(timeout=5) + except subprocess.TimeoutExpired: + process.kill() + process.communicate() + + def test_signal_handling(self, clean_env): + """Test that supervisor handles signals correctly.""" + env = { + "PROCESS_MAX_START_RETRIES": "1", + "SUPERVISOR_PROGRAM__APP_STARTSECS": "1", + "SUPERVISOR_PROGRAM__APP_STOPWAITSECS": "5", + "LOG_LEVEL": "info", + } + + with tempfile.TemporaryDirectory() as temp_dir: + config_path = os.path.join(temp_dir, "supervisord.conf") + env["SUPERVISOR_CONFIG_PATH"] = config_path + + # Start a long-running process + process = subprocess.Popen( + [ + sys.executable, + "-m", + "model_hosting_container_standards.supervisor.scripts.standard_supervisor", + sys.executable, + "-c", + "import time; print('Long running process started', flush=True); time.sleep(30)", + ], + env={**os.environ, **env}, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + cwd=get_python_cwd(), + ) + + try: + # Give it time to start + time.sleep(3) + assert os.path.exists(config_path) + + # Send SIGTERM to test graceful shutdown + process.send_signal(signal.SIGTERM) + + # Wait for termination with longer timeout + # supervisord needs time to stop child processes + stdout, stderr = process.communicate(timeout=10) + + # Should have terminated (any exit code is fine, we just want it to stop) + assert process.returncode is not None + + except subprocess.TimeoutExpired: + process.kill() + process.wait() + pytest.fail("Process did not terminate gracefully within timeout") + + def test_continuous_restart_behavior(self, clean_env): + """Test that supervisor continuously restarts processes when autorestart=true.""" + env = { + "SUPERVISOR_PROGRAM__APP_STARTSECS": "2", + "SUPERVISOR_PROGRAM__APP_AUTORESTART": "true", + "SUPERVISOR_PROGRAM__APP_STARTRETRIES": "10", + "LOG_LEVEL": "info", + } + + with tempfile.TemporaryDirectory() as temp_dir: + config_path = os.path.join(temp_dir, "supervisord.conf") + restart_log = os.path.join(temp_dir, "restart_log.txt") + env["SUPERVISOR_CONFIG_PATH"] = config_path + + # Create a server that runs briefly then exits (to 
test restart) + server_script_file = os.path.join(temp_dir, "test_server.py") + with open(server_script_file, "w") as f: + f.write( + f"""import time +import sys +import os + +# Log each startup +with open('{restart_log}', 'a') as f: + f.write(f'Server started at {{time.time()}}\\n') + f.flush() + +print('Server started, PID:', os.getpid(), flush=True) + +# Run for 3 seconds then exit (supervisor will restart due to autorestart=true) +for i in range(3): + time.sleep(1) + print(f'Server running {{i+1}}/3', flush=True) + +print('Server exiting (will be restarted by supervisor)', flush=True) +sys.exit(0) +""" + ) + + # Start supervisor with the server + process = subprocess.Popen( + [ + sys.executable, + "-m", + "model_hosting_container_standards.supervisor.scripts.standard_supervisor", + sys.executable, + server_script_file, + ], + env={**os.environ, **env}, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + cwd=get_python_cwd(), + ) + + try: + # Wait for multiple restart cycles + time.sleep(10) + + # Check restart log + assert os.path.exists( + restart_log + ), "Server should have created restart log" + with open(restart_log, "r") as f: + restart_entries = f.read().strip().split("\n") + restart_count = len([line for line in restart_entries if line]) + + print(f"Server restart count: {restart_count}") + + # Should have multiple restarts + assert ( + restart_count >= 2 + ), f"Server should have been restarted multiple times, got {restart_count}" + + # Verify config + config = parse_supervisor_config(config_path) + program_section = config["program:app"] + assert program_section["autorestart"] == "true" + + print( + f"✅ Server was restarted {restart_count} times, proving continuous restart behavior" + ) + + finally: + if process.poll() is None: + process.terminate() + try: + process.communicate(timeout=5) + except subprocess.TimeoutExpired: + process.kill() + process.communicate() + + def test_startup_retry_limit(self, clean_env): + """Test that supervisor respects startretries limit.""" + env = { + "SUPERVISOR_PROGRAM__APP_STARTSECS": "5", # Process must run 5 seconds to be "started" + "SUPERVISOR_PROGRAM__APP_STARTRETRIES": "3", # Only 3 startup attempts + "SUPERVISOR_PROGRAM__APP_AUTORESTART": "true", + "LOG_LEVEL": "info", + } + + with tempfile.TemporaryDirectory() as temp_dir: + config_path = os.path.join(temp_dir, "supervisord.conf") + startup_log = os.path.join(temp_dir, "startup_attempts.txt") + env["SUPERVISOR_CONFIG_PATH"] = config_path + + # Create script that logs startup attempts then fails before startsecs + script_file = os.path.join(temp_dir, "failing_script.py") + with open(script_file, "w") as f: + f.write( + f"""import time +import os + +# Log this startup attempt +with open('{startup_log}', 'a') as f: + f.write(f'Startup attempt at {{time.time()}}\\n') + f.flush() + +print('Process starting up...', flush=True) +time.sleep(2) # Run for 2 seconds (less than startsecs=5, so it's a startup failure) +print('Process failing before startsecs...', flush=True) +exit(1) +""" + ) + + # Run supervisor with the failing script + # Use Popen since supervisord won't exit after FATAL + process = subprocess.Popen( + [ + sys.executable, + "-m", + "model_hosting_container_standards.supervisor.scripts.standard_supervisor", + sys.executable, + script_file, + ], + env={**os.environ, **env}, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + cwd=get_python_cwd(), + ) + + try: + # Wait for retries to complete (should take ~10 seconds) + time.sleep(15) + + # Verify 
config was generated + assert os.path.exists(config_path), "Config file should exist" + config = parse_supervisor_config(config_path) + program_section = config["program:app"] + assert program_section["startretries"] == "3" + assert program_section["startsecs"] == "5" + + # Check startup attempts + assert os.path.exists( + startup_log + ), "Startup log should have been created" + + with open(startup_log, "r") as f: + startup_attempts = f.read().strip().split("\n") + attempt_count = len([line for line in startup_attempts if line]) + + # Should have made exactly startretries + 1 attempts (initial + retries) + expected_attempts = 4 # 1 initial + 3 retries + assert ( + attempt_count == expected_attempts + ), f"Expected {expected_attempts} startup attempts, got {attempt_count}" + + # Check supervisord log for FATAL state + log_path = "/tmp/supervisord-app.log" + if os.path.exists(log_path): + with open(log_path, "r") as f: + log_content = f.read() + assert ( + "gave up:" in log_content + and "entered FATAL state" in log_content + ), "Supervisor should have entered FATAL state" + + print( + f"✅ Supervisor made exactly {attempt_count} startup attempts before giving up" + ) + + finally: + # Clean up + if process.poll() is None: + process.terminate() + try: + process.communicate(timeout=5) + except subprocess.TimeoutExpired: + process.kill() + process.communicate() + + def test_configuration_validation_error(self, clean_env): + """Test CLI with invalid configuration.""" + env = { + "PROCESS_MAX_START_RETRIES": "invalid_number", # Invalid value + } + + result = subprocess.run( + [ + sys.executable, + "-m", + "model_hosting_container_standards.supervisor.scripts.standard_supervisor", + "echo", + "test", + ], + env={**os.environ, **env}, + capture_output=True, + text=True, + timeout=10, + cwd=get_python_cwd(), + ) + + # Should fail due to configuration error + assert result.returncode == 1 + output = result.stdout + result.stderr + assert ( + "Configuration error" in output + or "must be an integer" in output + or "Configuration validation failed" in output + ) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/python/tests/supervisor/__init__.py b/python/tests/supervisor/__init__.py new file mode 100644 index 0000000..19f9fc1 --- /dev/null +++ b/python/tests/supervisor/__init__.py @@ -0,0 +1 @@ +"""Tests for supervisor module.""" diff --git a/python/tests/supervisor/test_generator.py b/python/tests/supervisor/test_generator.py new file mode 100644 index 0000000..e9d173c --- /dev/null +++ b/python/tests/supervisor/test_generator.py @@ -0,0 +1,364 @@ +""" +Unit tests for supervisor configuration generator. + +These tests focus on the configuration generation logic +without requiring actual file I/O or supervisor processes. 
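+
+Run with pytest (illustrative; assumes the working directory is the python/
+project root):
+
+    pytest tests/supervisor/test_generator.py -v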
+""" + +import tempfile +from pathlib import Path +from unittest.mock import patch + +import pytest + +from model_hosting_container_standards.supervisor.generator import ( + _dict_to_ini_string, + _merge_custom_sections, + generate_supervisord_config, + get_base_config_template, + write_supervisord_config, +) +from model_hosting_container_standards.supervisor.models import ( + ConfigurationError, + SupervisorConfig, +) + + +class TestGetBaseConfigTemplate: + """Test the base configuration template generation.""" + + def test_basic_template_structure(self): + """Test that basic template has all required sections.""" + template = get_base_config_template( + program_name="test_program", + log_level="info", + framework_command="echo test", + auto_restart="true", + max_start_retries=3, + ) + + # Check all required sections exist + expected_sections = [ + "supervisord", + "program:test_program", + ] + + for section in expected_sections: + assert section in template + + def test_program_section_configuration(self): + """Test program section has correct configuration.""" + template = get_base_config_template( + program_name="llm_engine", + log_level="debug", + framework_command="vllm serve model", + auto_restart="false", + max_start_retries=5, + ) + + program_section = template["program:llm_engine"] + + assert program_section["command"] == "vllm serve model" + assert program_section["autorestart"] == "false" + assert program_section["startretries"] == "5" + assert program_section["exitcodes"] == "255" + assert program_section["startsecs"] == "1" + assert program_section["stdout_logfile"] == "/dev/stdout" + assert program_section["stderr_logfile"] == "/dev/stderr" + + def test_supervisord_section_configuration(self): + """Test supervisord section has correct configuration.""" + template = get_base_config_template( + program_name="test_program", + log_level="debug", + framework_command="echo test", + auto_restart="true", + max_start_retries=3, + ) + + supervisord_section = template["supervisord"] + + assert supervisord_section["nodaemon"] == "true" + assert supervisord_section["loglevel"] == "debug" + assert "test_program" in supervisord_section["logfile"] + assert "test_program" in supervisord_section["pidfile"] + + +class TestMergeCustomSections: + """Test custom configuration section merging.""" + + def test_merge_empty_custom_sections(self): + """Test merging with empty custom sections.""" + base_config = {"program:test": {"command": "echo test", "autorestart": "true"}} + custom_sections = {} + + result = _merge_custom_sections(base_config, custom_sections) + + assert result == base_config + + def test_merge_override_existing_setting(self): + """Test overriding existing settings in base config.""" + base_config = { + "program:test": { + "command": "echo test", + "autorestart": "true", + "startsecs": "1", + } + } + custom_sections = {"program:test": {"startsecs": "10", "stopwaitsecs": "30"}} + + result = _merge_custom_sections(base_config, custom_sections) + + expected = { + "program:test": { + "command": "echo test", + "autorestart": "true", + "startsecs": "10", # Overridden + "stopwaitsecs": "30", # Added + } + } + assert result == expected + + def test_merge_add_new_section(self): + """Test adding completely new sections.""" + base_config = {"program:test": {"command": "echo test"}} + custom_sections = { + "eventlistener:memmon": { + "command": "memmon -a 200MB", + "events": "PROCESS_STATE_FATAL", + } + } + + result = _merge_custom_sections(base_config, custom_sections) + + assert 
"program:test" in result + assert "eventlistener:memmon" in result + assert result["eventlistener:memmon"]["command"] == "memmon -a 200MB" + + def test_merge_modifies_base_config(self): + """Test that merging modifies the base config in place.""" + base_config = {"program:test": {"command": "echo test", "autorestart": "true"}} + original_base = { + "program:test": {"command": "echo test", "autorestart": "true"} + } + + custom_sections = {"program:test": {"startsecs": "10"}} + + result = _merge_custom_sections(base_config, custom_sections) + + # Should modify base config in place + assert result is base_config + assert base_config != original_base + assert base_config["program:test"]["startsecs"] == "10" + + +class TestDictToIniString: + """Test INI string generation from dictionary.""" + + def test_simple_config(self): + """Test simple configuration conversion.""" + config_dict = { + "section1": {"key1": "value1", "key2": "value2"}, + "section2": {"key3": "value3"}, + } + + result = _dict_to_ini_string(config_dict) + + assert "[section1]" in result + assert "key1 = value1" in result + assert "key2 = value2" in result + assert "[section2]" in result + assert "key3 = value3" in result + + def test_empty_config(self): + """Test empty configuration conversion.""" + config_dict = {} + result = _dict_to_ini_string(config_dict) + assert result == "" + + def test_section_ordering(self): + """Test that sections are properly separated with empty lines.""" + config_dict = {"section1": {"key1": "value1"}, "section2": {"key2": "value2"}} + + result = _dict_to_ini_string(config_dict) + lines = result.split("\n") + + # Expected structure: + # [section1] <- lines[0] + # key1=value1 <- lines[1] + # (empty line) <- lines[2] + # [section2] <- lines[3] + # key2=value2 <- lines[4] + # (empty line) <- lines[5] + + # Find section positions + section1_idx = lines.index("[section1]") + section2_idx = lines.index("[section2]") + + # Verify empty line after section1's content (section1 + key + empty line) + assert lines[section1_idx + 2] == "", "Missing empty line after section1" + + # Verify empty line after section2's content for consistency + assert lines[section2_idx + 2] == "", "Missing empty line after section2" + + # Verify sections are in correct order + assert section1_idx < section2_idx, "Sections should maintain order" + + +class TestGenerateSupervisordConfig: + """Test the main configuration generation function.""" + + def test_basic_generation(self): + """Test basic configuration generation.""" + config = SupervisorConfig( + auto_recovery=True, max_start_retries=3, log_level="info" + ) + + result = generate_supervisord_config(config, "echo test", "test_program") + + assert "[program:test_program]" in result + assert "command = echo test" in result + assert "autorestart = true" in result + assert "startretries = 3" in result + + def test_auto_recovery_disabled(self): + """Test configuration with auto recovery disabled.""" + config = SupervisorConfig( + auto_recovery=False, max_start_retries=1, log_level="debug" + ) + + result = generate_supervisord_config(config, "python script.py", "my_program") + + assert "autorestart = false" in result + assert "startretries = 1" in result + assert "loglevel = debug" in result + + def test_custom_sections_integration(self): + """Test integration with custom sections.""" + custom_sections = { + "program:llm_engine": {"startsecs": "15", "stopwaitsecs": "45"}, + "supervisord": {"logfile_maxbytes": "100MB"}, + } + + config = SupervisorConfig( + auto_recovery=True, + 
max_start_retries=5, + log_level="info", + custom_sections=custom_sections, + ) + + result = generate_supervisord_config(config, "vllm serve model", "llm_engine") + + assert "startsecs = 15" in result + assert "stopwaitsecs = 45" in result + assert "logfile_maxbytes = 100MB" in result + assert "startretries = 5" in result + + def test_empty_launch_command_error(self): + """Test error handling for empty launch command.""" + config = SupervisorConfig() + + with pytest.raises(ValueError, match="Launch command cannot be empty"): + generate_supervisord_config(config, "", "test_program") + + with pytest.raises(ValueError, match="Launch command cannot be empty"): + generate_supervisord_config(config, " ", "test_program") + + def test_empty_program_name_error(self): + """Test error handling for empty program name.""" + config = SupervisorConfig() + + with pytest.raises(ValueError, match="Program name cannot be empty"): + generate_supervisord_config(config, "echo test", "") + + with pytest.raises(ValueError, match="Program name cannot be empty"): + generate_supervisord_config(config, "echo test", " ") + + def test_special_characters_in_command(self): + """Test handling of special characters in launch command.""" + config = SupervisorConfig() + + command_with_quotes = "python -c \"print('Hello World')\"" + result = generate_supervisord_config( + config, command_with_quotes, "test_program" + ) + + assert command_with_quotes in result + + @patch( + "model_hosting_container_standards.supervisor.generator.get_base_config_template" + ) + def test_exception_handling(self, mock_get_template): + """Test exception handling in configuration generation.""" + mock_get_template.side_effect = Exception("Template error") + + config = SupervisorConfig() + + with pytest.raises( + ConfigurationError, match="Failed to generate supervisord configuration" + ): + generate_supervisord_config(config, "echo test", "test_program") + + +class TestWriteSupervisordConfig: + """Test configuration file writing.""" + + def test_successful_write(self): + """Test successful configuration file writing.""" + config = SupervisorConfig( + auto_recovery=True, max_start_retries=2, log_level="info" + ) + + with tempfile.NamedTemporaryFile(mode="w", delete=False) as f: + temp_path = f.name + + try: + write_supervisord_config(temp_path, config, "echo test", "test_program") + + # Verify file was created and has content + content = Path(temp_path).read_text() + assert "[program:test_program]" in content + assert "command = echo test" in content + assert "startretries = 2" in content + + finally: + Path(temp_path).unlink(missing_ok=True) + + def test_directory_creation(self): + """Test that parent directories are created if they don't exist.""" + config = SupervisorConfig() + + with tempfile.TemporaryDirectory() as temp_dir: + nested_path = Path(temp_dir) / "nested" / "dir" / "config.conf" + + write_supervisord_config( + str(nested_path), config, "echo test", "test_program" + ) + + assert nested_path.exists() + content = nested_path.read_text() + assert "[program:test_program]" in content + + @patch("builtins.open", side_effect=OSError("Permission denied")) + def test_write_permission_error(self, mock_open): + """Test handling of file write permission errors.""" + config = SupervisorConfig() + + with pytest.raises(OSError, match="Failed to write configuration file"): + write_supervisord_config( + "/invalid/path/config.conf", config, "echo test", "test_program" + ) + + def test_invalid_launch_command_propagation(self): + """Test that validation 
errors are properly propagated.""" + config = SupervisorConfig() + + with tempfile.NamedTemporaryFile() as f: + with pytest.raises( + ConfigurationError, match="Launch command cannot be empty" + ): + write_supervisord_config(f.name, config, "", "test_program") + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/python/tests/supervisor/test_models.py b/python/tests/supervisor/test_models.py new file mode 100644 index 0000000..3a23faf --- /dev/null +++ b/python/tests/supervisor/test_models.py @@ -0,0 +1,505 @@ +""" +Unit tests for supervisor models module. + +Tests configuration parsing, validation functions, and error handling. +""" + +import os +from unittest.mock import patch + +import pytest + +from model_hosting_container_standards.supervisor.models import ( + ConfigurationError, + SupervisorConfig, + _get_env_int, + _get_env_str, + _parse_bool, + _parse_supervisor_custom_sections, + parse_environment_variables, +) + + +class TestSupervisorConfig: + """Test the SupervisorConfig dataclass.""" + + def test_default_values(self): + """Test SupervisorConfig with default values.""" + config = SupervisorConfig() + + assert config.auto_recovery is True + assert config.max_start_retries == 3 + assert config.config_path == "/tmp/supervisord.conf" + assert config.log_level == "info" + assert config.custom_sections == {} + + def test_custom_values(self): + """Test SupervisorConfig with custom values.""" + custom_sections = {"program": {"startsecs": "10"}} + config = SupervisorConfig( + auto_recovery=False, + max_start_retries=5, + config_path="/custom/path.conf", + log_level="debug", + custom_sections=custom_sections, + ) + + assert config.auto_recovery is False + assert config.max_start_retries == 5 + assert config.config_path == "/custom/path.conf" + assert config.log_level == "debug" + assert config.custom_sections == custom_sections + + +class TestParseBool: + """Test the _parse_bool helper function.""" + + def test_true_values(self): + """Test values that should parse to True.""" + true_values = ["true", "True", "TRUE", "1", "yes", "YES", "on", "ON"] + for value in true_values: + assert _parse_bool(value) is True + + def test_false_values(self): + """Test values that should parse to False.""" + false_values = ["false", "False", "FALSE", "0", "no", "NO", "off", "OFF", ""] + for value in false_values: + assert _parse_bool(value) is False + + def test_mixed_case(self): + """Test mixed case values.""" + assert _parse_bool("TrUe") is True + assert _parse_bool("FaLsE") is False + assert _parse_bool("YeS") is True + assert _parse_bool("nO") is False + + +class TestGetEnvInt: + """Test the _get_env_int helper function.""" + + def test_default_value(self): + """Test returning default when env var not set.""" + result = _get_env_int("NONEXISTENT_VAR", 42) + assert result == 42 + + def test_valid_integer(self): + """Test parsing valid integer from environment.""" + with patch.dict(os.environ, {"TEST_INT": "25"}): + result = _get_env_int("TEST_INT", 10) + assert result == 25 + + def test_boundary_values(self): + """Test boundary validation.""" + with patch.dict(os.environ, {"TEST_INT": "5"}): + result = _get_env_int("TEST_INT", 10, min_val=0, max_val=10) + assert result == 5 + + def test_invalid_integer(self): + """Test error on invalid integer.""" + with patch.dict(os.environ, {"TEST_INT": "not_a_number"}): + with pytest.raises(ConfigurationError, match="must be an integer"): + _get_env_int("TEST_INT", 10) + + def test_below_minimum(self): + """Test error when value below 
minimum.""" + with patch.dict(os.environ, {"TEST_INT": "-5"}): + with pytest.raises(ConfigurationError, match="must be between 0 and 100"): + _get_env_int("TEST_INT", 10, min_val=0, max_val=100) + + def test_above_maximum(self): + """Test error when value above maximum.""" + with patch.dict(os.environ, {"TEST_INT": "150"}): + with pytest.raises(ConfigurationError, match="must be between 0 and 100"): + _get_env_int("TEST_INT", 10, min_val=0, max_val=100) + + def test_empty_string(self): + """Test empty string returns default.""" + with patch.dict(os.environ, {"TEST_INT": ""}): + result = _get_env_int("TEST_INT", 42) + assert result == 42 + + def test_whitespace_only(self): + """Test whitespace-only string raises error.""" + with patch.dict(os.environ, {"TEST_INT": " "}): + with pytest.raises(ConfigurationError, match="must be an integer"): + _get_env_int("TEST_INT", 42) + + +class TestGetEnvStr: + """Test the _get_env_str helper function.""" + + def test_default_value(self): + """Test returning default when env var not set.""" + result = _get_env_str("NONEXISTENT_VAR", "default") + assert result == "default" + + def test_valid_string(self): + """Test getting valid string from environment.""" + with patch.dict(os.environ, {"TEST_STR": "test_value"}): + result = _get_env_str("TEST_STR", "default") + assert result == "test_value" + + def test_whitespace_trimming(self): + """Test that whitespace is trimmed.""" + with patch.dict(os.environ, {"TEST_STR": " test_value "}): + result = _get_env_str("TEST_STR", "default") + assert result == "test_value" + + def test_allowed_values_valid(self): + """Test validation with allowed values - valid case.""" + with patch.dict(os.environ, {"TEST_STR": "debug"}): + result = _get_env_str("TEST_STR", "info", allowed=["debug", "info", "warn"]) + assert result == "debug" + + def test_allowed_values_case_insensitive(self): + """Test validation with allowed values is case insensitive.""" + with patch.dict(os.environ, {"TEST_STR": "DEBUG"}): + result = _get_env_str("TEST_STR", "info", allowed=["debug", "info", "warn"]) + assert result == "DEBUG" + + def test_allowed_values_invalid(self): + """Test error when value not in allowed list.""" + with patch.dict(os.environ, {"TEST_STR": "invalid"}): + with pytest.raises(ConfigurationError, match="must be one of"): + _get_env_str("TEST_STR", "info", allowed=["debug", "info", "warn"]) + + def test_empty_string_with_allowed(self): + """Test empty string with allowed values raises error.""" + with patch.dict(os.environ, {"TEST_STR": ""}): + with pytest.raises(ConfigurationError, match="must be one of"): + _get_env_str("TEST_STR", "info", allowed=["debug", "info", "warn"]) + + +class TestParseSupervisorCustomSections: + """Test the _parse_supervisor_custom_sections helper function.""" + + def test_empty_environment(self): + """Test with no SUPERVISOR_ environment variables.""" + with patch.dict(os.environ, {}, clear=True): + result = _parse_supervisor_custom_sections() + assert result == {} + + def test_skip_config_path(self): + """Test that SUPERVISOR_CONFIG_PATH is skipped.""" + test_env = {"SUPERVISOR_CONFIG_PATH": "/tmp/test.conf"} + with patch.dict(os.environ, test_env, clear=True): + result = _parse_supervisor_custom_sections() + assert result == {} + + def test_basic_sections(self): + """Test parsing basic section configurations.""" + test_env = { + "SUPERVISOR_PROGRAM_STARTSECS": "10", + "SUPERVISOR_SUPERVISORD_LOGLEVEL": "debug", + } + with patch.dict(os.environ, test_env, clear=True): + result = 
_parse_supervisor_custom_sections() + + expected = { + "program": {"startsecs": "10"}, + "supervisord": {"loglevel": "debug"}, + } + assert result == expected + + def test_colon_sections(self): + """Test parsing sections with colons (double underscore conversion).""" + test_env = { + "SUPERVISOR_PROGRAM__WEB_COMMAND": "gunicorn app:app", + "SUPERVISOR_RPCINTERFACE__SUPERVISOR_FACTORY": "supervisor.rpcinterface:make_main_rpcinterface", + } + with patch.dict(os.environ, test_env, clear=True): + result = _parse_supervisor_custom_sections() + + expected = { + "program:web": {"command": "gunicorn app:app"}, + "rpcinterface:supervisor": { + "factory": "supervisor.rpcinterface:make_main_rpcinterface" + }, + } + assert result == expected + + def test_mixed_sections(self): + """Test parsing mix of basic and colon sections.""" + test_env = { + "SUPERVISOR_PROGRAM_AUTORESTART": "true", + "SUPERVISOR_PROGRAM__API_DIRECTORY": "/app/api", + "SUPERVISOR_SUPERVISORD_NODAEMON": "true", + } + with patch.dict(os.environ, test_env, clear=True): + result = _parse_supervisor_custom_sections() + + expected = { + "program": {"autorestart": "true"}, + "program:api": {"directory": "/app/api"}, + "supervisord": {"nodaemon": "true"}, + } + assert result == expected + + def test_case_conversion(self): + """Test that section names and keys are converted to lowercase.""" + test_env = { + "SUPERVISOR_PROGRAM_STARTSECS": "10", + "SUPERVISOR_PROGRAM__WEB_COMMAND": "gunicorn app:app", + } + with patch.dict(os.environ, test_env, clear=True): + result = _parse_supervisor_custom_sections() + + # Verify all keys are lowercase + assert "program" in result + assert "program:web" in result + assert "startsecs" in result["program"] + assert "command" in result["program:web"] + + def test_whitespace_trimming(self): + """Test that values are trimmed of whitespace.""" + test_env = { + "SUPERVISOR_PROGRAM_COMMAND": " python app.py ", + } + with patch.dict(os.environ, test_env, clear=True): + result = _parse_supervisor_custom_sections() + + assert result["program"]["command"] == "python app.py" + + def test_valid_format_parsing(self): + """Test that valid format environment variables are parsed correctly.""" + test_env = { + "SUPERVISOR_PROGRAM_COMMAND": "python app.py", + "SUPERVISOR_PROGRAM__WEB_DIRECTORY": "/app", + "SUPERVISOR_SUPERVISORD_LOGLEVEL": "info", + } + + with patch.dict(os.environ, test_env, clear=True): + result = _parse_supervisor_custom_sections() + + # Should parse correctly + expected = { + "program": {"command": "python app.py"}, + "program:web": {"directory": "/app"}, + "supervisord": {"loglevel": "info"}, + } + assert result == expected + + def test_invalid_format_ignored(self): + """Test that invalid format environment variables are ignored.""" + test_env = { + "SUPERVISOR_": "invalid", # No section or key + "SUPERVISOR_PROGRAM": "invalid", # No key (no underscore) + "SUPERVISOR_PROGRAM_": "invalid", # Empty key name + "SUPERVISOR__WEB_COMMAND": "gunicorn app:app", # Empty section name + } + + with patch.dict(os.environ, test_env, clear=True): + result = _parse_supervisor_custom_sections() + + # All invalid formats should be ignored, result should be empty + assert result == {} + + def test_leading_underscore_in_section_rejected(self): + """Test that section names with leading underscores are rejected.""" + test_env = { + "SUPERVISOR__PROGRAM_COMMAND": "python app.py", # Leading underscore in section + } + + with patch.dict(os.environ, test_env, clear=True): + result = _parse_supervisor_custom_sections() + assert 
result == {} + + def test_trailing_underscore_in_section_rejected(self): + """Test that section names with trailing underscores are rejected.""" + test_env = { + "SUPERVISOR_PROGRAM__COMMAND": "python app.py", # Trailing underscore in section (before key) + } + + with patch.dict(os.environ, test_env, clear=True): + result = _parse_supervisor_custom_sections() + assert result == {} + + def test_multiple_consecutive_underscores_rejected(self): + """Test that three or more consecutive underscores are rejected.""" + test_env = { + "SUPERVISOR_PROGRAM___WEB_COMMAND": "gunicorn app:app", # Three underscores + } + + with patch.dict(os.environ, test_env, clear=True): + result = _parse_supervisor_custom_sections() + assert result == {} + + def test_leading_underscore_in_key_rejected(self): + """Test that key names with leading underscores are rejected.""" + test_env = { + "SUPERVISOR_PROGRAM__COMMAND": "python app.py", # Leading underscore in key + } + + with patch.dict(os.environ, test_env, clear=True): + result = _parse_supervisor_custom_sections() + assert result == {} + + def test_trailing_underscore_in_key_rejected(self): + """Test that key names with trailing underscores are rejected.""" + test_env = { + "SUPERVISOR_PROGRAM_COMMAND_": "python app.py", # Trailing underscore in key + } + + with patch.dict(os.environ, test_env, clear=True): + result = _parse_supervisor_custom_sections() + assert result == {} + + def test_numeric_only_sections_and_keys_accepted(self): + """Test that purely numeric section and key names are accepted.""" + test_env = { + "SUPERVISOR_123_456": "value", # Numeric section and key + } + + with patch.dict(os.environ, test_env, clear=True): + result = _parse_supervisor_custom_sections() + + expected = { + "123": {"456": "value"}, + } + assert result == expected + + def test_mixed_alphanumeric_accepted(self): + """Test that mixed alphanumeric section and key names are accepted.""" + test_env = { + "SUPERVISOR_PROGRAM2_COMMAND3": "python app.py", + "SUPERVISOR_WEB1__API2_PORT8080": "8080", + } + + with patch.dict(os.environ, test_env, clear=True): + result = _parse_supervisor_custom_sections() + + expected = { + "program2": {"command3": "python app.py"}, + "web1:api2": {"port8080": "8080"}, + } + assert result == expected + + +class TestParseEnvironmentVariables: + """Test the main parse_environment_variables function.""" + + def test_defaults(self): + """Test parsing with default values.""" + # Clear supervisor-related env vars + supervisor_vars = { + k: v + for k, v in os.environ.items() + if k.startswith( + ( + "PROCESS_AUTO_RECOVERY", + "PROCESS_MAX_START_RETRIES", + "LOG_LEVEL", + "SUPERVISOR_", + ) + ) + } + + with patch.dict(os.environ, {}, clear=False): + # Remove supervisor vars + for key in supervisor_vars: + os.environ.pop(key, None) + + try: + config = parse_environment_variables() + + assert config.auto_recovery is True + assert config.max_start_retries == 3 + assert config.config_path == "/tmp/supervisord.conf" + assert config.log_level == "info" + assert config.custom_sections == {} + finally: + # Restore original env vars + os.environ.update(supervisor_vars) + + def test_all_custom_values(self): + """Test parsing with all custom values.""" + test_env = { + "PROCESS_AUTO_RECOVERY": "false", + "PROCESS_MAX_START_RETRIES": "5", + "SUPERVISOR_CONFIG_PATH": "/custom/supervisord.conf", + "LOG_LEVEL": "debug", + "SUPERVISOR_PROGRAM_STARTSECS": "10", + "SUPERVISOR_PROGRAM__WEB_COMMAND": "gunicorn app:app", + } + + with patch.dict(os.environ, test_env): + config = 
parse_environment_variables() + + assert config.auto_recovery is False + assert config.max_start_retries == 5 + assert config.config_path == "/custom/supervisord.conf" + assert config.log_level == "debug" + + expected_custom = { + "program": {"startsecs": "10"}, + "program:web": {"command": "gunicorn app:app"}, + } + assert config.custom_sections == expected_custom + + def test_invalid_max_start_retries(self): + """Test error handling for invalid PROCESS_MAX_START_RETRIES.""" + with patch.dict(os.environ, {"PROCESS_MAX_START_RETRIES": "invalid"}): + with pytest.raises(ConfigurationError, match="must be an integer"): + parse_environment_variables() + + def test_invalid_log_level(self): + """Test error handling for invalid LOG_LEVEL.""" + with patch.dict(os.environ, {"LOG_LEVEL": "invalid"}): + with pytest.raises(ConfigurationError, match="must be one of"): + parse_environment_variables() + + def test_max_start_retries_out_of_range(self): + """Test error handling for PROCESS_MAX_START_RETRIES out of range.""" + with patch.dict(os.environ, {"PROCESS_MAX_START_RETRIES": "150"}): + with pytest.raises(ConfigurationError, match="must be between 0 and 100"): + parse_environment_variables() + + def test_configuration_error_logging(self): + """Test that configuration errors are logged.""" + with patch.dict(os.environ, {"PROCESS_MAX_START_RETRIES": "invalid"}): + with patch( + "model_hosting_container_standards.supervisor.models.logger" + ) as mock_logger: + with pytest.raises(ConfigurationError): + parse_environment_variables() + + mock_logger.error.assert_called_once() + assert ( + "Configuration validation failed" + in mock_logger.error.call_args[0][0] + ) + + def test_boolean_variations(self): + """Test various boolean value formats for PROCESS_AUTO_RECOVERY.""" + test_cases = [ + ("true", True), + ("True", True), + ("TRUE", True), + ("1", True), + ("yes", True), + ("on", True), + ("false", False), + ("False", False), + ("FALSE", False), + ("0", False), + ("no", False), + ("off", False), + ] + + for env_value, expected in test_cases: + with patch.dict(os.environ, {"PROCESS_AUTO_RECOVERY": env_value}): + config = parse_environment_variables() + assert config.auto_recovery is expected + + def test_log_level_case_insensitive(self): + """Test that LOG_LEVEL validation is case insensitive.""" + test_cases = ["debug", "DEBUG", "Debug", "INFO", "info", "WARN", "warn"] + + for log_level in test_cases: + with patch.dict(os.environ, {"LOG_LEVEL": log_level}): + config = parse_environment_variables() + assert config.log_level == log_level + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/python/tests/supervisor/test_standard_supervisor.py b/python/tests/supervisor/test_standard_supervisor.py new file mode 100644 index 0000000..b9bbe2a --- /dev/null +++ b/python/tests/supervisor/test_standard_supervisor.py @@ -0,0 +1,303 @@ +""" +Unit tests for StandardSupervisor CLI components. + +These tests focus on individual components of the standard-supervisor CLI +without requiring actual supervisor processes or system integration. 
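+Subprocess, signal, and filesystem interactions are replaced with mocks throughout.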
+""" + +import os +import signal +import subprocess +import sys +from unittest.mock import Mock, patch + +import pytest + +from model_hosting_container_standards.supervisor.scripts.standard_supervisor import ( + ProcessManager, + SignalHandler, + StandardSupervisor, +) + + +class TestProcessManager: + """Test the ProcessManager class.""" + + def test_init(self): + """Test ProcessManager initialization.""" + logger = Mock() + manager = ProcessManager(logger) + + assert manager.logger == logger + assert manager.process is None + + @patch("shutil.which") + def test_check_tools_available_success(self, mock_which): + """Test successful tool availability check.""" + mock_which.return_value = "/usr/bin/supervisord" + + logger = Mock() + manager = ProcessManager(logger) + + available, missing = manager.check_tools_available() + + assert available is True + assert missing == "" + assert mock_which.call_count == 1 # Only supervisord + + @patch("shutil.which") + def test_check_tools_available_missing_supervisord(self, mock_which): + """Test tool availability check with missing supervisord.""" + + def mock_which_side_effect(tool): + if tool == "supervisord": + return None + return "/usr/bin/tool" + + mock_which.side_effect = mock_which_side_effect + + logger = Mock() + manager = ProcessManager(logger) + + available, missing = manager.check_tools_available() + + assert available is False + assert missing == "supervisord" + + @patch("subprocess.Popen") + @patch("subprocess.run") + @patch("time.sleep") + def test_start_success(self, mock_sleep, mock_run, mock_popen): + """Test successful process start.""" + # Mock successful process start + mock_process = Mock() + mock_process.poll.return_value = None # Process is running + mock_process.pid = 12345 + mock_popen.return_value = mock_process + + logger = Mock() + manager = ProcessManager(logger) + + result = manager.start("/tmp/test.conf") + + assert result == mock_process + assert manager.process == mock_process + mock_popen.assert_called_once_with(["supervisord", "-c", "/tmp/test.conf"]) + mock_sleep.assert_called_once_with(1.0) + + @patch("subprocess.Popen") + @patch("time.sleep") + def test_start_failure(self, mock_sleep, mock_popen): + """Test process start failure.""" + # Mock failed process start + mock_process = Mock() + mock_process.poll.return_value = 1 # Process exited with error + mock_process.returncode = 1 + mock_popen.return_value = mock_process + + logger = Mock() + manager = ProcessManager(logger) + + with pytest.raises(RuntimeError, match="Supervisord failed to start"): + manager.start("/tmp/test.conf") + + def test_terminate_no_process(self): + """Test terminate when no process is running.""" + logger = Mock() + manager = ProcessManager(logger) + + # Should not raise an exception + manager.terminate() + + def test_terminate_success(self): + """Test successful process termination.""" + mock_process = Mock() + mock_process.terminate.return_value = None + mock_process.wait.return_value = 0 + + logger = Mock() + manager = ProcessManager(logger) + manager.process = mock_process + + manager.terminate() + + mock_process.terminate.assert_called_once() + mock_process.wait.assert_called_once_with(timeout=5) + + def test_terminate_timeout_and_kill(self): + """Test process termination with timeout and force kill.""" + mock_process = Mock() + mock_process.terminate.return_value = None + mock_process.wait.side_effect = [subprocess.TimeoutExpired("cmd", 5), 0] + mock_process.kill.return_value = None + + logger = Mock() + manager = ProcessManager(logger) 
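+        # Attach the mocked supervisord handle; wait() times out once, so terminate() falls back to kill()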
+ manager.process = mock_process + + manager.terminate() + + mock_process.terminate.assert_called_once() + mock_process.kill.assert_called_once() + assert mock_process.wait.call_count == 2 + + +class TestSignalHandler: + """Test the SignalHandler class.""" + + def test_init(self): + """Test SignalHandler initialization.""" + process_manager = Mock() + logger = Mock() + handler = SignalHandler(process_manager, logger) + + assert handler.process_manager == process_manager + assert handler.logger == logger + assert handler._original_handlers == {} + + @patch("signal.signal") + def test_setup(self, mock_signal): + """Test signal handler setup.""" + process_manager = Mock() + logger = Mock() + handler = SignalHandler(process_manager, logger) + + # Mock original handlers + original_term = Mock() + original_int = Mock() + mock_signal.side_effect = [original_term, original_int] + + handler.setup() + + # Verify signal handlers were set + assert mock_signal.call_count == 2 + calls = mock_signal.call_args_list + assert calls[0][0][0] == signal.SIGTERM + assert calls[1][0][0] == signal.SIGINT + + # Verify original handlers were stored + assert handler._original_handlers[signal.SIGTERM] == original_term + assert handler._original_handlers[signal.SIGINT] == original_int + + +class TestStandardSupervisor: + """Test the StandardSupervisor main class.""" + + def test_init(self): + """Test StandardSupervisor initialization.""" + supervisor = StandardSupervisor() + + assert supervisor.logger is not None + assert supervisor.process_manager is not None + assert supervisor.signal_handler is not None + + @patch.dict(os.environ, {"LOG_LEVEL": "DEBUG"}) + def test_setup_logging_debug(self): + """Test logging setup with DEBUG level.""" + supervisor = StandardSupervisor() + + # Logger should be set to DEBUG level + assert supervisor.logger.level <= 10 # DEBUG is 10 + + @patch.dict(os.environ, {"LOG_LEVEL": "ERROR"}) + def test_setup_logging_error(self): + """Test logging setup with ERROR level.""" + supervisor = StandardSupervisor() + + # Logger should be set to ERROR level + assert supervisor.logger.level >= 40 # ERROR is 40 + + def test_parse_arguments_valid(self): + """Test argument parsing with valid arguments.""" + supervisor = StandardSupervisor() + + with patch.object(sys, "argv", ["standard-supervisor", "echo", "hello"]): + result = supervisor.parse_arguments() + assert result == ["echo", "hello"] + + def test_parse_arguments_complex(self): + """Test argument parsing with complex command.""" + supervisor = StandardSupervisor() + + with patch.object( + sys, + "argv", + ["standard-supervisor", "vllm", "serve", "model", "--host", "0.0.0.0"], + ): + result = supervisor.parse_arguments() + assert result == ["vllm", "serve", "model", "--host", "0.0.0.0"] + + def test_parse_arguments_empty(self): + """Test argument parsing with no arguments.""" + supervisor = StandardSupervisor() + + with patch.object(sys, "argv", ["standard-supervisor"]): + with pytest.raises(SystemExit) as exc_info: + supervisor.parse_arguments() + assert exc_info.value.code == 1 + + @patch( + "model_hosting_container_standards.supervisor.scripts.standard_supervisor.parse_environment_variables" + ) + @patch( + "model_hosting_container_standards.supervisor.scripts.standard_supervisor.write_supervisord_config" + ) + def test_run_success_flow(self, mock_write_config, mock_parse_env): + """Test successful run flow.""" + # Mock configuration + mock_config = Mock() + mock_config.config_path = "/tmp/test.conf" + mock_parse_env.return_value = mock_config 
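+        # run() is expected to write the generated config to mock_config.config_path and then launch supervisord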
+ + # Mock process manager + mock_process = Mock() + mock_process.poll.side_effect = [None, None, 0] # Running, then exit + mock_process.returncode = 0 + + supervisor = StandardSupervisor() + supervisor.process_manager.check_tools_available = Mock(return_value=(True, "")) + supervisor.process_manager.start = Mock(return_value=mock_process) + supervisor.signal_handler.setup = Mock() + + with patch.object(sys, "argv", ["standard-supervisor", "echo", "test"]): + with patch("time.sleep"): # Mock sleep to speed up test + result = supervisor.run() + + assert result == 0 + mock_write_config.assert_called_once() + supervisor.process_manager.start.assert_called_once() + + def test_run_missing_tools(self): + """Test run with missing supervisor tools.""" + supervisor = StandardSupervisor() + supervisor.process_manager.check_tools_available = Mock( + return_value=(False, "supervisord") + ) + + with patch.object(sys, "argv", ["standard-supervisor", "echo", "test"]): + result = supervisor.run() + + assert result == 1 + + @patch( + "model_hosting_container_standards.supervisor.scripts.standard_supervisor.parse_environment_variables" + ) + def test_run_configuration_error(self, mock_parse_env): + """Test run with configuration error.""" + from model_hosting_container_standards.supervisor.models import ( + ConfigurationError, + ) + + mock_parse_env.side_effect = ConfigurationError("Invalid config") + + supervisor = StandardSupervisor() + supervisor.process_manager.check_tools_available = Mock(return_value=(True, "")) + + with patch.object(sys, "argv", ["standard-supervisor", "echo", "test"]): + result = supervisor.run() + + assert result == 1 + + +if __name__ == "__main__": + pytest.main([__file__, "-v"])
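The environment-variable convention exercised by `tests/supervisor/test_models.py` can be non-obvious at first glance, so here is a minimal, illustrative sketch of the mapping. It assumes the package from this PR is installed and that no other `SUPERVISOR_*` variables are set; the values are examples taken from the test cases, not a prescribed configuration.

```python
import os

from model_hosting_container_standards.supervisor.models import parse_environment_variables

# Illustrative values mirroring the cases in test_models.py:
# a single underscore separates SECTION and KEY, while a double underscore
# inside the section name becomes a colon (program:web).
os.environ["PROCESS_MAX_START_RETRIES"] = "5"
os.environ["SUPERVISOR_PROGRAM_STARTSECS"] = "10"                   # -> [program] startsecs = 10
os.environ["SUPERVISOR_PROGRAM__WEB_COMMAND"] = "gunicorn app:app"  # -> [program:web] command = ...

config = parse_environment_variables()
print(config.max_start_retries)   # 5
print(config.custom_sections)
# {'program': {'startsecs': '10'}, 'program:web': {'command': 'gunicorn app:app'}}
```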