# Executes the Trainer MNIST example notebook against SDK code taken from a
# pull request. The run is started by a `repository_dispatch` event of type
# `run-sdk-pr-e2e-tests`; the event's client_payload supplies `sdk_repo` and
# `sdk_sha`, which select the SDK revision that is checked out and installed.
name: SDK PR Notebook E2E Runner

on:
  repository_dispatch:
    types: [run-sdk-pr-e2e-tests]

jobs:
  run-notebook-test:
    runs-on: kubeflow-devx-testing
    # No step in this job pushes commits, tags or releases, and the job runs
    # code selected by the dispatch payload, so the token is kept read-only.
    permissions:
      contents: read

    steps:
      - name: Checkout SDK PR Code
        uses: actions/checkout@v4
        with:
          repository: ${{ github.event.client_payload.sdk_repo }}
          ref: ${{ github.event.client_payload.sdk_sha }}
          path: sdk-repo  # Checkout the SDK code into a subdirectory
          # The PR code is installed and executed later in this job; do not
          # leave the workflow token in its git config.
          persist-credentials: false

      # NOTE(review): this checkout uses `ref: main`, while the manifests in
      # the "Install Kubeflow Trainer" step are fetched with `?ref=master`.
      # Confirm which branch is intended and make the two agree.
      - name: Checkout Trainer Repo (Test Asset)
        uses: actions/checkout@v4
        with:
          repository: opendatahub-io/trainer
          ref: main
          path: trainer-repo  # Checkout the Trainer code into a subdirectory
          persist-credentials: false

      # Kubernetes Cluster Setup
      # This step may not be needed with the larger runner
      - name: Free up disk space
        run: |
          echo "Available disk space before cleanup:"
          df -h

          # Remove unnecessary packages and files to free up space
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /opt/ghc
          sudo rm -rf /opt/hostedtoolcache/CodeQL

          # Clean up Docker
          docker system prune -af

          echo "Available disk space after cleanup:"
          df -h

      - name: Set up Kind Kubernetes Cluster
        uses: helm/kind-action@v1
        with:
          cluster_name: kubeflow-test
          wait: 300s

      - name: Verify Kind Cluster
        run: |
          kubectl cluster-info
          kubectl get nodes
          kubectl get pods -A

      - name: Install Kubeflow Trainer
        run: |
          echo "Installing Kubeflow Trainer..."
          # Install the Trainer manager using kubectl with server-side apply to handle large CRDs
          kubectl apply --server-side -k "github.com/opendatahub-io/trainer/manifests/overlays/manager?ref=master"

          # Wait for the trainer deployment to be ready
          echo "Waiting for Trainer controller to be ready..."
          kubectl wait --for=condition=available --timeout=300s \
            deployment/kubeflow-trainer-controller-manager -n kubeflow-system

          # Wait a bit more for webhooks to be fully ready
          echo "Waiting for webhooks to be ready..."
          sleep 30

          # Install the default training runtimes
          echo "Installing Trainer runtimes..."
          kubectl apply --server-side -k "github.com/opendatahub-io/trainer/manifests/overlays/runtimes?ref=master"

          # Verify installation
          echo "Verifying Trainer installation..."
          kubectl get deployment -n kubeflow-system
          kubectl get pods -n kubeflow-system

          # Verify runtimes are installed
          echo "Verifying ClusterTrainingRuntimes..."
          kubectl get clustertrainingruntimes

      # Test Environment Setup
      - name: Set up Python 3.9
        uses: actions/setup-python@v6
        with:
          python-version: '3.9'

      - name: Setup Environment and Install Dependencies
        shell: bash
        run: |
          echo "Installing tools and dependencies..."

          # 1. Install Papermill and core execution tools
          pip install papermill==2.6.0 jupyter==1.1.1 ipykernel==6.29.5

          # 2. Install main dependencies (Manually listed from pyproject.toml [project.dependencies])
          # Each specifier is quoted: an unquoted `>=` is parsed by bash as an
          # output redirection, which silently drops the version constraint.
          echo "Installing main SDK dependencies..."
          pip install \
            "kubernetes>=27.2.0" \
            "pydantic>=2.10.0" \
            "kubeflow-trainer-api>=2.0.0" \
            "kubeflow-katib-api>=0.19.0"

          # 3. Install the SDK Code itself in editable mode
          cd sdk-repo
          echo "Installing SDK PR code in editable mode..."
          # Quoted so the shell never treats `[dev]` as a glob character class.
          pip install -e ".[dev]"

          # 4. Configure Notebook Kernel
          PYTHON_BIN=$(which python)
          "$PYTHON_BIN" -m ipykernel install --user --name=sdk-test-kernel --display-name "Python (SDK Test)"

          cd ..
          mkdir -p artifacts/notebooks  # Create artifact directory

      - name: Configure Kubernetes Access
        run: |
          # Ensure kubeconfig is accessible
          mkdir -p ~/.kube
          kind get kubeconfig --name kubeflow-test > ~/.kube/config
          chmod 600 ~/.kube/config

          # Verify access
          kubectl get nodes
          kubectl get namespaces

      - name: Run E2E Notebook Test with Papermill
        id: run-test
        run: |
          # Use the python executable where Papermill was installed
          PAPERMILL_BIN=$(which papermill)
          echo "Using Papermill from: $PAPERMILL_BIN"

          # Set the notebook paths
          NOTEBOOK_INPUT="trainer-repo/examples/pytorch/image-classification/mnist.ipynb"
          NOTEBOOK_OUTPUT="artifacts/notebooks/sdk-pr-test-output.ipynb"

          echo "Executing notebook: $NOTEBOOK_INPUT"

          # Ensure the output directory exists
          mkdir -p "$(dirname "$NOTEBOOK_OUTPUT")"

          # Execute the notebook using Papermill
          "$PAPERMILL_BIN" "$NOTEBOOK_INPUT" "$NOTEBOOK_OUTPUT" \
            --kernel "sdk-test-kernel" \
            --log-output \
            --log-level INFO

          echo "Notebook test execution finished successfully."

      # Runs even when an earlier step failed so the executed notebook, if one
      # was produced, is still available for inspection.
      - name: Upload Executed Notebook Artifact
        uses: actions/upload-artifact@v5
        if: always()
        with:
          name: sdk-pr-notebook-result-${{ github.event.client_payload.sdk_sha }}
          path: artifacts/notebooks/sdk-pr-test-output.ipynb
          retention-days: 1

      # Best-effort teardown: `|| true` keeps a failed delete from changing
      # the job result.
      - name: Cleanup Kind Cluster
        if: always()
        run: |
          kind delete cluster --name kubeflow-test || true