Skip to content

Commit a18fe81

Browse files
authored
Adding workflow for kubeflow sdk e2e tests on pr (#1)
1 parent 93ef635 commit a18fe81

File tree

1 file changed

+163
-0
lines changed

1 file changed

+163
-0
lines changed
Lines changed: 163 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,163 @@
---
# Runs the Kubeflow SDK end-to-end notebook test against a PR's code.
# Triggered cross-repo via repository_dispatch with event type
# `run-sdk-pr-e2e-tests`; the dispatching workflow supplies
# client_payload.sdk_repo and client_payload.sdk_sha.
name: SDK PR Notebook E2E Runner

on:
  repository_dispatch:
    types: [run-sdk-pr-e2e-tests]

jobs:
  run-notebook-test:
    runs-on: kubeflow-devx-testing
    permissions:
      contents: write

    steps:
      - name: Checkout SDK PR Code
        uses: actions/checkout@v4
        with:
          repository: ${{ github.event.client_payload.sdk_repo }}
          ref: ${{ github.event.client_payload.sdk_sha }}
          path: sdk-repo  # Checkout the SDK code into a subdirectory

      - name: Checkout Trainer Repo (Test Asset)
        uses: actions/checkout@v4
        with:
          repository: opendatahub-io/trainer
          ref: main
          path: trainer-repo  # Checkout the Trainer code into a subdirectory

      # Kubernetes Cluster Setup
      # This step may not be needed with the larger runner
      - name: Free up disk space
        run: |
          echo "Available disk space before cleanup:"
          df -h

          # Remove unnecessary packages and files to free up space
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /opt/ghc
          sudo rm -rf /opt/hostedtoolcache/CodeQL

          # Clean up Docker
          docker system prune -af

          echo "Available disk space after cleanup:"
          df -h

      - name: Set up Kind Kubernetes Cluster
        uses: helm/kind-action@v1
        with:
          cluster_name: kubeflow-test
          wait: 300s

      - name: Verify Kind Cluster
        run: |
          kubectl cluster-info
          kubectl get nodes
          kubectl get pods -A

      - name: Install Kubeflow Trainer
        run: |
          echo "Installing Kubeflow Trainer..."
          # Install the Trainer manager using kubectl with server-side apply to handle large CRDs
          # NOTE(review): these overlays pin ?ref=master while the trainer-repo checkout
          # above uses ref: main — confirm which branch is canonical and align the two.
          kubectl apply --server-side -k "github.com/opendatahub-io/trainer/manifests/overlays/manager?ref=master"

          # Wait for the trainer deployment to be ready
          echo "Waiting for Trainer controller to be ready..."
          kubectl wait --for=condition=available --timeout=300s \
            deployment/kubeflow-trainer-controller-manager -n kubeflow-system

          # Wait a bit more for webhooks to be fully ready
          echo "Waiting for webhooks to be ready..."
          sleep 30

          # Install the default training runtimes
          echo "Installing Trainer runtimes..."
          kubectl apply --server-side -k "github.com/opendatahub-io/trainer/manifests/overlays/runtimes?ref=master"

          # Verify installation
          echo "Verifying Trainer installation..."
          kubectl get deployment -n kubeflow-system
          kubectl get pods -n kubeflow-system

          # Verify runtimes are installed
          echo "Verifying ClusterTrainingRuntimes..."
          kubectl get clustertrainingruntimes

      # Test Environment Setup
      - name: Set up Python 3.9
        uses: actions/setup-python@v6
        with:
          python-version: '3.9'

      - name: Setup Environment and Install Dependencies
        shell: bash
        run: |
          echo "Installing tools and dependencies..."

          # 1. Install Papermill and core execution tools
          pip install papermill==2.6.0 jupyter==1.1.1 ipykernel==6.29.5

          # 2. Install main dependencies (Manually listed from pyproject.toml [project.dependencies])
          echo "Installing main SDK dependencies..."
          # Each specifier is quoted: an unquoted '>=' is parsed by the shell as an
          # output redirection, silently dropping the version constraint and
          # creating a junk file named '=<version>' in the workspace.
          pip install "kubernetes>=27.2.0" "pydantic>=2.10.0" "kubeflow-trainer-api>=2.0.0" "kubeflow-katib-api>=0.19.0"

          # 3. Install the SDK Code itself in editable mode
          cd sdk-repo
          echo "Installing SDK PR code in editable mode..."
          # '.[dev]' is quoted so the shell never glob-expands the brackets.
          pip install -e ".[dev]"

          # 4. Configure Notebook Kernel
          PYTHON_BIN=$(which python)
          $PYTHON_BIN -m ipykernel install --user --name=sdk-test-kernel --display-name "Python (SDK Test)"

          cd ..
          mkdir -p artifacts/notebooks  # Create artifact directory

      - name: Configure Kubernetes Access
        run: |
          # Ensure kubeconfig is accessible
          mkdir -p ~/.kube
          kind get kubeconfig --name kubeflow-test > ~/.kube/config
          chmod 600 ~/.kube/config

          # Verify access
          kubectl get nodes
          kubectl get namespaces

      - name: Run E2E Notebook Test with Papermill
        id: run-test
        run: |
          # Use the python executable where Papermill was installed
          PAPERMILL_BIN=$(which papermill)
          echo "Using Papermill from: $PAPERMILL_BIN"

          # Set the notebook paths
          NOTEBOOK_INPUT="trainer-repo/examples/pytorch/image-classification/mnist.ipynb"
          NOTEBOOK_OUTPUT="artifacts/notebooks/sdk-pr-test-output.ipynb"

          echo "Executing notebook: $NOTEBOOK_INPUT"

          # Ensure the output directory exists
          mkdir -p "$(dirname "$NOTEBOOK_OUTPUT")"

          # Execute the notebook using Papermill
          $PAPERMILL_BIN "$NOTEBOOK_INPUT" "$NOTEBOOK_OUTPUT" \
            --kernel "sdk-test-kernel" \
            --log-output \
            --log-level INFO

          echo "Notebook test execution finished successfully."

      - name: Upload Executed Notebook Artifact
        uses: actions/upload-artifact@v5
        if: always()
        with:
          name: sdk-pr-notebook-result-${{ github.event.client_payload.sdk_sha }}
          path: artifacts/notebooks/sdk-pr-test-output.ipynb
          retention-days: 1

      - name: Cleanup Kind Cluster
        if: always()
        run: |
          kind delete cluster --name kubeflow-test || true

0 commit comments

Comments
 (0)