---
# Elastic PyTorchJob running llama-recipes FSDP fine-tuning over EFA.
# Rendezvous is handled by an external etcd service (host/port below).
apiVersion: "kubeflow.org/v1"
kind: PyTorchJob
metadata:
  name: fsdp
spec:
  elasticPolicy:
    rdzvBackend: etcd
    rdzvHost: etcd
    rdzvPort: 2379
    minReplicas: 1
    maxReplicas: 96
    maxRestarts: 100
    # metrics:
    #   - type: Resource
    #     resource:
    #       name: cpu
    #       target:
    #         type: Utilization
    #         averageUtilization: 80
  pytorchReplicaSpecs:
    Worker:
      replicas: 2
      restartPolicy: OnFailure
      template:
        metadata:
          labels:
            app: fsdp
            nvidia-devtools-sidecar-injector: enabled
        spec:
          volumes:
            # Expose the host's /dev/shm so NCCL/PyTorch dataloaders get a
            # full-size shared-memory segment (the default container shm is 64MB).
            - name: shmem
              # emptyDir:
              #   medium: Memory
              hostPath:
                path: /dev/shm
          # nodeSelector:
          #   node.kubernetes.io/instance-type: "p5.48xlarge"
          containers:
            - name: pytorch
              image: 159553542841.dkr.ecr.us-west-2.amazonaws.com/fsdp:llama2-efa-main-02-13
              imagePullPolicy: Always
              resources:
                requests:
                  # Was empty (parses as null — an invalid quantity Kubernetes
                  # rejects). 8 matches --nproc_per_node=8 in the command below.
                  nvidia.com/gpu: 8
                  vpc.amazonaws.com/efa: 4
                limits:
                  nvidia.com/gpu: 8
                  vpc.amazonaws.com/efa: 4
              env:
                # for P5 FI_* should be commented out
                # - name: LOGLEVEL
                #   value: "DEBUG"
                - name: FI_PROVIDER
                  value: efa
                - name: FI_EFA_USE_DEVICE_RDMA
                  value: "1"
                - name: FI_EFA_FORK_SAFE
                  value: "1"
                - name: FI_LOG_LEVEL
                  value: "1"
                - name: FI_EFA_ENABLE_SHM_TRANSFER
                  value: "1"
                # - name: NCCL_DEBUG
                #   value: "INFO"
                - name: NCCL_ASYNC_ERROR_HANDLING
                  value: "1"
                # - name: NCCL_IGNORE_DISABLED_P2P
                #   value: "1"
                # SECURITY: a live Hugging Face token was committed here in
                # plaintext — it must be revoked. Pull the token from a Secret:
                #   kubectl create secret generic hf-token --from-literal=token=<new-token>
                - name: HF_TOKEN
                  valueFrom:
                    secretKeyRef:
                      name: hf-token
                      key: token
              command:
                - bash
                - -c
                - "torchrun --nproc_per_node=8 --nnodes=2 examples/finetuning.py --num_epochs=1 --batch_size_training=3 --enable_fsdp --pure_bf16 --model_name meta-llama/Llama-2-7b-hf --output_dir ."
              volumeMounts:
                - name: shmem
                  mountPath: /dev/shm
# --- Separate file: Dockerfile.llama2-efa (terminal-prompt paste residue removed; split into its own file) ---
# Builds the FSDP/llama-recipes training image with EFA + AWS-OFI-NCCL support.
FROM nvidia/cuda:12.2.2-devel-ubuntu22.04

ARG EFA_INSTALLER_VERSION=1.29.1
ARG AWS_OFI_NCCL_VERSION=v1.7.3-aws
ARG NCCL_TESTS_VERSION=master
ARG NCCL_VERSION=2.18.5

# Remove distro IB/NCCL packages; the EFA installer and source builds below
# provide their own. `update` is merged into the same layer as the command that
# consumes it so a cached stale package index is never reused.
RUN apt-get update -y \
 && apt-get remove -y --allow-change-held-packages \
        libmlx5-1 ibverbs-utils libibverbs-dev libibverbs1 libnccl2 libnccl-dev

# Strip the HPC-X MPI stack shipped in the CUDA base image; Amazon OpenMPI
# (installed by the EFA installer) is used instead.
RUN rm -rf /opt/hpcx \
 && rm -rf /usr/local/mpi \
 && rm -f /etc/ld.so.conf.d/hpcx.conf \
 && ldconfig
ENV OPAL_PREFIX=

RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
        git \
        gcc \
        vim \
        kmod \
        openssh-client \
        openssh-server \
        build-essential \
        curl \
        autoconf \
        libtool \
        gdb \
        automake \
        python3-distutils \
        cmake \
        apt-utils \
        devscripts \
        debhelper \
        libsubunit-dev \
        check \
        pkg-config

# Passwordless, host-key-free SSH between worker pods (required by MPI launchers).
RUN mkdir -p /var/run/sshd
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
    echo "    UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
    sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config

# ENV key=value is the documented form; the space-separated form is deprecated.
ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/lib:$LD_LIBRARY_PATH
ENV PATH=/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH

RUN curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \
 && python3 /tmp/get-pip.py \
 && pip3 install awscli pynvml

#################################################
## Install NVIDIA GDRCopy
#RUN git clone https://github.com/NVIDIA/gdrcopy.git /opt/gdrcopy \
#    && cd /opt/gdrcopy \
#    && make lib_install install \
#    && cd /opt/gdrcopy/tests \
#    && make \
#    && mv copylat copybw sanity apiperf /usr/bin/

#################################################
## Install EFA installer (userspace only: kmod and limits config skipped,
## as those are managed on the host/node, not in the container)
RUN cd $HOME \
 && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
 && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
 && cd aws-efa-installer \
 && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
 && rm -rf $HOME/aws-efa-installer

###################################################
## Install NCCL from source for A100 (sm_80/86) and H100 (sm_90)
RUN git clone https://github.com/NVIDIA/nccl -b v${NCCL_VERSION}-1 /opt/nccl \
 && cd /opt/nccl \
 && make -j $(nproc) src.build CUDA_HOME=/usr/local/cuda \
    NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90"

###################################################
## Install AWS-OFI-NCCL plugin (bridges NCCL to libfabric/EFA)
RUN apt-get install libtool autoconf cmake nasm unzip pigz parallel nfs-common build-essential hwloc libhwloc-dev libjemalloc2 libnuma-dev numactl libjemalloc-dev preload htop iftop liblapack-dev libgfortran5 ipcalc wget curl devscripts debhelper check libsubunit-dev fakeroot pkg-config dkms -y
RUN export OPAL_PREFIX="" \
 && git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl \
 && cd /opt/aws-ofi-nccl \
 && git checkout ${AWS_OFI_NCCL_VERSION} \
 && ./autogen.sh \
 && ./configure --prefix=/opt/aws-ofi-nccl/install \
        --with-libfabric=/opt/amazon/efa/ \
        --with-cuda=/usr/local/cuda \
        --with-nccl=/opt/nccl/build \
        --with-mpi=/opt/amazon/openmpi/ \
 && make -j $(nproc) && make install

###################################################
## Install fsdp (llama-recipes)

RUN mkdir -p /workspace/

WORKDIR /workspace

#RUN git clone -b flop_counter https://github.com/facebookresearch/llama-recipes.git
#RUN git clone -b flop_counter_gc https://github.com/facebookresearch/llama-recipes.git
# NOTE(review): unpinned clone of the default branch — builds are not
# reproducible; consider pinning a commit SHA.
RUN git clone https://github.com/facebookresearch/llama-recipes.git

WORKDIR /workspace/llama-recipes

# One layer, preserving the original install order: tooling upgrade first,
# version pins before requirements.txt so they win, then the editable package
# and runtime extras (python-etcd backs the elastic etcd rendezvous).
RUN pip3 install -U pip setuptools \
 && pip3 install fsspec==2023.1.0 \
 && pip3 install huggingface_hub==0.17.0 \
 && pip3 install -r requirements.txt \
 && pip3 install -e . \
 && pip3 install tabulate protobuf python-etcd

#RUN pip3 uninstall -y torch
#RUN pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121

ENV PYTHONPATH="${PYTHONPATH}:/workspace/llama-recipes/src"
# (GitHub page paste residue removed: "0 commit comments")