#!/bin/bash
#
# versions.sh - print the versions of the software stack found on this host:
# OS release, kernel, NVIDIA driver / CUDA / NCCL / aws-ofi-nccl / GDS,
# Lustre client, EFA and libfabric.
#
# Usage: ./versions.sh
#
# This is a best-effort diagnostic. A missing component is reported as
# "not found" / "not present" and never aborts the run, which is why
# `set -e` is deliberately not enabled.

# Succeeds when command $1 is available on PATH; prints nothing.
have() {
  command -v "$1" >/dev/null 2>&1
}

# Prints $1, or the literal "not found" when $1 is empty.
or_not_found() {
  printf '%s\n' "${1:-not found}"
}

# Prints the last whitespace-separated word of file $1.
# Prints nothing when the file is empty, blank or unreadable.
# Unlike `arr=($(cat file)); ${arr[-1]}` this neither glob-expands the file
# contents nor fails with "bad array subscript" on an empty file.
last_word_of_file() {
  awk 'NF { word = $NF } END { if (word != "") print word }' "$1" 2>/dev/null
}

# Prints "X.Y.Z" (one per line) for every libnccl.so.X.Y.Z in directory $1.
nccl_versions_in_dir() {
  local dir=$1
  local lib
  for lib in "$dir"/libnccl.so.*.*.*; do
    [[ -e "$lib" ]] || continue # an unmatched glob stays literal
    printf '%s\n' "${lib##*/libnccl.so.}"
  done
}

# Prints "<link>-><target>" for every symlink named cuda* directly under $1
# (default /usr/local). The target is shown as stored in the link.
cuda_symlinks() {
  local root=${1:-/usr/local}
  local entry
  for entry in "$root"/cuda*; do
    [[ -L "$entry" ]] || continue
    printf '%s->%s\n' "$entry" "$(readlink "$entry")"
  done
}

# Prints the base name of every real (non-symlink) cuda* directory directly
# under $1 (default /usr/local).
cuda_install_names() {
  local root=${1:-/usr/local}
  local entry
  for entry in "$root"/cuda*; do
    [[ -d "$entry" && ! -L "$entry" ]] || continue
    printf '%s\n' "${entry##*/}"
  done
}

# Prints the first four lines of /etc/os-release, when that file exists.
print_os_info() {
  if [[ -f /etc/os-release ]]; then
    echo ""
    echo "OS info:"
    head -n 4 /etc/os-release
  fi
}

# Prints the kernel name and kernel release as reported by uname.
print_kernel_info() {
  echo ""
  echo "Linux family:"
  uname

  echo ""
  echo "Linux Kernel version:"
  uname -r
}

# Prints driver, CUDA, NCCL, aws-ofi-nccl and GDS information, or
# "not present" when nvidia-smi is not on PATH.
print_nvidia_info() {
  local driver cuda_supported cuda_current=""
  local -a links=() names=() versions=()
  local link name version
  local ofi_lib=/opt/aws-ofi-nccl/lib/libnccl-net.so
  local ofi_version=""
  local gdscheck=/usr/local/cuda/gds/tools/gdscheck

  echo ""
  echo "nvidia-smi:"
  # `command -v` prints the resolved path, which is the body of this section.
  if ! command -v nvidia-smi; then
    echo "not present"
    return 0
  fi

  echo ""
  echo "NVIDIA versions:"
  driver=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0 \
    | grep '[0-9]')
  cuda_supported=$(nvidia-smi --version | grep CUDA | cut -d ':' -f 2 | xargs)
  if have nvcc; then
    cuda_current=$(nvcc --version \
      | sed -n 's/^.*release \([0-9][0-9]*\.[0-9][0-9]*\).*$/\1/p')
  fi
  echo "Driver version : $(or_not_found "$driver")"
  echo "CUDA version support : $(or_not_found "$cuda_supported")"
  echo "CUDA version : $(or_not_found "$cuda_current")"

  # One line per cuda* symlink, so every link carries its own label.
  mapfile -t links < <(cuda_symlinks /usr/local)
  if ((${#links[@]} == 0)); then
    echo "CUDA default path : not found"
  else
    for link in "${links[@]}"; do
      echo "CUDA default path : $link"
    done
  fi

  # NCCL is looked up in the lib/ directory of each installed CUDA toolkit.
  mapfile -t names < <(cuda_install_names /usr/local)
  for name in "${names[@]}"; do
    mapfile -t versions < <(nccl_versions_in_dir "/usr/local/$name/lib")
    if ((${#versions[@]} == 0)); then
      echo "NCCL version (for $name) : not found"
      continue
    fi
    for version in "${versions[@]}"; do
      echo "NCCL version (for $name) : $version"
    done
  done

  # The version is taken as the 4th word of the first string in the plugin
  # that contains "Initializing aws-ofi-nccl".
  if [[ -f "$ofi_lib" ]] && have strings; then
    ofi_version=$(strings "$ofi_lib" \
      | awk '/Initializing aws-ofi-nccl/ && !seen { print $4; seen = 1 }')
  fi
  echo "AWS OFI NCCL version : $(or_not_found "$ofi_version")"

  if [[ -f "$gdscheck" ]]; then
    echo ""
    echo "NVIDIA GDS:"
    "$gdscheck" -v
  fi
}

# Prints the lustre-client package version(s) listed by yum or apt and the
# version reported by `modinfo lustre`.
print_lustre_info() {
  local installed="" loaded=""

  echo ""
  if have yum; then
    installed=$(yum list lustre-client 2>/dev/null \
      | awk '/lustre-client/ { print $2 }')
  elif have apt; then
    # stderr is dropped: apt warns about its unstable CLI on every call.
    installed=$(apt list lustre-client 2>/dev/null \
      | grep lustre-client | cut -d ' ' -f 2)
  fi
  if have modinfo; then
    loaded=$(modinfo lustre 2>/dev/null \
      | awk '$1 == "version:" && !seen { print $2; seen = 1 }')
  fi

  echo "Lustre client versions installed: "
  or_not_found "$installed"
  echo "Lustre client version loaded : $(or_not_found "$loaded")"
}

# Prints the EFA version (last word of /opt/amazon/efa_installed_packages)
# and the libfabric version reported by the EFA-provided fi_info.
print_efa_info() {
  local packages=/opt/amazon/efa_installed_packages
  local fi_info=/opt/amazon/efa/bin/fi_info
  local libfabric=""

  echo ""
  if [[ ! -f "$packages" ]]; then
    echo "EFA Installer not found"
    return 0
  fi

  echo "EFA version : $(or_not_found "$(last_word_of_file "$packages")")"

  if [[ -x "$fi_info" ]]; then
    libfabric=$("$fi_info" --version | grep libfabric: | cut -d ' ' -f 2)
  fi
  echo "Libfabric version : $(or_not_found "$libfabric")"
}

# Entry point: prints every section in a fixed order.
main() {
  print_os_info
  print_kernel_info
  print_nvidia_info
  print_lustre_info
  print_efa_info
  echo ""
}

main "$@"