Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion cvs/input/config_file/ibperf/ibperf_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
{
"install_perf_package": "True",
"install_dir": "/home/{user-id}/",
"rocm_dir": "/opt/rocm",
"rocm_dir": "<changeme>",
"_comment_rocm_dir": "ROCm installation path. Leave as <changeme> to auto-detect it from /opt/rocm/core-* (tried first) or /opt/rocm.",
"qp_count_list": [ "8", "16" ],
"ib_bw_test_list": [ "ib_write_bw", "ib_send_bw"],
"ib_lat_test_list": [ "ib_write_lat", "ib_send_lat", "ib_read_lat" ],
Expand Down
61 changes: 60 additions & 1 deletion cvs/lib/ibperf_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,58 @@
from cvs.lib.utils_lib import *


def detect_rocm_path(phdl, config_rocm_path):
    """
    Detect the ROCm installation path, supporting both old (/opt/rocm) and new
    (/opt/rocm/core-X.Y) layouts.

    A candidate path is accepted as soon as at least one node lists a
    ``libamdhip64.so*`` file under ``<path>/lib``. Candidates are tried in
    this order: the configured path (if any), the highest-versioned
    ``/opt/rocm/core-*`` directory reported by the nodes, then ``/opt/rocm``.

    Args:
        phdl: Parallel SSH handle. Its ``exec(cmd)`` is expected to return a
            dict mapping node -> command output.
        config_rocm_path (str): Configured ROCm path from config file
            (empty string or '<changeme>' for auto-detect)

    Returns:
        str: Detected ROCm path. Falls back to '/opt/rocm' (with a warning)
        when no candidate could be validated.
    """
    # Local import so this function does not depend on what the module's
    # wildcard import happens to expose.
    import shlex

    def _has_hip_runtime(path):
        """Return True if any node lists libamdhip64.so* under <path>/lib."""
        # Quote only the directory part; the trailing glob must stay unquoted
        # so the remote shell can still expand it.
        quoted = shlex.quote(path)
        out_dict = phdl.exec(
            f'test -d {quoted}/lib && ls {quoted}/lib/libamdhip64.so* 2>/dev/null | head -1'
        )
        # `output or ''` tolerates nodes that returned None instead of a string.
        return any('libamdhip64.so' in (output or '') for output in out_dict.values())

    configured = (config_rocm_path or '').strip()
    if configured and configured != '<changeme>':
        if _has_hip_runtime(configured):
            log.info(f'Using configured ROCm path: {configured} (validated)')
            return configured
        # Warn once, and only after every node failed validation.
        log.warning(f'Configured ROCm path {configured} does not contain required libraries, will auto-detect')

    log.info('Auto-detecting ROCm path...')

    # Try new ROCm 7.x structure first (/opt/rocm/core-X.Y)
    out_dict = phdl.exec('ls -d /opt/rocm/core-* 2>/dev/null | sort -V | tail -1')
    candidates = []
    for output in out_dict.values():
        # Pick the actual path line instead of the whole output, so stray
        # lines (banners, warnings) never end up inside the next command.
        for line in reversed((output or '').splitlines()):
            line = line.strip()
            if line.startswith('/opt/rocm/core-'):
                candidates.append(line)
                break

    # dict.fromkeys de-duplicates while keeping node order, so an identical
    # path reported by many nodes is validated only once.
    for rocm_path in dict.fromkeys(candidates):
        if _has_hip_runtime(rocm_path):
            log.info(f'Detected ROCm path (new layout): {rocm_path}')
            return rocm_path

    # Fall back to legacy /opt/rocm
    if _has_hip_runtime('/opt/rocm'):
        log.info('Detected ROCm path (legacy layout): /opt/rocm')
        return '/opt/rocm'

    log.warning('Could not detect ROCm path with required libraries, defaulting to /opt/rocm')
    return '/opt/rocm'


def get_ib_bw_pps(phdl, msg_size, cmd):
res_dict = {}

Expand Down Expand Up @@ -136,6 +188,7 @@ def run_ib_perf_bw_test(
qp_count=8,
port_no=1516,
duration=60,
rocm_path='',
):
app_port = port_no
result_dict = {}
Expand All @@ -144,6 +197,9 @@ def run_ib_perf_bw_test(
phdl.exec('sudo rm -rf /tmp/ib_cmds_file.txt')
phdl.exec('sudo rm -rf /tmp/ib_perf*')
phdl.exec('touch /tmp/ib_cmds_file.txt')
if rocm_path:
log.info(f'Setting LD_LIBRARY_PATH to {rocm_path}/lib for perftest binaries')
phdl.exec(f'echo "export LD_LIBRARY_PATH={rocm_path}/lib:$LD_LIBRARY_PATH" >> /tmp/ib_cmds_file.txt')
server_addr = None
for node in bck_nic_dict.keys():
result_dict[node] = {}
Expand Down Expand Up @@ -213,7 +269,7 @@ def run_ib_perf_bw_test(


def run_ib_perf_lat_test(
phdl, lat_test, gpu_numa_dict, gpu_nic_dict, bck_nic_dict, app_path, msg_size, gid_index, port_no=1516
phdl, lat_test, gpu_numa_dict, gpu_nic_dict, bck_nic_dict, app_path, msg_size, gid_index, port_no=1516, rocm_path=''
):
app_port = port_no
result_dict = {}
Expand All @@ -222,6 +278,9 @@ def run_ib_perf_lat_test(
phdl.exec('sudo rm -rf /tmp/ib_cmds_file.txt')
phdl.exec('sudo rm -rf /tmp/ib_perf*')
phdl.exec('touch /tmp/ib_cmds_file.txt')
if rocm_path:
log.info(f'Setting LD_LIBRARY_PATH to {rocm_path}/lib for perftest binaries')
phdl.exec(f'echo "export LD_LIBRARY_PATH={rocm_path}/lib:$LD_LIBRARY_PATH" >> /tmp/ib_cmds_file.txt')
server_addr = None
for node in bck_nic_dict.keys():
result_dict[node] = {}
Expand Down
47 changes: 46 additions & 1 deletion cvs/lib/megatron_training_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,49 @@
# Library for building Megatron training jobs ..


def detect_rocm_path(phdl, config_rocm_path):
    """
    Detect the ROCm installation path, supporting both old (/opt/rocm) and
    new (/opt/rocm/core-X.Y) layouts.

    A configured path is returned as-is without any remote check. Otherwise
    a candidate is accepted as soon as at least one node lists a
    ``libamdhip64.so*`` file under ``<path>/lib``; the highest-versioned
    ``/opt/rocm/core-*`` directory is tried before ``/opt/rocm``.

    Args:
        phdl: Parallel SSH handle. Its ``exec(cmd)`` is expected to return a
            dict mapping node -> command output.
        config_rocm_path (str): Configured ROCm path from config file
            (empty string or '<changeme>' for auto-detect).

    Returns:
        str: Detected ROCm path. Falls back to '/opt/rocm' (with a warning)
        when no candidate could be validated.
    """
    # The configured path is trusted verbatim; no command is run for it.
    if config_rocm_path and config_rocm_path != '<changeme>':
        log.info(f'Using configured ROCm path: {config_rocm_path}')
        return config_rocm_path

    def _has_hip_runtime(path):
        """Return True if any node lists libamdhip64.so* under <path>/lib."""
        out_dict = phdl.exec(f'test -d {path}/lib && ls {path}/lib/libamdhip64.so* 2>/dev/null | head -1')
        # `output or ''` tolerates nodes that returned None instead of a string.
        return any('libamdhip64.so' in (output or '') for output in out_dict.values())

    log.info('Auto-detecting ROCm path...')

    # Try new ROCm layout first (/opt/rocm/core-X.Y)
    out_dict = phdl.exec('ls -d /opt/rocm/core-* 2>/dev/null | sort -V | tail -1')
    candidates = []
    for output in out_dict.values():
        # Pick the actual path line instead of the whole output, so stray
        # lines (banners, warnings) never end up inside the next command.
        for line in reversed((output or '').splitlines()):
            line = line.strip()
            if line.startswith('/opt/rocm/core-'):
                candidates.append(line)
                break

    # dict.fromkeys de-duplicates while keeping node order, so an identical
    # path reported by many nodes is validated only once.
    for rocm_path in dict.fromkeys(candidates):
        if _has_hip_runtime(rocm_path):
            log.info(f'Detected ROCm path (new layout): {rocm_path}')
            return rocm_path

    # Fall back to legacy /opt/rocm
    if _has_hip_runtime('/opt/rocm'):
        log.info('Detected ROCm path (legacy layout): /opt/rocm')
        return '/opt/rocm'

    log.warning('Could not detect ROCm path, defaulting to /opt/rocm')
    return '/opt/rocm'


class MegatronLlamaTrainingJob:
"""
Orchestrates a Megatron-LM Llama training job across one or more nodes.
Expand Down Expand Up @@ -129,6 +172,7 @@ def __init__(
tdict.setdefault('log_dir', f'{self.home_dir}/LOGS')
tdict.setdefault('master_address', '127.0.0.1')
tdict.setdefault('verify_network_errors', 'False')
tdict.setdefault('rocm_dir', '')

self.container_image = tdict['container_image']
self.container_name = tdict['container_name']
Expand All @@ -146,6 +190,7 @@ def __init__(
self.log_dir = tdict['log_dir']
self.master_address = tdict['master_address']
self.verify_network_errors = tdict['verify_network_errors']
self.rocm_path = detect_rocm_path(self.phdl, tdict['rocm_dir'])

# Get the model parameters dict
print('^^^^')
Expand Down Expand Up @@ -275,7 +320,7 @@ def build_training_job_cmd(
+ f'export HF_TOKEN="{self.hf_token}"; '
+ f'export DATA_CACHE_PATH={self.data_cache_dir}; '
+ f'export TOKENIZER_MODEL={self.tokenizer_model}; '
+ 'export LD_LIBRARY_PATH=/usr/local/lib/:/opt/rocm/lib:$LD_LIBRARY_PATH; '
+ f'export LD_LIBRARY_PATH=/usr/local/lib/:{self.rocm_path}/lib:$LD_LIBRARY_PATH; '
+ f'export LOG_DIR={self.log_dir}; '
+ 'export EXP_NAME="megatron_training"; '
)
Expand Down
16 changes: 10 additions & 6 deletions cvs/lib/rccl_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -558,7 +558,9 @@ def rccl_cluster_test(

# Environment variables exported into the mpirun context
PATH = f'{MPI_PATH}/bin:{ROCM_PATH}/bin:$PATH'
LD_LIBRARY_PATH = f'{RCCL_PATH}:{MPI_PATH}/lib:{ROCM_PATH}/lib:$LD_LIBRARY_PATH'
LD_LIBRARY_PATH = (
f'{RCCL_PATH}:{MPI_PATH}/lib:{ROCM_PATH}/lib:{ROCM_PATH}/lib64:{ROCM_PATH}/hip/lib:$LD_LIBRARY_PATH'
)

print(f'%% VPC Node IPs {vpc_node_list}')

Expand Down Expand Up @@ -592,7 +594,7 @@ def rccl_cluster_test(
test_cmd = f'env && {RCCL_TESTS_INSTALL_DIR}/{test_name} -b {start_msg_size} -e {end_msg_size} -f {step_function} \
-g {threads_per_gpu} -c {check_iteration_count} -w {warmup_iterations} \
-d {data_type} -n {no_of_iterations} -N {no_of_cycles} \
-Z json -x {rccl_result_file}'
-Z json -X {rccl_result_file}'

if env_source_script and env_source_script.lower() != 'none':
test_cmd = f'bash -c "source {env_source_script} && {test_cmd}"'
Expand Down Expand Up @@ -784,7 +786,9 @@ def rccl_cluster_test_default(

# Environment variables exported into the mpirun context
PATH = f'{MPI_PATH}/bin:{ROCM_PATH}/bin:$PATH'
LD_LIBRARY_PATH = f'{RCCL_PATH}:{MPI_PATH}/lib:{ROCM_PATH}/lib:$LD_LIBRARY_PATH'
LD_LIBRARY_PATH = (
f'{RCCL_PATH}:{MPI_PATH}/lib:{ROCM_PATH}/lib:{ROCM_PATH}/lib64:{ROCM_PATH}/hip/lib:$LD_LIBRARY_PATH'
)

print(f'%% VPC Node IPs {vpc_node_list}')

Expand Down Expand Up @@ -827,7 +831,7 @@ def rccl_cluster_test_default(
test_cmd = (
f'env && {RCCL_TESTS_INSTALL_DIR}/{test_name} -b {start_msg_size} -e {end_msg_size} -f {step_function} \
-g {threads_per_gpu} -c {check_iteration_count} -w {warmup_iterations} \
-d {dtype} -n {no_of_iterations} -N {no_of_cycles} -Z json -x {dtype_result_file}'
-d {dtype} -n {no_of_iterations} -N {no_of_cycles} -Z json -X {dtype_result_file}'
)

if env_source_script and env_source_script.lower() != 'none':
Expand Down Expand Up @@ -1058,13 +1062,13 @@ def rccl_single_node_test(

# Environment variables exported into the mpirun context
PATH = f'{ROCM_PATH}/bin:$PATH'
LD_LIBRARY_PATH = f'{RCCL_PATH}:{ROCM_PATH}/lib:$LD_LIBRARY_PATH'
LD_LIBRARY_PATH = f'{RCCL_PATH}:{ROCM_PATH}/lib:{ROCM_PATH}/lib64:{ROCM_PATH}/hip/lib:$LD_LIBRARY_PATH'

# Build the test command
# Wrap test binary in shell to source env script if provided
test_cmd = f'env && {RCCL_TESTS_INSTALL_DIR}/{test_name} -b {start_msg_size} -e {end_msg_size} -f {step_function} \
-g {no_of_local_ranks} -c {check_iteration_count} -w {warmup_iterations} -n {no_of_iterations} -N {no_of_cycles} \
-Z json -x {rccl_result_file}'
-Z json -X {rccl_result_file}'

if env_source_script and env_source_script.lower() != 'none':
test_cmd = f'bash -c "source {env_source_script} && {test_cmd}"'
Expand Down
47 changes: 39 additions & 8 deletions cvs/tests/health/install/install_rvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,13 +277,15 @@ def test_install_rvs(phdl, shdl, config_dict):
rvs_found = True

# Check if RVS config files exist
# Check MI300X path first (same order as final verification) and suppress stderr
# so a missing fallback path's "No such file" does not contaminate the output.
out_dict = phdl.exec(
f'ls -l {config_dict["config_path_default"]}/gst_single.conf || ls -l {config_dict["config_path_mi300x"]}/gst_single.conf',
f'ls -l {config_dict["config_path_mi300x"]}/gst_single.conf 2>/dev/null || ls -l {config_dict["config_path_default"]}/gst_single.conf 2>/dev/null',
timeout=30,
)
config_found = False
for node in out_dict.keys():
if not re.search('No such file', out_dict[node], re.I):
if re.search(r'gst_single\.conf', out_dict[node], re.I):
log.info(f'RVS configuration files found on node {node}')
config_found = True

Expand All @@ -302,13 +304,44 @@ def test_install_rvs(phdl, shdl, config_dict):

for node in out_dict.keys():
if re.search(
'Unable to locate package|Package.*not found|E: Could not get lock|dpkg: error', out_dict[node], re.I
'Unable to locate package|Package.*not found|E: Could not get lock|dpkg: error'
'|has no installation candidate|unmet dependencies|not available',
out_dict[node],
re.I,
):
log.info(f'RVS package installation failed on node {node}, will try building from source')
else:
log.info(f'RVS package installation successful on node {node}')
package_installed = True

# After apt-get install, verify the binary actually exists and locate it.
# The rocm-validation-suite package may install to /opt/rocm even when the
# detected rocm_path is /opt/rocm/core-X.Y (new layout), causing path mismatches.
if package_installed:
verify_bin = phdl.exec(
f'which rvs 2>/dev/null || ls {rocm_path}/bin/rvs 2>/dev/null || ls /opt/rocm/bin/rvs 2>/dev/null',
timeout=60,
)
rvs_bin_found = False
for node, output in verify_bin.items():
stripped = output.strip()
if stripped and 'rvs' in stripped and not re.search('No such file', stripped, re.I):
rvs_bin_found = True
# If the binary landed at /opt/rocm instead of rocm_path, realign all paths
if '/opt/rocm/bin/rvs' in stripped and not stripped.startswith(rocm_path):
actual_rocm = '/opt/rocm'
log.info(
f'RVS installed to {actual_rocm}/bin/rvs; updating rocm_path from {rocm_path} to {actual_rocm}'
)
for key in ('path', 'config_path_mi300x', 'config_path_default'):
if key in config_dict:
config_dict[key] = config_dict[key].replace(rocm_path, actual_rocm)
rocm_path = actual_rocm
break
if not rvs_bin_found:
log.info('RVS binary not found after package install, falling back to source build')
package_installed = False

# If package installation failed, build from source
if not package_installed:
log.info('Installing RVS from source')
Expand All @@ -329,20 +362,18 @@ def test_install_rvs(phdl, shdl, config_dict):
f'cd {git_install_path}/ROCmValidationSuite; cmake -B ./build -DROCM_PATH={rocm_path} -DCMAKE_INSTALL_PREFIX={rocm_path} -DCPACK_PACKAGING_INSTALL_PREFIX={rocm_path} -DHIP_PLATFORM=amd',
timeout=1200,
)
out_dict = hdl.exec(
f'cd {git_install_path}/ROCmValidationSuite/build; make -C -j$(nproc)', timeout=1200
)
out_dict = hdl.exec(f'cd {git_install_path}/ROCmValidationSuite/build; make -j$(nproc)', timeout=1200)

out_dict = hdl.exec(
f'cd {git_install_path}/ROCmValidationSuite/build; make -j$(nproc) package', timeout=1200
)
out_dict = hdl.exec(
f'cd {git_install_path}/ROCmValidationSuite/build; sudo make install',
f'cd {git_install_path}/ROCmValidationSuite/build; sudo make install; echo "RVS_INSTALL_STATUS:$?"',
timeout=1200,
)

for node in out_dict.keys():
if re.search('Error|FAILED|No such file', out_dict[node], re.I):
if not re.search(r'RVS_INSTALL_STATUS:0', out_dict[node]):
fail_test(f'RVS build/installation failed on node {node}')

except Exception as e:
Expand Down
Loading
Loading