From e9e220c53270885d892df82d5c9943b9949055ba Mon Sep 17 00:00:00 2001 From: Aayush Joglekar Date: Tue, 12 May 2026 16:48:44 +0300 Subject: [PATCH] Add gpu tag for ROCm-LLVM and refactor method Improve search for ROCm-LLVM dep and add tests Fix tuple index access Apply suggestions from code review Co-authored-by: ocaisa Co-authored-by: Caspar van Leeuwen <33718780+casparvl@users.noreply.github.com> Get the error in logs Fix numbered indexing to keys Normalise return object for direct dependency of ROCm-LLVM Add more logs Add logs for rompi dry run Simplify check for rocm-compilers based toolchains add logs again, failed the test Add ignore cleanup flag add error log run Take out rompi test until it's available in EESSI --- .github/workflows/test-eb-hooks.yml | 52 ++++++++++++++ eb_hooks.py | 103 +++++++++++++++++++++------- 2 files changed, 132 insertions(+), 23 deletions(-) diff --git a/.github/workflows/test-eb-hooks.yml b/.github/workflows/test-eb-hooks.yml index 416cd99c..dfcc576c 100644 --- a/.github/workflows/test-eb-hooks.yml +++ b/.github/workflows/test-eb-hooks.yml @@ -116,3 +116,55 @@ jobs: eb --hooks=$PWD/eb_hooks.py "$INCOMPATIBLE_EASYCONFIG" --stop fetch 2>&1 1>/dev/null | grep -q "does not contain a valid list of dictionaries" echo "Incorrect format for EESSI_SITE_TOP_LEVEL_TOOLCHAINS caught" + check_inject_gpu_property: + runs-on: ubuntu-24.04 + strategy: + matrix: + # ROCm-LLVM / rocm-compilers / rompi only exist in the 2025.06 stack + EESSI_VERSION: + - '2025.06' + + steps: + - name: Check out software-layer repository + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + + - name: Mount EESSI CernVM-FS repository + uses: eessi/github-action-eessi@v3 + with: + eessi_stack_version: ${{matrix.EESSI_VERSION}} + use_eessi_module: true + + - name: Test that inject_gpu_property tags GPU software correctly + if: ${{ github.event_name == 'pull_request' }} + run: | + module load EESSI-extend/${{matrix.EESSI_VERSION}}-easybuild + + 
echo "Running inject_gpu_property tests for EESSI/${{matrix.EESSI_VERSION}}..." + + # Control: a non-GPU easyconfig must not receive the gpu property + eb --hooks=$PWD/eb_hooks.py --extended-dry-run M4-1.4.19-GCCcore-14.2.0.eb 2>&1 | grep -qiF 'add_property("arch","gpu")' && exit 1 + echo "Non-GPU easyconfig untouched" + + # ROCm-LLVM as a direct dependency (rocm-compilers bundle) + echo "Testing ROCm-LLVM as a direct dependency" + OUT=$(eb --hooks=$PWD/eb_hooks.py --extended-dry-run rocm-compilers-19.0.0-ROCm-6.4.1.eb 2>&1) + echo "$OUT" | grep -qiF 'add_property("arch","gpu")' + echo "$OUT" | grep -qiF 'setenv("EESSIROCMVERSION","6.4.1")' + echo "Direct ROCm-LLVM dependency detected" + + # rocm-compilers as the toolchain + echo "Testing ROCm-LLVM in a rocm-compilers toolchain" + OUT=$(eb --hooks=$PWD/eb_hooks.py --extended-dry-run rocBLAS-4.4.0-rocm-compilers-19.0.0-ROCm-6.4.1.eb 2>&1) + echo "$OUT" | grep -qiF 'add_property("arch","gpu")' + echo "$OUT" | grep -qiF 'setenv("EESSIROCMVERSION","6.4.1")' + echo "rocm-compilers toolchain handled" + + # TODO: rompi test to be brought back once rompi/2025a is available in EESSI + + # # rompi as the toolchain (ROCm-LLVM nested inside rocm-compilers bundle) + # echo "Testing ROCm-LLVM in a rompi toolchain (rocm-compilers bundle)" + # # added --disable-cleanup-tmpdir to ignore irrelevant EasyBuild cleanup error + # OUT=$(eb --hooks=$PWD/eb_hooks.py --extended-dry-run OSU-Micro-Benchmarks-7.5-rompi-2025a.eb 2>&1) + # echo "$OUT" | grep -qiF 'add_property("arch","gpu")' + # echo "$OUT" | grep -qiF 'setenv("EESSIROCMVERSION","6.4.1")' + # echo "rompi toolchain handled" diff --git a/eb_hooks.py b/eb_hooks.py index 9ba727ad..4c41227d 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -11,7 +11,9 @@ import easybuild.tools.environment as env from easybuild.easyblocks.generic.configuremake import obtain_config_guess from easybuild.framework.easyconfig.constants import EASYCONFIG_CONSTANTS -from 
easybuild.framework.easyconfig.easyconfig import get_toolchain_hierarchy +from easybuild.framework.easyconfig.easyconfig import ( + get_toolchain_hierarchy, +) from easybuild.tools import config from easybuild.tools.build_log import EasyBuildError, print_msg, print_warning from easybuild.tools.config import build_option, install_path, update_build_option @@ -2014,38 +2016,93 @@ def replace_binary_non_distributable_files_with_symlinks(log, install_dir, pkg_n symlink(host_inj_path, full_path) +def find_rocm_llvm_dependency(ec): + """ + Return the ROCm-LLVM dependency for this easyconfig, or None. ROCm-LLVM can + be a direct dependency, a direct toolchain component (rocm-compilers as the + toolchain), or one level deeper inside the rocm-compilers bundle when the + toolchain is rompi/rfbf/rfoss. + """ + # Check if ROCm-LLVM is a direct dependency, and if so, return that + # + for dep in ec.asdict()['dependencies']: + # dep is a tuple (name, version, versionsuffix, toolchain); normalise it to the + # same dict format as ec.toolchain.tcdeps entries before returning + if dep[0] == 'ROCm-LLVM': + return { + 'name': dep[0], + 'version': dep[1], + 'versionsuffix': dep[2] if len(dep) > 2 else '', + 'toolchain': dep[3] if len(dep) > 3 else None, + } + + # ROCm-LLVM can also be part of the toolchain. First, return early if this is NOT a ROCm-based toolchain + if ec['toolchain']['name'] not in ('rocm-compilers', 'rompi', 'rfbf', 'rfoss'): + return None + + tcdeps = ec.toolchain.tcdeps or [] + # Check if ROCm-LLVM is a direct dependency for this toolchain (which would be the case for rocm-compilers) + for dep in tcdeps: + if dep['name'] == 'ROCm-LLVM': + return dep + # For rompi, rfbf, rfoss, ROCm-LLVM is pulled in indirectly via rocm-compilers. The rocm-compilers + # toolchain dependency already encodes the ROCm version in its version string (e.g. 
'19.0.0-ROCm-6.4.1') + rocm_prefix = '-ROCm-' + for dep in tcdeps: + if dep['name'] == 'rocm-compilers': + full_version = dep['version'] + dep.get('versionsuffix', '') + if rocm_prefix in full_version: + version, rocm_version = full_version.split(rocm_prefix, 1) + return { + 'name': dep['name'], + 'version': version, + 'versionsuffix': rocm_prefix + rocm_version, + } + + return None + + def inject_gpu_property(ec): """ Add 'gpu' property and EESSIVERSION envvars via modluafooter easyconfig parameter, and drop dependencies to build dependencies """ ec_dict = ec.asdict() - # Check if CUDA, cuDNN, you-name-it is in the dependencies, if so - # - drop dependency to build dependency - # - add 'gpu' Lmod property - # - add envvar with package version - pkg_names = ( "CUDA", "cuDNN" ) pkg_versions = { } add_gpu_property = '' - for pkg_name in pkg_names: - # Check if pkg_name is in the dependencies, if so drop dependency to build - # dependency and set variable for later adding the 'gpu' Lmod property - # to '.remove' dependencies from ec_dict['dependencies'] we make a copy, - # iterate over the copy and can then savely use '.remove' on the original - # ec_dict['dependencies']. - deps = ec_dict['dependencies'][:] - if (pkg_name in [dep[0] for dep in deps]): + # For each CUDA-related dependency (CUDA, cuDNN): demote it to a build + # dependency and record that the 'gpu' Lmod property must be added later. + # To '.remove' dependencies from ec_dict['dependencies'] we make a copy, + # iterate over the copy and can then safely use '.remove' on the original + # ec_dict['dependencies']. 
+ for pkg_name in ('CUDA', 'cuDNN'): + for dep in ec_dict['dependencies'][:]: + if dep[0] != pkg_name: + continue + add_gpu_property = 'add_property("arch","gpu")' - for dep in deps: - if pkg_name == dep[0]: - # make pkg_name a build dependency only (rpathing saves us from link errors) - ec.log.info("Dropping dependency on %s to build dependency" % pkg_name) - ec_dict['dependencies'].remove(dep) - if dep not in ec_dict['builddependencies']: - ec_dict['builddependencies'].append(dep) - # take note of version for creating the modluafooter - pkg_versions[pkg_name] = dep[1] + pkg_versions[pkg_name] = dep[1] + + ec.log.info("Dropping dependency on %s to build dependency" % pkg_name) + ec_dict['dependencies'].remove(dep) + if dep not in ec_dict['builddependencies']: + ec_dict['builddependencies'].append(dep) + + # ROCm-LLVM is handled separately: it is redistributable (kept as a runtime dep) + # and may be pulled in via a ROCm toolchain rather than as a direct dependency. + rocm_llvm_dep = find_rocm_llvm_dependency(ec) + if rocm_llvm_dep is not None: + add_gpu_property = 'add_property("arch","gpu")' + versionsuffix = rocm_llvm_dep['versionsuffix'] + rocm_prefix = "-ROCm-" + + if versionsuffix.startswith(rocm_prefix): + rocm_version = versionsuffix[len(rocm_prefix):] + else: + raise EasyBuildError(f"Invalid format for ROCm versionssuffix: {versionsuffix}") + pkg_versions['ROCm'] = rocm_version + if add_gpu_property: ec.log.info("Injecting gpu as Lmod arch property and envvars for dependencies with their version") modluafooter = 'modluafooter'