Skip to content

Commit e42f5a9

Browse files
infra: [TRTLLM-5879] Split single GPU test and multi GPU test into 2 pipelines (#5199)
Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com> Signed-off-by: Zhanrui Sun <184402041+ZhanruiSunCh@users.noreply.github.com> Co-authored-by: Yanchao Lu <yanchaol@nvidia.com>
1 parent fc2347e commit e42f5a9

File tree

2 files changed

+132
-128
lines changed

2 files changed

+132
-128
lines changed

jenkins/L0_MergeRequest.groovy

Lines changed: 106 additions & 118 deletions
Original file line numberDiff line numberDiff line change
@@ -878,6 +878,45 @@ def triggerJob(jobName, parameters, jenkinsUrl = "", credentials = "")
878878
return status
879879
}
880880

881+
def launchJob(jobName, reuseBuild, enableFailFast, globalVars, platform="x86_64", additionalParameters = [:]) {
882+
def parameters = getCommonParameters()
883+
String globalVarsJson = writeJSON returnText: true, json: globalVars
884+
parameters += [
885+
'enableFailFast': enableFailFast,
886+
'globalVars': globalVarsJson,
887+
] + additionalParameters
888+
889+
if (env.alternativeTRT && platform == "x86_64") {
890+
parameters += [
891+
'alternativeTRT': env.alternativeTRT,
892+
]
893+
}
894+
895+
if (env.alternativeTrtSBSA && platform == "SBSA") {
896+
parameters += [
897+
'alternativeTRT': env.alternativeTrtSBSA,
898+
]
899+
}
900+
901+
if (env.testPhase2StageName) {
902+
parameters += [
903+
'testPhase2StageName': env.testPhase2StageName,
904+
]
905+
}
906+
907+
if (reuseBuild) {
908+
parameters['reuseArtifactPath'] = "sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${reuseBuild}"
909+
}
910+
911+
echo "Trigger ${jobName} job, params: ${parameters}"
912+
913+
def status = triggerJob(jobName, parameters)
914+
if (status != "SUCCESS") {
915+
error "Downstream job did not succeed"
916+
}
917+
return status
918+
}
919+
881920
def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
882921
{
883922
stages = [
@@ -889,78 +928,88 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
889928
"x86_64-linux": {
890929
script {
891930
stage("Build") {
892-
def parameters = getCommonParameters()
893-
String globalVarsJson = writeJSON returnText: true, json: globalVars
894-
parameters += [
895-
'enableFailFast': enableFailFast,
931+
def additionalParameters = [
896932
'dockerImage': globalVars["LLM_DOCKER_IMAGE"],
897933
'wheelDockerImagePy310': globalVars["LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE"],
898934
'wheelDockerImagePy312': globalVars["LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE"],
899-
'globalVars': globalVarsJson,
900935
]
901-
902-
if (env.alternativeTRT) {
903-
parameters += [
904-
'alternativeTRT': env.alternativeTRT,
905-
]
906-
}
907-
908-
if (reuseBuild) {
909-
parameters['reuseArtifactPath'] = "sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${reuseBuild}"
910-
}
911-
912-
echo "trigger x86_64 build job, params: ${parameters}"
913-
914-
def status = triggerJob("/LLM/helpers/Build-x86_64", parameters)
915-
if (status != "SUCCESS") {
916-
error "Downstream job did not succeed"
917-
}
918-
919-
}
920-
def testStageName = "[Test-x86_64] Run"
921-
if (env.localJobCredentials) {
922-
testStageName = "[Test-x86_64] Remote Run"
936+
launchJob("/LLM/helpers/Build-x86_64", reuseBuild, enableFailFast, globalVars, "x86_64", additionalParameters)
923937
}
938+
def testStageName = "[Test-x86_64-Single-GPU] ${env.localJobCredentials ? "Remote Run" : "Run"}"
939+
def singleGpuTestFailed = false
924940
stage(testStageName) {
925941
if (X86_TEST_CHOICE == STAGE_CHOICE_SKIP) {
926942
echo "x86_64 test job is skipped due to Jenkins configuration"
927943
return
928944
}
929945
try {
930-
parameters = getCommonParameters()
931946
String testFilterJson = writeJSON returnText: true, json: testFilter
932-
String globalVarsJson = writeJSON returnText: true, json: globalVars
933-
parameters += [
934-
'enableFailFast': enableFailFast,
947+
def additionalParameters = [
935948
'testFilter': testFilterJson,
936949
'dockerImage': globalVars["LLM_DOCKER_IMAGE"],
937950
'wheelDockerImagePy310': globalVars["LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE"],
938951
'wheelDockerImagePy312': globalVars["LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE"],
939-
'globalVars': globalVarsJson,
940952
]
941953

942-
if (env.alternativeTRT) {
943-
parameters += [
944-
'alternativeTRT': env.alternativeTRT,
945-
]
954+
launchJob("L0_Test-x86_64-Single-GPU", false, enableFailFast, globalVars, "x86_64", additionalParameters)
955+
} catch (InterruptedException e) {
956+
throw e
957+
} catch (Exception e) {
958+
if (X86_TEST_CHOICE == STAGE_CHOICE_IGNORE) {
959+
catchError(
960+
buildResult: 'SUCCESS',
961+
stageResult: 'FAILURE') {
962+
error "x86_64 test failed but ignored due to Jenkins configuration"
963+
}
964+
} else {
965+
catchError(
966+
buildResult: 'FAILURE',
967+
stageResult: 'FAILURE') {
968+
error "x86_64 single-GPU test failed"
969+
}
970+
singleGpuTestFailed = true
946971
}
972+
}
973+
}
947974

948-
if (env.testPhase2StageName) {
949-
parameters += [
950-
'testPhase2StageName': env.testPhase2StageName,
951-
]
975+
def requireMultiGpuTesting = currentBuild.description?.contains("Require Multi-GPU Testing") ?: false
976+
echo "requireMultiGpuTesting: ${requireMultiGpuTesting}"
977+
if (!requireMultiGpuTesting) {
978+
return
979+
}
980+
981+
if (singleGpuTestFailed) {
982+
if (env.JOB_NAME ==~ /.*PostMerge.*/) {
983+
echo "In the official post-merge pipeline, single-GPU test failed, whereas multi-GPU test is still kept running."
984+
} else {
985+
stage("[Test-x86_64-Multi-GPU] Blocked") {
986+
catchError(
987+
buildResult: 'FAILURE',
988+
stageResult: 'FAILURE') {
989+
error "This pipeline requires running multi-GPU test, but single-GPU test has failed."
990+
}
952991
}
992+
return
993+
}
994+
}
953995

954-
echo "trigger x86_64 test job, params: ${parameters}"
996+
testStageName = "[Test-x86_64-Multi-GPU] ${env.localJobCredentials ? "Remote Run" : "Run"}"
997+
stage(testStageName) {
998+
if (X86_TEST_CHOICE == STAGE_CHOICE_SKIP) {
999+
echo "x86_64 test job is skipped due to Jenkins configuration"
1000+
return
1001+
}
1002+
try {
1003+
def testFilterJson = writeJSON returnText: true, json: testFilter
1004+
def additionalParameters = [
1005+
'testFilter': testFilterJson,
1006+
'dockerImage': globalVars["LLM_DOCKER_IMAGE"],
1007+
'wheelDockerImagePy310': globalVars["LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE"],
1008+
'wheelDockerImagePy312': globalVars["LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE"],
1009+
]
9551010

956-
def status = triggerJob(
957-
"L0_Test-x86_64",
958-
parameters,
959-
)
1011+
launchJob("L0_Test-x86_64-Multi-GPU", false, enableFailFast, globalVars, "x86_64", additionalParameters)
9601012

961-
if (status != "SUCCESS") {
962-
error "Downstream job did not succeed"
963-
}
9641013
} catch (InterruptedException e) {
9651014
throw e
9661015
} catch (Exception e) {
@@ -991,79 +1040,26 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
9911040
return
9921041
}
9931042

994-
def stageName = "Build"
995-
stage(stageName) {
996-
def parameters = getCommonParameters()
997-
String globalVarsJson = writeJSON returnText: true, json: globalVars
998-
parameters += [
999-
'enableFailFast': enableFailFast,
1043+
stage("Build") {
1044+
def additionalParameters = [
10001045
"dockerImage": globalVars["LLM_SBSA_DOCKER_IMAGE"],
1001-
'globalVars': globalVarsJson,
10021046
]
1003-
1004-
if (env.alternativeTrtSBSA) {
1005-
parameters += [
1006-
"alternativeTRT": env.alternativeTrtSBSA,
1007-
]
1008-
}
1009-
1010-
if (reuseBuild) {
1011-
parameters['reuseArtifactPath'] = "sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${reuseBuild}"
1012-
}
1013-
1014-
echo "trigger SBSA build job, params: ${parameters}"
1015-
1016-
def status = triggerJob(
1017-
"/LLM/helpers/Build-SBSA",
1018-
parameters,
1019-
jenkinsUrl,
1020-
credentials,
1021-
)
1022-
1023-
if (status != "SUCCESS") {
1024-
error "Downstream job did not succeed"
1025-
}
1047+
launchJob("/LLM/helpers/Build-SBSA", reuseBuild, enableFailFast, globalVars, "SBSA", additionalParameters)
10261048
}
10271049
stage(testStageName) {
10281050
if (SBSA_TEST_CHOICE == STAGE_CHOICE_SKIP) {
10291051
echo "SBSA test job is skipped due to Jenkins configuration"
10301052
return
10311053
}
10321054
try {
1033-
def parameters = getCommonParameters()
1034-
String testFilterJson = writeJSON returnText: true, json: testFilter
1035-
String globalVarsJson = writeJSON returnText: true, json: globalVars
1036-
parameters += [
1037-
'enableFailFast': enableFailFast,
1055+
def testFilterJson = writeJSON returnText: true, json: testFilter
1056+
def additionalParameters = [
10381057
'testFilter': testFilterJson,
10391058
"dockerImage": globalVars["LLM_SBSA_DOCKER_IMAGE"],
1040-
'globalVars': globalVarsJson,
10411059
]
10421060

1043-
if (env.alternativeTrtSBSA) {
1044-
parameters += [
1045-
"alternativeTRT": env.alternativeTrtSBSA,
1046-
]
1047-
}
1048-
1049-
if (env.testPhase2StageName) {
1050-
parameters += [
1051-
'testPhase2StageName': env.testPhase2StageName,
1052-
]
1053-
}
1054-
1055-
echo "trigger SBSA test job, params: ${parameters}"
1061+
launchJob("L0_Test-SBSA", false, enableFailFast, globalVars, "SBSA", additionalParameters)
10561062

1057-
def status = triggerJob(
1058-
"L0_Test-SBSA",
1059-
parameters,
1060-
jenkinsUrl,
1061-
credentials,
1062-
)
1063-
1064-
if (status != "SUCCESS") {
1065-
error "Downstream job did not succeed"
1066-
}
10671063
} catch (InterruptedException e) {
10681064
throw e
10691065
} catch (Exception e) {
@@ -1085,31 +1081,23 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
10851081
"Build-Docker-Images": {
10861082
script {
10871083
stage("[Build-Docker-Images] Remote Run") {
1088-
def parameters = getCommonParameters()
1089-
String globalVarsJson = writeJSON returnText: true, json: globalVars
10901084
def branch = env.gitlabBranch ? env.gitlabBranch : "main"
10911085
if (globalVars[GITHUB_PR_API_URL]) {
10921086
branch = "github-pr-" + globalVars[GITHUB_PR_API_URL].split('/').last()
10931087
}
10941088

1095-
parameters += [
1096-
'enableFailFast': enableFailFast,
1089+
def additionalParameters = [
10971090
'branch': branch,
10981091
'action': "push",
10991092
'triggerType': env.JOB_NAME ==~ /.*PostMerge.*/ ? "post-merge" : "pre-merge",
1100-
'globalVars': globalVarsJson,
11011093
]
11021094

1103-
echo "trigger BuildDockerImages job, params: ${parameters}"
1104-
1105-
def status = triggerJob("/LLM/helpers/BuildDockerImages", parameters)
1106-
if (status != "SUCCESS") {
1107-
error "Downstream job did not succeed"
1108-
}
1095+
launchJob("/LLM/helpers/BuildDockerImages", false, enableFailFast, globalVars, "x86_64", additionalParameters)
11091096
}
11101097
}
11111098
}
11121099
]
1100+
11131101
if (env.JOB_NAME ==~ /.*PostMerge.*/) {
11141102
stages += dockerBuildJob
11151103
}

jenkins/L0_Test.groovy

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2274,7 +2274,7 @@ pipeline {
22742274
when {
22752275
expression {
22762276
// Only run the test list validation when necessary
2277-
env.targetArch == X86_64_TRIPLE && testFilter[ONLY_DOCS_FILE_CHANGED] == false
2277+
env.targetArch == X86_64_TRIPLE && testFilter[ONLY_DOCS_FILE_CHANGED] == false && !(env.JOB_NAME ==~ /.*Multi-GPU.*/)
22782278
}
22792279
}
22802280
steps
@@ -2299,17 +2299,33 @@ pipeline {
22992299
dgxJobs = parallelJobs.findAll{dgxSigns.any{sign -> it.key.contains(sign)}}
23002300
}
23012301

2302-
if (singleGpuJobs.size() > 0) {
2303-
singleGpuJobs.failFast = params.enableFailFast
2304-
parallel singleGpuJobs
2305-
} else {
2306-
echo "Skip single-GPU testing. No test to run."
2307-
}
2308-
2309-
if (dgxJobs.size() > 0) {
2310-
stage(testPhase2StageName) {
2302+
if (env.JOB_NAME ==~ /.*Single-GPU.*/) {
2303+
echo "Only run single-GPU tests."
2304+
if (dgxJobs.size() > 0) {
2305+
if (globalVars[ACTION_INFO]['parents'].size() > 0) {
2306+
// We add a special marker to the parent job's description.
2307+
// This will be used to decide whether to run multi-GPU test stage.
2308+
def parentJob = globalVars[ACTION_INFO]['parents'][-2]
2309+
trtllm_utils.appendBuildDescription(this, parentJob['name'], parentJob['build_number'], "====Require Multi-GPU Testing====<br/>")
2310+
} else {
2311+
echo "No parent job found to add the special marker for executing multi-GPU test stage."
2312+
}
2313+
} else {
2314+
echo "Skip multi-GPU testing. No test to run."
2315+
}
2316+
if (singleGpuJobs.size() > 0) {
2317+
singleGpuJobs.failFast = params.enableFailFast
2318+
parallel singleGpuJobs
2319+
} else {
2320+
echo "Skip single-GPU testing. No test to run."
2321+
}
2322+
} else if (env.JOB_NAME ==~ /.*Multi-GPU.*/) {
2323+
echo "Only run multi-GPU tests."
2324+
if (dgxJobs.size() > 0) {
23112325
dgxJobs.failFast = params.enableFailFast
23122326
parallel dgxJobs
2327+
} else {
2328+
error "Skip multi-GPU testing. No test to run."
23132329
}
23142330
}
23152331
}

0 commit comments

Comments
 (0)