Skip to content

Commit

Permalink
[OPAL-11149] Add more logs and set the estimator worker after chief r…
Browse files Browse the repository at this point in the history
…ole finished timeout
  • Loading branch information
zuston committed Dec 1, 2021
1 parent 3107b4c commit 5dd9270
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,12 @@ public boolean isHealthy(Configuration tonyConf) {
* tony.application.dependency.evaluator.timeout.after.A = 3600
*
*/
String errorMsg = groupDependencyTimeout(tonyConf);
String errorMsg = null;
try {
errorMsg = groupDependencyTimeout(tonyConf);
} catch (Exception e) {
log.error("Failed to check dependency timeout.", e);
}
if (errorMsg != null) {
session.setFinalStatus(FinalApplicationStatus.FAILED, errorMsg);
return false;
Expand All @@ -154,6 +159,7 @@ public boolean isHealthy(Configuration tonyConf) {
protected String groupDependencyTimeout(Configuration tonyConf) {
if (taskWithDependentGrpsIndex == null) {
taskWithDependentGrpsIndex = Utils.getJobTypeDependentGrps(tonyConf);
log.info("Task types dependent grp: " + taskWithDependentGrpsIndex);
}
// taskWithDependentGrpsIndex is a map; key: waiting role, value: its pre-dependent groups and the waiting timeout
if (taskWithDependentGrpsIndex == null || taskWithDependentGrpsIndex.isEmpty()) {
Expand All @@ -163,6 +169,7 @@ protected String groupDependencyTimeout(Configuration tonyConf) {
// grpWithMembersIndex is a map; key: group name, value: the members of that group
if (grpWithMembersIndex == null) {
grpWithMembersIndex = Utils.getAllGroupJobTypes(tonyConf);
log.info("Group members: " + grpWithMembersIndex);
}

// memberInGroups is a map; key: job type name, value: the groups that job type belongs to
Expand Down Expand Up @@ -211,10 +218,12 @@ protected String groupDependencyTimeout(Configuration tonyConf) {
continue;
}

log.info("Running job type: " + runningTaskType + ", all dependent task finished: " + allDependentTaskFinished);

if (System.currentTimeMillis() - latestEndTimeInAllDependentTasks > timeout) {
return String.format("Jobtype: %s runs exceeded timeout because it's "
+ "dependent group: %s (task set: [%s]) has been finished.",
runningTaskType, dependentGroupName,
return String.format("Jobtype: %s runs exceeded timeout(%s sec) because it's "
+ "dependent group: %s (task set: [%s]) has been finished",
runningTaskType, dependentGroupPair.getValue(), dependentGroupName,
StringUtils.join(grpWithMembersIndex.get(dependentGroupName), ","));
}
}
Expand Down
10 changes: 10 additions & 0 deletions tony-core/src/main/resources/tony-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -389,4 +389,14 @@
<name>tony.horovod.driver.mode.debug</name>
<value>false</value>
</property>

<property>
<name>tony.application.group.TFESTIMATOR</name>
<value>chief</value>
</property>

<property>
<name>tony.application.dependency.worker.timeout.after.TFESTIMATOR</name>
<value>7200</value>
</property>
</configuration>

0 comments on commit 5dd9270

Please sign in to comment.