-
Notifications
You must be signed in to change notification settings - Fork 217
Refactor existing logs and add logs #840
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -369,7 +369,8 @@ public Optional<BestFit> findBestFit(TaskExecutorBatchAssignmentRequest request) | |
| } | ||
|
|
||
| if (noResourcesAvailable) { | ||
| log.warn("Not all scheduling constraints had enough workers available to fulfill the request {}", request); | ||
| log.warn("Not all scheduling constraints had enough workers for jobId={}, cluster={}", | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would the machine definition help determine the SKU that the request is trying to schedule? |
||
| request.getJobId(), request.getClusterID()); | ||
| return Optional.empty(); | ||
| } else { | ||
| // Return best fit only if there are enough available TEs for all scheduling constraints | ||
|
|
@@ -393,28 +394,41 @@ private Optional<Map<TaskExecutorID, TaskExecutorState>> findBestFitFor(TaskExec | |
| return Optional.empty(); | ||
| } | ||
|
|
||
| Stream<TaskExecutorHolder> availableTEs = this.executorsByGroup.get(bestFitTeGroupKey.get()) | ||
| NavigableSet<TaskExecutorHolder> groupHolders = this.executorsByGroup.get(bestFitTeGroupKey.get()); | ||
|
|
||
| List<TaskExecutorHolder> availableTEList = groupHolders | ||
| .descendingSet() | ||
| .stream() | ||
| .filter(teHolder -> { | ||
| if (!this.taskExecutorStateMap.containsKey(teHolder.getId())) { | ||
| log.debug("findBestFitFor: TE {} excluded - not in stateMap", teHolder.getId()); | ||
| return false; | ||
| } | ||
| if (currentBestFit.contains(teHolder.getId())) { | ||
| log.debug("findBestFitFor: TE {} excluded - already in bestFit", teHolder.getId()); | ||
| return false; | ||
| } | ||
| TaskExecutorState st = this.taskExecutorStateMap.get(teHolder.getId()); | ||
| return st.isAvailable() && | ||
| // when a TE is returned from here to be used for scheduling, its state remains active until | ||
| // the scheduler triggers another message to update (lock) the state. However, when a large number | ||
| // of requests are active at the same time on the same sku, the gap between this point and the message | ||
| // that locks the state can be large, so another schedule request message can arrive in between and | ||
| // get the same set of TEs. To avoid this, a lease is added to each TE state to temporarily | ||
| // prevent the TE from being used again. Since this is only a lock between actor messages, the lease | ||
| // duration can be short. | ||
| st.getLastSchedulerLeasedDuration().compareTo(this.schedulerLeaseExpirationDuration) > 0 && | ||
| st.getRegistration() != null; | ||
| }); | ||
| if (!st.isAvailable()) { | ||
| log.debug("findBestFitFor: TE {} excluded - not available", teHolder.getId()); | ||
| return false; | ||
| } | ||
| if (st.getLastSchedulerLeasedDuration().compareTo(this.schedulerLeaseExpirationDuration) <= 0) { | ||
| log.debug("findBestFitFor: TE {} excluded - lease not expired ({}ms)", teHolder.getId(), st.getLastSchedulerLeasedDuration().toMillis()); | ||
| return false; | ||
| } | ||
| if (st.getRegistration() == null) { | ||
| log.debug("findBestFitFor: TE {} excluded - no registration", teHolder.getId()); | ||
| return false; | ||
| } | ||
| return true; | ||
| }) | ||
| .collect(Collectors.toList()); | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note: this function now performs a double collect (materializing the stream into a list and then streaming it again), which incurs a performance penalty.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agreed — we can revert this after we finish diagnosing the issue. |
||
|
|
||
| log.info("findBestFitFor: group={}, holdersInGroup={}, availableAfterFilter={}, requested={}", | ||
| bestFitTeGroupKey.get(), groupHolders.size(), availableTEList.size(), numWorkers); | ||
|
|
||
| Stream<TaskExecutorHolder> availableTEs = availableTEList.stream(); | ||
|
|
||
| if(availableTaskExecutorMutatorHook != null) { | ||
| availableTEs = availableTaskExecutorMutatorHook.mutate(availableTEs, request, schedulingConstraints); | ||
|
|
@@ -642,6 +656,7 @@ private Map<String, Integer> mapReservationsToSku( | |
|
|
||
| Map<String, Integer> reservationCountBySku = new HashMap<>(); | ||
|
|
||
| log.info("mapReservationsToSku called with {} reservations", pendingReservations == null ? 0 : pendingReservations.size()); | ||
| if (pendingReservations == null || pendingReservations.isEmpty()) { | ||
| return reservationCountBySku; | ||
| } | ||
|
|
@@ -783,14 +798,12 @@ private Optional<Map<TaskExecutorID, TaskExecutorState>> findTaskExecutorsFor(Ta | |
| if (taskExecutors.isPresent() && taskExecutors.get().size() == allocationRequests.size()) { | ||
| return taskExecutors; | ||
| } else { | ||
| log.warn("Not enough available TEs found for scheduling constraints {}, request: {}", schedulingConstraints, request); | ||
| if (taskExecutors.isPresent()) { | ||
| log.debug("Found {} Task Executors: {} for request: {} with constraints: {}", | ||
| taskExecutors.get().size(), taskExecutors.get(), request, schedulingConstraints); | ||
| } else { | ||
| log.warn("No suitable Task Executors found for request: {} with constraints: {}", | ||
| request, schedulingConstraints); | ||
| } | ||
| log.warn("Not enough available TEs for constraints {} (found={}, needed={}, jobId={}, workerId={})", | ||
| schedulingConstraints, | ||
| taskExecutors.isPresent() ? taskExecutors.get().size() : 0, | ||
| allocationRequests.size(), | ||
| request.getJobId(), | ||
| allocationRequests.stream().map(a -> a.getWorkerId().toString()).collect(Collectors.joining(","))); | ||
|
|
||
| // If there are not enough workers with the given spec then add the request the pending ones | ||
| if (!isJobIdAlreadyPending && request.getAllocationRequests().size() > 2) { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
i think you still need workerId + schedulingConstraint info
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The worker id and constraint info are already logged in findTaskExecutorsFor before reaching this log line.
I can include the constraint again in this log line.
The worker id can't be output here because it is nested inside an array field of the request.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
maybe include the reservation instance too? could make search easier