@@ -22,6 +22,7 @@ use crate::state::execution_graph::{ExecutionGraph, Task};
22
22
use crate :: state:: executor_manager:: { ExecutorManager , ExecutorReservation } ;
23
23
use crate :: state:: { decode_protobuf, encode_protobuf, with_lock} ;
24
24
use ballista_core:: config:: BallistaConfig ;
25
+ #[ cfg( not( test) ) ]
25
26
use ballista_core:: error:: BallistaError ;
26
27
use ballista_core:: error:: Result ;
27
28
use ballista_core:: serde:: protobuf:: executor_grpc_client:: ExecutorGrpcClient ;
@@ -41,7 +42,6 @@ use rand::distributions::Alphanumeric;
41
42
use rand:: { thread_rng, Rng } ;
42
43
use std:: collections:: HashMap ;
43
44
use std:: default:: Default ;
44
- use std:: sync:: atomic:: { AtomicUsize , Ordering } ;
45
45
use std:: sync:: Arc ;
46
46
use std:: time:: { SystemTime , UNIX_EPOCH } ;
47
47
use tokio:: sync:: RwLock ;
@@ -60,7 +60,6 @@ pub struct TaskManager<T: 'static + AsLogicalPlan, U: 'static + AsExecutionPlan>
60
60
scheduler_id : String ,
61
61
// Cache for active execution graphs curated by this scheduler
62
62
active_job_cache : ExecutionGraphCache ,
63
- pending_task_queue_size : Arc < AtomicUsize > ,
64
63
}
65
64
66
65
impl < T : ' static + AsLogicalPlan , U : ' static + AsExecutionPlan > TaskManager < T , U > {
@@ -77,7 +76,6 @@ impl<T: 'static + AsLogicalPlan, U: 'static + AsExecutionPlan> TaskManager<T, U>
77
76
codec,
78
77
scheduler_id,
79
78
active_job_cache : Arc :: new ( RwLock :: new ( HashMap :: new ( ) ) ) ,
80
- pending_task_queue_size : Arc :: new ( AtomicUsize :: new ( 0 ) ) ,
81
79
}
82
80
}
83
81
@@ -103,12 +101,11 @@ impl<T: 'static + AsLogicalPlan, U: 'static + AsExecutionPlan> TaskManager<T, U>
103
101
. await ?;
104
102
105
103
graph. revive ( ) ;
106
- let available_tasks = graph. available_tasks ( ) ;
107
104
108
105
let mut active_graph_cache = self . active_job_cache . write ( ) . await ;
109
106
active_graph_cache. insert ( job_id. to_owned ( ) , Arc :: new ( RwLock :: new ( graph) ) ) ;
110
107
111
- self . increase_pending_queue_size ( available_tasks )
108
+ Ok ( ( ) )
112
109
}
113
110
114
111
/// Get the status of of a job. First look in the active cache.
@@ -202,19 +199,16 @@ impl<T: 'static + AsLogicalPlan, U: 'static + AsExecutionPlan> TaskManager<T, U>
202
199
let mut pending_tasks = 0usize ;
203
200
let mut assign_tasks = 0usize ;
204
201
let job_cache = self . active_job_cache . read ( ) . await ;
205
-
206
202
for ( _job_id, graph) in job_cache. iter ( ) {
207
203
let mut graph = graph. write ( ) . await ;
208
204
for reservation in free_reservations. iter ( ) . skip ( assign_tasks) {
209
205
if let Some ( task) = graph. pop_next_task ( & reservation. executor_id ) ? {
210
206
assignments. push ( ( reservation. executor_id . clone ( ) , task) ) ;
211
207
assign_tasks += 1 ;
212
- self . decrease_pending_queue_size ( 1 ) ?;
213
208
} else {
214
209
break ;
215
210
}
216
211
}
217
-
218
212
if assign_tasks >= free_reservations. len ( ) {
219
213
pending_tasks = graph. available_tasks ( ) ;
220
214
break ;
@@ -259,6 +253,18 @@ impl<T: 'static + AsLogicalPlan, U: 'static + AsExecutionPlan> TaskManager<T, U>
259
253
) -> Result < ( ) > {
260
254
let lock = self . state . lock ( Keyspace :: ActiveJobs , "" ) . await ?;
261
255
256
+ let running_tasks = self
257
+ . get_execution_graph ( job_id)
258
+ . await
259
+ . map ( |graph| graph. running_tasks ( ) )
260
+ . unwrap_or_else ( |_| vec ! [ ] ) ;
261
+
262
+ info ! (
263
+ "Cancelling {} running tasks for job {}" ,
264
+ running_tasks. len( ) ,
265
+ job_id
266
+ ) ;
267
+
262
268
let failed_at = SystemTime :: now ( )
263
269
. duration_since ( UNIX_EPOCH )
264
270
. expect ( "Time went backwards" )
@@ -267,47 +273,39 @@ impl<T: 'static + AsLogicalPlan, U: 'static + AsExecutionPlan> TaskManager<T, U>
267
273
self . fail_job_inner ( lock, job_id, "Cancelled" . to_owned ( ) , failed_at)
268
274
. await ?;
269
275
270
- if let Ok ( graph) = self . get_execution_graph ( job_id) . await {
271
- let running_tasks = graph. running_tasks ( ) ;
272
- let mut tasks: HashMap < & str , Vec < protobuf:: PartitionId > > = Default :: default ( ) ;
273
-
274
- info ! (
275
- "Cancelling {} running tasks for job {}" ,
276
- running_tasks. len( ) ,
277
- job_id
278
- ) ;
279
- for ( partition, executor_id) in & running_tasks {
280
- if let Some ( parts) = tasks. get_mut ( executor_id. as_str ( ) ) {
281
- parts. push ( protobuf:: PartitionId {
276
+ let mut tasks: HashMap < & str , Vec < protobuf:: PartitionId > > = Default :: default ( ) ;
277
+
278
+ for ( partition, executor_id) in & running_tasks {
279
+ if let Some ( parts) = tasks. get_mut ( executor_id. as_str ( ) ) {
280
+ parts. push ( protobuf:: PartitionId {
281
+ job_id : job_id. to_owned ( ) ,
282
+ stage_id : partition. stage_id as u32 ,
283
+ partition_id : partition. partition_id as u32 ,
284
+ } )
285
+ } else {
286
+ tasks. insert (
287
+ executor_id. as_str ( ) ,
288
+ vec ! [ protobuf:: PartitionId {
282
289
job_id: job_id. to_owned( ) ,
283
290
stage_id: partition. stage_id as u32 ,
284
291
partition_id: partition. partition_id as u32 ,
285
- } )
286
- } else {
287
- tasks. insert (
288
- executor_id. as_str ( ) ,
289
- vec ! [ protobuf:: PartitionId {
290
- job_id: job_id. to_owned( ) ,
291
- stage_id: partition. stage_id as u32 ,
292
- partition_id: partition. partition_id as u32 ,
293
- } ] ,
294
- ) ;
295
- }
292
+ } ] ,
293
+ ) ;
296
294
}
295
+ }
297
296
298
- for ( executor_id, partitions) in tasks {
299
- if let Ok ( mut client) = executor_manager. get_client ( executor_id) . await {
300
- client
301
- . cancel_tasks ( CancelTasksParams {
302
- partition_id : partitions,
303
- } )
304
- . await ?;
305
- } else {
306
- error ! ( "Failed to get client for executor ID {}" , executor_id)
307
- }
297
+ for ( executor_id, partitions) in tasks {
298
+ if let Ok ( mut client) = executor_manager. get_client ( executor_id) . await {
299
+ client
300
+ . cancel_tasks ( CancelTasksParams {
301
+ partition_id : partitions,
302
+ } )
303
+ . await ?;
304
+ } else {
305
+ error ! ( "Failed to get client for executor ID {}" , executor_id)
308
306
}
309
- self . decrease_pending_queue_size ( graph. available_tasks ( ) ) ?;
310
307
}
308
+
311
309
Ok ( ( ) )
312
310
}
313
311
@@ -366,7 +364,6 @@ impl<T: 'static + AsLogicalPlan, U: 'static + AsExecutionPlan> TaskManager<T, U>
366
364
pub async fn fail_running_job ( & self , job_id : & str ) -> Result < ( ) > {
367
365
if let Some ( graph) = self . get_active_execution_graph ( job_id) . await {
368
366
let graph = graph. read ( ) . await . clone ( ) ;
369
- let available_tasks = graph. available_tasks ( ) ;
370
367
let value = self . encode_execution_graph ( graph) ?;
371
368
372
369
debug ! ( "Moving job {} from Active to Failed" , job_id) ;
@@ -375,7 +372,6 @@ impl<T: 'static + AsLogicalPlan, U: 'static + AsExecutionPlan> TaskManager<T, U>
375
372
self . state
376
373
. put ( Keyspace :: FailedJobs , job_id. to_owned ( ) , value)
377
374
. await ?;
378
- self . decrease_pending_queue_size ( available_tasks) ?
379
375
} else {
380
376
warn ! ( "Fail to find job {} in the cache" , job_id) ;
381
377
}
@@ -389,13 +385,10 @@ impl<T: 'static + AsLogicalPlan, U: 'static + AsExecutionPlan> TaskManager<T, U>
389
385
let mut graph = graph. write ( ) . await ;
390
386
graph. revive ( ) ;
391
387
let graph = graph. clone ( ) ;
392
- let available_tasks = graph. available_tasks ( ) ;
393
388
let value = self . encode_execution_graph ( graph) ?;
394
-
395
389
self . state
396
390
. put ( Keyspace :: ActiveJobs , job_id. to_owned ( ) , value)
397
391
. await ?;
398
- self . increase_pending_queue_size ( available_tasks) ?;
399
392
} else {
400
393
warn ! ( "Fail to find job {} in the cache" , job_id) ;
401
394
}
@@ -581,34 +574,4 @@ impl<T: 'static + AsLogicalPlan, U: 'static + AsExecutionPlan> TaskManager<T, U>
581
574
. take ( 7 )
582
575
. collect ( )
583
576
}
584
-
585
- pub fn increase_pending_queue_size ( & self , num : usize ) -> Result < ( ) > {
586
- match self . pending_task_queue_size . fetch_update (
587
- Ordering :: Relaxed ,
588
- Ordering :: Relaxed ,
589
- |s| Some ( s + num) ,
590
- ) {
591
- Ok ( _) => Ok ( ( ) ) ,
592
- Err ( _) => Err ( BallistaError :: Internal (
593
- "Unable to update pending task counter" . to_owned ( ) ,
594
- ) ) ,
595
- }
596
- }
597
-
598
- pub fn decrease_pending_queue_size ( & self , num : usize ) -> Result < ( ) > {
599
- match self . pending_task_queue_size . fetch_update (
600
- Ordering :: Relaxed ,
601
- Ordering :: Relaxed ,
602
- |s| Some ( s - num) ,
603
- ) {
604
- Ok ( _) => Ok ( ( ) ) ,
605
- Err ( _) => Err ( BallistaError :: Internal (
606
- "Unable to update pending task counter" . to_owned ( ) ,
607
- ) ) ,
608
- }
609
- }
610
-
611
- pub fn get_pending_task_queue_size ( & self ) -> usize {
612
- self . pending_task_queue_size . load ( Ordering :: SeqCst )
613
- }
614
577
}
0 commit comments