25
25
package io .airbyte .scheduler .app ;
26
26
27
27
import com .google .common .annotations .VisibleForTesting ;
28
+ import com .google .common .collect .Sets ;
28
29
import io .airbyte .commons .concurrency .LifecycledCallable ;
29
30
import io .airbyte .commons .enums .Enums ;
30
31
import io .airbyte .config .helpers .LogClientSingleton ;
36
37
import io .airbyte .scheduler .persistence .job_tracker .JobTracker .JobState ;
37
38
import java .nio .file .Path ;
38
39
import java .util .Optional ;
40
+ import java .util .Set ;
39
41
import java .util .concurrent .ExecutorService ;
42
+ import java .util .function .Consumer ;
40
43
import org .slf4j .Logger ;
41
44
import org .slf4j .LoggerFactory ;
42
45
import org .slf4j .MDC ;
@@ -50,6 +53,9 @@ public class JobSubmitter implements Runnable {
50
53
private final TemporalWorkerRunFactory temporalWorkerRunFactory ;
51
54
private final JobTracker jobTracker ;
52
55
56
+ // See attemptJobSubmit() to understand the need for this Concurrent Set.
57
+ private final Set <Long > runningJobs = Sets .newConcurrentHashSet ();
58
+
53
59
public JobSubmitter (final ExecutorService threadPool ,
54
60
final JobPersistence persistence ,
55
61
final TemporalWorkerRunFactory temporalWorkerRunFactory ,
@@ -67,18 +73,42 @@ public void run() {
67
73
68
74
final Optional <Job > nextJob = persistence .getNextJob ();
69
75
70
- nextJob .ifPresent (job -> {
71
- trackSubmission (job );
72
- submitJob (job );
73
- LOGGER .info ("Job-Submitter Summary. Submitted job with scope {}" , job .getScope ());
74
- });
76
+ nextJob .ifPresent (attemptJobSubmit ());
75
77
76
78
LOGGER .debug ("Completed Job-Submitter..." );
77
79
} catch (Throwable e ) {
78
80
LOGGER .error ("Job Submitter Error" , e );
79
81
}
80
82
}
81
83
84
+ /**
85
+ * Since job submission and job execution happen in two separate thread pools, and job execution is
86
+ * what removes a job from the submission queue, it is possible for a job to be submitted multiple
87
+ * times.
88
+ *
89
+ * This synchronised block guarantees only a single thread can utilise the concurrent set to decide
90
+ * whether a job should be submitted. This job id is added here, and removed in the finish block of
91
+ * {@link #submitJob(Job)}.
92
+ *
93
+ * Since {@link JobPersistence#getNextJob()} returns the next queued job, this solution cause
94
+ * head-of-line blocking as the JobSubmitter tries to submit the same job. However, this suggests
95
+ * the Worker Pool needs more workers and is inevitable when dealing with pending jobs.
96
+ *
97
+ * See https://github.com/airbytehq/airbyte/issues/4378 for more info.
98
+ */
99
+ synchronized private Consumer <Job > attemptJobSubmit () {
100
+ return job -> {
101
+ if (!runningJobs .contains (job .getId ())) {
102
+ runningJobs .add (job .getId ());
103
+ trackSubmission (job );
104
+ submitJob (job );
105
+ LOGGER .info ("Job-Submitter Summary. Submitted job with scope {}" , job .getScope ());
106
+ } else {
107
+ LOGGER .info ("Attempting to submit already running job {}. There are probably too many queued jobs." , job .getId ());
108
+ }
109
+ };
110
+ }
111
+
82
112
@ VisibleForTesting
83
113
void submitJob (Job job ) {
84
114
final WorkerRun workerRun = temporalWorkerRunFactory .create (job );
@@ -94,7 +124,6 @@ void submitJob(Job job) {
94
124
final Path logFilePath = workerRun .getJobRoot ().resolve (LogClientSingleton .LOG_FILENAME );
95
125
final long persistedAttemptId = persistence .createAttempt (job .getId (), logFilePath );
96
126
assertSameIds (attemptNumber , persistedAttemptId );
97
-
98
127
LogClientSingleton .setJobMdc (workerRun .getJobRoot ());
99
128
})
100
129
.setOnSuccess (output -> {
@@ -114,7 +143,10 @@ void submitJob(Job job) {
114
143
persistence .failAttempt (job .getId (), attemptNumber );
115
144
trackCompletion (job , io .airbyte .workers .JobStatus .FAILED );
116
145
})
117
- .setOnFinish (MDC ::clear )
146
+ .setOnFinish (() -> {
147
+ runningJobs .remove (job .getId ());
148
+ MDC .clear ();
149
+ })
118
150
.build ());
119
151
}
120
152
0 commit comments