|
17 | 17 | #' \code{FALSE}. This argument may be required on some systems where, e.g.,
|
18 | 18 | #' expired jobs or jobs on hold are problematic to detect. If you don't want
|
19 | 19 | #' a timeout, set this to \code{Inf}. Default is \code{604800} (one week).
|
| 20 | +#' @param expire.after [\code{integer(1)}]\cr |
| 21 | +#' Jobs count as \dQuote{expired} if they are not found on the system but have not communicated back |
| 22 | +#' their results (or error message). This frequently happens on managed systems if the scheduler kills |
| 23 | +#' a job because the job has hit the walltime or requested more memory than reserved. |
| 24 | +#' On the other hand, network file systems often require several seconds for new files to be found, |
| 25 | +#' which can lead to false positives in the detection heuristic. |
| 26 | +#' \code{waitForJobs} treats such jobs as expired once they have not been detected on the system |
| 27 | +#' for more than \code{expire.after} iterations (default 3 iterations). |
20 | 28 | #' @param stop.on.error [\code{logical(1)}]\cr
|
21 | 29 | #' Immediately cancel if a job terminates with an error? Default is
|
22 | 30 | #' \code{FALSE}.
|
|
25 | 33 | #' successfully and \code{FALSE} if either the timeout is reached or at least
|
26 | 34 | #' one job terminated with an exception.
|
27 | 35 | #' @export
|
28 |
| -waitForJobs = function(ids = NULL, sleep = NULL, timeout = 604800, stop.on.error = FALSE, reg = getDefaultRegistry()) { |
| 36 | +waitForJobs = function(ids = NULL, sleep = NULL, timeout = 604800, expire.after = 3L, stop.on.error = FALSE, reg = getDefaultRegistry()) { |
29 | 37 | assertRegistry(reg, writeable = FALSE, sync = TRUE)
|
30 | 38 | assertNumber(timeout, lower = 0)
|
| 39 | + assertCount(expire.after, positive = TRUE) |
31 | 40 | assertFlag(stop.on.error)
|
32 | 41 | sleep = getSleepFunction(reg, sleep)
|
33 | 42 | ids = convertIds(reg, ids, default = .findSubmitted(reg = reg))
|
34 | 43 |
|
35 |
| - .findNotTerminated = function(reg, ids = NULL) { |
36 |
| - done = NULL |
37 |
| - filter(reg$status, ids, c("job.id", "done"))[is.na(done), "job.id"] |
38 |
| - } |
39 |
| - |
40 | 44 | if (nrow(.findNotSubmitted(ids = ids, reg = reg)) > 0L) {
|
41 | 45 | warning("Cannot wait for unsubmitted jobs. Removing from ids.")
|
42 | 46 | ids = ids[.findSubmitted(ids = ids, reg = reg), nomatch = 0L]
|
43 | 47 | }
|
44 | 48 |
|
45 |
| - n.jobs = nrow(ids) |
46 |
| - if (n.jobs == 0L) |
| 49 | + if (nrow(ids) == 0L) { |
47 | 50 | return(TRUE)
|
| 51 | + } |
48 | 52 |
|
49 |
| - batch.ids = getBatchIds(reg) |
50 |
| - "!DEBUG [waitForJobs]: Using `nrow(ids)` ids and `nrow(batch.ids)` initial batch ids" |
| 53 | + terminated = on.sys = expire.counter = NULL |
| 54 | + ids$terminated = FALSE |
| 55 | + ids$on.sys = FALSE |
| 56 | + ids$expire.counter = 0L |
51 | 57 |
|
52 | 58 | timeout = Sys.time() + timeout
|
53 |
| - ids.disappeared = noIds() |
54 |
| - pb = makeProgressBar(total = n.jobs, format = "Waiting (S::system R::running D::done E::error) [:bar] :percent eta: :eta") |
55 |
| - i = 1L |
| 59 | + pb = makeProgressBar(total = nrow(ids), format = "Waiting (S::system R::running D::done E::error) [:bar] :percent eta: :eta") |
| 60 | + i = 0L |
56 | 61 |
|
57 | 62 | repeat {
|
58 |
| - # case 1: all jobs terminated -> nothing on system |
59 |
| - ids.nt = .findNotTerminated(reg, ids) |
60 |
| - if (nrow(ids.nt) == 0L) { |
| 63 | + ### case 1: all jobs terminated -> nothing on system |
| 64 | + ids[.findTerminated(reg, ids), "terminated" := TRUE] |
| 65 | + if (ids[!(terminated), .N] == 0L) { |
61 | 66 | "!DEBUG [waitForJobs]: All jobs terminated"
|
62 | 67 | pb$update(1)
|
63 | 68 | waitForResults(reg, ids)
|
64 | 69 | return(nrow(.findErrors(reg, ids)) == 0L)
|
65 | 70 | }
|
66 | 71 |
|
67 |
| - stats = getStatusTable(ids = ids, batch.ids = batch.ids, reg = reg) |
68 |
| - pb$update((n.jobs - nrow(ids.nt)) / n.jobs, tokens = as.list(stats)) |
69 |
| - |
70 |
| - # case 2: there are errors and stop.on.error is TRUE |
| 72 | + ### case 2: there are errors and stop.on.error is TRUE |
71 | 73 | if (stop.on.error && nrow(.findErrors(reg, ids)) > 0L) {
|
72 | 74 | "!DEBUG [waitForJobs]: Errors found and stop.on.error is TRUE"
|
73 | 75 | pb$update(1)
|
74 | 76 | return(FALSE)
|
75 | 77 | }
|
76 | 78 |
|
77 |
| - # case 3: we have reached a timeout |
78 |
| - if (Sys.time() > timeout) { |
| 79 | + batch.ids = getBatchIds(reg) |
| 80 | + ids[, "on.sys" := FALSE][.findOnSystem(reg, ids, batch.ids = batch.ids), "on.sys" := TRUE] |
| 81 | + ids[!(on.sys) & !(terminated), "expire.counter" := expire.counter + 1L] |
| 82 | + stats = getStatusTable(ids = ids, batch.ids = batch.ids, reg = reg) |
| 83 | + pb$update(mean(ids$terminated), tokens = as.list(stats)) |
| 84 | + "!DEBUG [waitForJobs]: batch.ids: `stri_flatten(batch.ids$batch.id, ',')`" |
| 85 | + |
| 86 | + ### case 3: jobs disappeared, we cannot find them on the system in [expire.after] iterations |
| 87 | + if (ids[!(terminated) & expire.counter > expire.after, .N] > 0L) { |
| 88 | + warning("Some jobs disappeared from the system") |
79 | 89 | pb$update(1)
|
80 |
| - warning("Timeout reached") |
| 90 | + waitForResults(reg, ids) |
81 | 91 | return(FALSE)
|
82 | 92 | }
|
83 | 93 |
|
84 |
| - # case 4: jobs disappeared, we cannot find them on the system |
85 |
| - # heuristic: |
86 |
| - # job is not terminated, not on system and has not been on the system |
87 |
| - # in the previous iteration |
88 |
| - ids.on.sys = .findOnSystem(reg, ids, batch.ids = batch.ids) |
89 |
| - if (nrow(ids.disappeared) > 0L) { |
90 |
| - if (nrow(ids.nt[!ids.on.sys, on = "job.id"][ids.disappeared, on = "job.id", nomatch = 0L]) > 0L) { |
91 |
| - warning("Some jobs disappeared from the system") |
92 |
| - pb$update(1) |
93 |
| - waitForResults(reg, ids) |
94 |
| - return(FALSE) |
95 |
| - } |
96 |
| - } |
97 |
| - |
98 |
| - ids.disappeared = ids[!ids.on.sys, on = "job.id"] |
99 |
| - "!DEBUG [waitForJobs]: `nrow(ids.disappeared)` jobs disappeared" |
100 |
| - |
| 94 | + ### case 4: sleep, then stop if the timeout has been reached |
101 | 95 | sleep(i)
|
102 | 96 | i = i + 1L
|
103 |
| - suppressMessages(sync(reg = reg)) |
104 |
| - saveRegistry(reg) |
105 |
| - batch.ids = getBatchIds(reg) |
106 |
| - "!DEBUG [waitForJobs]: New batch.ids: `stri_flatten(batch.ids$batch.id, ',')`" |
| 97 | + if (Sys.time() > timeout) { |
| 98 | + pb$update(1) |
| 99 | + warning("Timeout reached") |
| 100 | + return(FALSE) |
| 101 | + } |
| 102 | + |
| 103 | + if (suppressMessages(sync(reg = reg))) |
| 104 | + saveRegistry(reg) |
107 | 105 | }
|
108 | 106 | }
|
| 107 | + |
0 commit comments