Skip to content

Commit a27b63f

Browse files
committed
Change to task based job description as discussed in #625
1 parent 331f6ac commit a27b63f

File tree

17 files changed

+263
-261
lines changed

17 files changed

+263
-261
lines changed

src/integrationTest/java/nl/esciencecenter/xenon/adaptors/schedulers/gridengine/GridengineSchedulerDockerTest.java

+6-6
Original file line numberDiff line numberDiff line change
@@ -24,19 +24,19 @@
2424
import java.util.HashMap;
2525
import java.util.Map;
2626

27-
import nl.esciencecenter.xenon.schedulers.JobDescription;
28-
import nl.esciencecenter.xenon.schedulers.JobStatus;
29-
import nl.esciencecenter.xenon.utils.LocalFileSystemUtils;
3027
import org.junit.ClassRule;
28+
import org.junit.Test;
3129

3230
import com.palantir.docker.compose.DockerComposeRule;
3331
import com.palantir.docker.compose.connection.waiting.HealthChecks;
3432

3533
import nl.esciencecenter.xenon.XenonException;
3634
import nl.esciencecenter.xenon.adaptors.schedulers.SchedulerLocationConfig;
3735
import nl.esciencecenter.xenon.credentials.PasswordCredential;
36+
import nl.esciencecenter.xenon.schedulers.JobDescription;
37+
import nl.esciencecenter.xenon.schedulers.JobStatus;
3838
import nl.esciencecenter.xenon.schedulers.Scheduler;
39-
import org.junit.Test;
39+
import nl.esciencecenter.xenon.utils.LocalFileSystemUtils;
4040

4141
public class GridengineSchedulerDockerTest extends GridengineSchedulerTestParent {
4242

@@ -84,8 +84,8 @@ public void test_submitBatch_peUsingCoreCount() throws XenonException {
8484

8585
JobDescription job = new JobDescription();
8686
job.setExecutable("/bin/hostname");
87-
job.setProcessesPerNode(2);
88-
job.setStartSingleProcess(true);
87+
job.setCoresPerTask(2);
88+
job.setStartPerTask(false);
8989

9090
String jobID = scheduler.submitBatchJob(job);
9191

src/main/java/nl/esciencecenter/xenon/adaptors/schedulers/JobQueueScheduler.java

+6-6
Original file line numberDiff line numberDiff line change
@@ -311,16 +311,16 @@ private void verifyJobDescription(JobDescription description, boolean interactiv
311311
throw new IncompleteJobDescriptionException(adaptorName, "Executable missing in JobDescription!");
312312
}
313313

314-
int nodeCount = description.getNodeCount();
314+
int tasks = description.getTasks();
315315

316-
if (nodeCount != 1) {
317-
throw new InvalidJobDescriptionException(adaptorName, "Illegal node count: " + nodeCount);
316+
if (tasks != 1) {
317+
throw new InvalidJobDescriptionException(adaptorName, "Unsupported task count: " + tasks);
318318
}
319319

320-
int processesPerNode = description.getProcessesPerNode();
320+
int tasksPerNode = description.getTasksPerNode();
321321

322-
if (processesPerNode != 1) {
323-
throw new InvalidJobDescriptionException(adaptorName, "Illegal processes per node count: " + processesPerNode);
322+
if (tasksPerNode > 1) {
323+
throw new InvalidJobDescriptionException(adaptorName, "Unsupported task per node count: " + tasksPerNode);
324324
}
325325

326326
int maxTime = description.getMaxRuntime();

src/main/java/nl/esciencecenter/xenon/adaptors/schedulers/ScriptingUtils.java

+6-6
Original file line numberDiff line numberDiff line change
@@ -193,16 +193,16 @@ public static void verifyJobDescription(JobDescription description, String[] que
193193
throw new IncompleteJobDescriptionException(adaptorName, "Executable missing in JobDescription!");
194194
}
195195

196-
int nodeCount = description.getNodeCount();
196+
int tasks = description.getTasks();
197197

198-
if (nodeCount < 1) {
199-
throw new InvalidJobDescriptionException(adaptorName, "Illegal node count: " + nodeCount);
198+
if (tasks < 1) {
199+
throw new InvalidJobDescriptionException(adaptorName, "Illegal task count: " + tasks);
200200
}
201201

202-
int processesPerNode = description.getProcessesPerNode();
202+
int coresPerTask = description.getCoresPerTask();
203203

204-
if (processesPerNode < 1) {
205-
throw new InvalidJobDescriptionException(adaptorName, "Illegal processes per node count: " + processesPerNode);
204+
if (coresPerTask < 1) {
205+
throw new InvalidJobDescriptionException(adaptorName, "Illegal cores per task count: " + coresPerTask);
206206
}
207207

208208
// Check if the time is set to -1 (default), 0 (infinite), or a value.

src/main/java/nl/esciencecenter/xenon/adaptors/schedulers/at/AtUtils.java

+6-6
Original file line numberDiff line numberDiff line change
@@ -160,16 +160,16 @@ public static void verifyJobDescription(JobDescription description, String[] que
160160
ScriptingUtils.verifyJobDescription(description, queueNames, ADAPTOR_NAME);
161161

162162
// Perform at specific checks
163-
int nodeCount = description.getNodeCount();
163+
int tasks = description.getTasks();
164164

165-
if (nodeCount > 1) {
166-
throw new InvalidJobDescriptionException(ADAPTOR_NAME, "Unsupported node count: " + nodeCount);
165+
if (tasks > 1) {
166+
throw new InvalidJobDescriptionException(ADAPTOR_NAME, "Unsupported task count: " + tasks);
167167
}
168168

169-
int processesPerNode = description.getProcessesPerNode();
169+
int tasksPerNode = description.getTasksPerNode();
170170

171-
if (processesPerNode > 1) {
172-
throw new InvalidJobDescriptionException(ADAPTOR_NAME, "Unsupported processes per node count: " + processesPerNode);
171+
if (tasksPerNode > 1) {
172+
throw new InvalidJobDescriptionException(ADAPTOR_NAME, "Unsupported task per node count: " + tasksPerNode);
173173
}
174174

175175
int maxTime = description.getMaxRuntime();

src/main/java/nl/esciencecenter/xenon/adaptors/schedulers/gridengine/GridEngineUtils.java

+27-19
Original file line numberDiff line numberDiff line change
@@ -50,24 +50,27 @@ final class GridEngineUtils {
5050
private static final int MINUTES_PER_HOUR = 60;
5151

5252
protected static void generateParallelEnvironmentSpecification(JobDescription description, GridEngineSetup setup, Formatter script) throws XenonException {
53-
if (description.getNodeCount() == 1 && description.getProcessesPerNode() == 1) {
54-
// Single node + single core
53+
if (description.getTasks() == 1 && description.getCoresPerTask() == 1) {
54+
// Single task + single core
5555
// nothing to do
56-
} else if (description.getNodeCount() == 1 && description.getProcessesPerNode() > 1) {
56+
} else if (description.getTasks() == 1 && description.getCoresPerTask() > 1) {
5757
// Single node + multi core
58-
Optional<ParallelEnvironmentInfo> pe = setup.getSingleNodeParallelEnvironment(description.getProcessesPerNode(), description.getQueueName());
58+
Optional<ParallelEnvironmentInfo> pe = setup.getSingleNodeParallelEnvironment(description.getCoresPerTask(), description.getQueueName());
5959
if (pe.isPresent()) {
60-
script.format("#$ -pe %s %d\n", pe.get().getName(), description.getProcessesPerNode());
60+
script.format("#$ -pe %s %d\n", pe.get().getName(), description.getCoresPerTask());
6161
} else {
62-
throw new InvalidJobDescriptionException(ADAPTOR_NAME, "Unable to find a parallel environment for multi core on single node, replace node count and cores per node with scheduler.addSchedulerArgument(\"-pe <name of parallel environment (qconf -spl)> <number of slots>\")");
62+
throw new InvalidJobDescriptionException(ADAPTOR_NAME,
63+
"Unable to find a parallel environment for multi core on single node, replace node count and cores per node with scheduler.addSchedulerArgument(\"-pe <name of parallel environment (qconf -spl)> <number of slots>\")");
6364
}
6465
} else {
6566
// Multi node + multi core
66-
Optional<ParallelEnvironmentInfo> pe = setup.getMultiNodeParallelEnvironment(description.getProcessesPerNode(), description.getNodeCount(), description.getQueueName());
67+
Optional<ParallelEnvironmentInfo> pe = setup.getMultiNodeParallelEnvironment(description.getCoresPerTask(), description.getTasks(),
68+
description.getQueueName());
6769
if (pe.isPresent()) {
68-
script.format("#$ -pe %s %d\n", pe.get().getName(), description.getProcessesPerNode() * description.getNodeCount());
70+
script.format("#$ -pe %s %d\n", pe.get().getName(), description.getCoresPerTask() * description.getTasks());
6971
} else {
70-
throw new InvalidJobDescriptionException(ADAPTOR_NAME, "Unable to find a parallel environment for multiple nodes, replace node count and cores per node with scheduler.addSchedulerArgument(\"-pe <name of parallel environment (qconf -spl)> <number of slots>\")");
72+
throw new InvalidJobDescriptionException(ADAPTOR_NAME,
73+
"Unable to find a parallel environment for multiple nodes, replace node count and cores per node with scheduler.addSchedulerArgument(\"-pe <name of parallel environment (qconf -spl)> <number of slots>\")");
7174
}
7275
}
7376
}
@@ -84,14 +87,19 @@ protected static void generateSerialScriptContent(JobDescription description, Fo
8487
protected static void generateParallelScriptContent(JobDescription description, Formatter script) {
8588
script.format("%s\n", "for host in `cat $PE_HOSTFILE | cut -d \" \" -f 1` ; do");
8689

87-
for (int i = 0; i < description.getProcessesPerNode(); i++) {
88-
script.format("%s", " ssh -o StrictHostKeyChecking=false $host \"cd `pwd` && ");
89-
script.format("%s", description.getExecutable());
90-
for (String argument : description.getArguments()) {
91-
script.format(" %s", ScriptingUtils.protectAgainstShellMetas(argument));
92-
}
93-
script.format("%c&\n", '"');
90+
// TODO: the PE_HOSTFILE seems to contain "<host> <slots> <queue> <processor range>"
91+
//
92+
// What happens if we want 2x2 slots on a PE with 4 slots per host? Will we get 1 or 2 lines in the PE_HOSTFILE???
93+
94+
// for (int i = 0; i < 1; i++) {
95+
script.format("%s", " ssh -o StrictHostKeyChecking=false $host \"cd `pwd` && ");
96+
script.format("%s", description.getExecutable());
97+
for (String argument : description.getArguments()) {
98+
script.format(" %s", ScriptingUtils.protectAgainstShellMetas(argument));
9499
}
100+
script.format("%c&\n", '"');
101+
// }
102+
95103
// wait for all ssh connections to finish
96104
script.format("%s\n\n", "done");
97105
script.format("%s\n", "wait");
@@ -181,10 +189,10 @@ protected static String generate(JobDescription description, Path fsEntryPath, G
181189

182190
script.format("\n");
183191

184-
if ((description.getNodeCount() == 1 && description.getProcessesPerNode() == 1) || description.isStartSingleProcess()) {
185-
generateSerialScriptContent(description, script);
186-
} else {
192+
if (description.startPerTask() && description.getTasks() > 1) {
187193
generateParallelScriptContent(description, script);
194+
} else {
195+
generateSerialScriptContent(description, script);
188196
}
189197

190198
script.close();

src/main/java/nl/esciencecenter/xenon/adaptors/schedulers/slurm/SlurmUtils.java

+13-17
Original file line numberDiff line numberDiff line change
@@ -310,10 +310,6 @@ protected static void verifyJobDescription(JobDescription description, String[]
310310
throw new InvalidJobDescriptionException(ADAPTOR_NAME, "Custom job script not supported in interactive mode");
311311
}
312312

313-
if (description.isStartSingleProcess()) {
314-
throw new InvalidJobDescriptionException(ADAPTOR_NAME, "StartSingleProcess option not supported in interactive mode");
315-
}
316-
317313
if (description.getStdin() != null) {
318314
throw new InvalidJobDescriptionException(ADAPTOR_NAME, "Stdin redirect not supported in interactive mode");
319315
}
@@ -366,15 +362,15 @@ public static String[] generateInteractiveArguments(JobDescription description,
366362
}
367363

368364
// number of nodes
369-
arguments.add("--nodes=" + description.getNodeCount());
365+
arguments.add("--ntasks=" + description.getTasks());
370366

371-
// number of processer per node
372-
arguments.add("--ntasks-per-node=" + description.getProcessesPerNode());
367+
// number of processor per node
368+
if (description.getTasksPerNode() > 0) {
369+
arguments.add("--ntasks-per-node=" + description.getTasksPerNode());
370+
}
373371

374372
// number of thread per process
375-
if (description.getThreadsPerProcess() > 0) {
376-
arguments.add("--cpus-per-task=" + description.getThreadsPerProcess());
377-
}
373+
arguments.add("--cpus-per-task=" + description.getCoresPerTask());
378374

379375
// the max amount of memory per node.
380376
if (description.getMaxMemory() > 0) {
@@ -427,14 +423,14 @@ public static String generate(JobDescription description, Path fsEntryPath, int
427423
}
428424

429425
// number of nodes
430-
script.format("#SBATCH --nodes=%d\n", description.getNodeCount());
431-
432-
// number of processer per node
433-
script.format("#SBATCH --ntasks-per-node=%d\n", description.getProcessesPerNode());
426+
script.format("#SBATCH --ntasks=%d\n", description.getTasks());
434427

435428
// number of thread per process
436-
if (description.getThreadsPerProcess() > 0) {
437-
script.format("#SBATCH --cpus-per-task=%d\n", description.getThreadsPerProcess());
429+
script.format("#SBATCH --cpus-per-task=%d\n", description.getCoresPerTask());
430+
431+
// number of processer per node
432+
if (description.getTasksPerNode() > 0) {
433+
script.format("#SBATCH --ntasks-per-node=%d\n", description.getTasksPerNode());
438434
}
439435

440436
// add maximum runtime in hour:minute:second format (converted from minutes in description)
@@ -484,7 +480,7 @@ public static String generate(JobDescription description, Path fsEntryPath, int
484480

485481
script.format("\n");
486482

487-
if (!description.isStartSingleProcess()) {
483+
if (description.startPerTask()) {
488484
// run commands through srun
489485
script.format("%s ", "srun");
490486
}

src/main/java/nl/esciencecenter/xenon/adaptors/schedulers/torque/TorqueUtils.java

+14-5
Original file line numberDiff line numberDiff line change
@@ -186,16 +186,25 @@ public static String generate(JobDescription description, Path workdir, int defa
186186
script.format("#PBS -q %s\n", description.getQueueName());
187187
}
188188

189-
int processorsPerNode = description.getProcessesPerNode();
189+
int tasks = description.getTasks();
190+
int coresPerTask = description.getCoresPerTask();
191+
int tasksPerNode = description.getTasksPerNode();
190192

191-
int threads = description.getThreadsPerProcess();
193+
if (tasksPerNode < 0) {
194+
// assume 1 task per node
195+
tasksPerNode = 1;
196+
}
197+
198+
int nodes = tasks / tasksPerNode;
192199

193-
if (threads > 1) {
194-
processorsPerNode = processorsPerNode * threads;
200+
if (tasks % tasksPerNode > 0) {
201+
nodes += 1;
195202
}
196203

204+
int ppn = coresPerTask * tasksPerNode;
205+
197206
// number of nodes and processes per node
198-
script.format("#PBS -l nodes=%d:ppn=%d\n", description.getNodeCount(), processorsPerNode);
207+
script.format("#PBS -l nodes=%d:ppn=%d\n", nodes, ppn);
199208

200209
// the max amount of memory per node.
201210
if (description.getMaxMemory() > 0) {

0 commit comments

Comments
 (0)