Skip to content

Commit 03d1636

Browse files
committed
Implement secondary file testing, staging, and indexing.
The staging part worked previously, long ago, but things have changed a lot and this needed to be touched up. - Implements secondary files when summarizing cwl outputs for testing and CLI. - Upgrades cwltool to the latest version, since the previous target version had a bug in it; this required adapting the path mapper interface a bit. - Implements indexing of secondary files - the test cases require that secondary files are ordered. - Tweaks to upload.py to allow uploading files with secondaryFiles. - Fixes a bug related to not matching a workflow step to the correct tool. - Fixes a bug related to NO_REPLACEMENT handling for FieldParameters. The big thing not implemented in this commit is Directories with secondaryFiles - so ultimately the search.cwl conformance test still fails.
1 parent 7a00dcb commit 03d1636

File tree

12 files changed

+214
-59
lines changed

12 files changed

+214
-59
lines changed

lib/galaxy/dependencies/pinned-requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,5 +84,5 @@ chronos-python==0.38.0
8484
python-genomespaceclient==0.1.8
8585

8686
# For CWL support.
87-
cwltool==1.0.20170727112954
87+
cwltool==1.0.20170828135420
8888
cwltest==1.0.20170809112706 # TODO: only required for testing...

lib/galaxy/model/__init__.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4004,6 +4004,11 @@ def __init__(self):
40044004
self.merge_type = self.default_merge_type
40054005
self.scatter_type = self.default_scatter_type
40064006

4007+
def log_str(self):
4008+
return "WorkflowStepInput[name=%s]" % (
4009+
self.name,
4010+
)
4011+
40074012

40084013
class WorkflowStepConnection(object):
40094014
# Constant used in lieu of output_name and input_name to indicate an
@@ -4035,6 +4040,11 @@ def copy(self):
40354040
copied_connection.input_name = self.input_name
40364041
return copied_connection
40374042

4043+
def log_str(self):
4044+
return "WorkflowStepConnection[output_step_id=%s,output_name=%s,input_step_id=%s,input_name=%s]" % (
4045+
self.output_step_id, self.output_name, self.input_step_id, self.input_name
4046+
)
4047+
40384048

40394049
class WorkflowOutput(object):
40404050

lib/galaxy/tools/cwl/parser.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,11 @@
3333
)
3434

3535
from .schema import non_strict_schema_loader, schema_loader
36+
from .util import SECONDARY_FILES_EXTRA_PREFIX
3637

3738
log = logging.getLogger(__name__)
3839

3940
JOB_JSON_FILE = ".cwl_job.json"
40-
SECONDARY_FILES_EXTRA_PREFIX = "__secondary_files__"
4141

4242
DOCKER_REQUIREMENT = "DockerRequirement"
4343
SUPPORTED_TOOL_REQUIREMENTS = [
@@ -223,7 +223,8 @@ def id(self):
223223
def galaxy_id(self):
224224
raw_id = self.id
225225
tool_id = None
226-
if raw_id:
226+
# don't reduce "search.cwl#index" to search
227+
if raw_id and "#" not in raw_id:
227228
tool_id = os.path.splitext(os.path.basename(raw_id))[0]
228229
if not tool_id:
229230
from galaxy.tools.hash import build_tool_hash
@@ -485,14 +486,15 @@ def stageFunc(resolved_path, target_path):
485486
process.stageFiles(cwl_job.pathmapper, stageFunc, ignoreWritable=True, symLink=False)
486487

487488
if hasattr(cwl_job, "generatefiles"):
489+
outdir = os.path.join(self._job_directory, "working")
488490
# TODO: Why doesn't cwl_job.generatemapper work?
489491
generate_mapper = pathmapper.PathMapper(cwl_job.generatefiles["listing"],
490-
os.path.join(self._job_directory, "working"), os.path.join(self._job_directory, "working"), separateDirs=False)
492+
outdir, outdir, separateDirs=False)
491493
# TODO: figure out what inplace_update should be.
492494
inplace_update = getattr(cwl_job, "inplace_update")
493495
process.stageFiles(generate_mapper, stageFunc, ignoreWritable=inplace_update, symLink=False)
494496
from cwltool import job
495-
job.relink_initialworkdir(generate_mapper, inplace_update=inplace_update)
497+
job.relink_initialworkdir(generate_mapper, outdir, outdir, inplace_update=inplace_update)
496498
# else: expression tools do not have a path mapper.
497499

498500
@staticmethod

lib/galaxy/tools/cwl/representation.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -127,18 +127,26 @@ def dataset_wrapper_to_file_json(inputs_dir, dataset_wrapper):
127127
extra_files_path = dataset_wrapper.extra_files_path
128128
secondary_files_path = os.path.join(extra_files_path, "__secondary_files__")
129129
path = str(dataset_wrapper)
130+
raw_file_object = {"class": "File"}
131+
130132
if os.path.exists(secondary_files_path):
131133
safe_makedirs(inputs_dir)
132134
name = os.path.basename(path)
133135
new_input_path = os.path.join(inputs_dir, name)
134136
os.symlink(path, new_input_path)
137+
secondary_files = []
135138
for secondary_file_name in os.listdir(secondary_files_path):
136139
secondary_file_path = os.path.join(secondary_files_path, secondary_file_name)
137-
os.symlink(secondary_file_path, new_input_path + secondary_file_name)
140+
target = os.path.join(inputs_dir, secondary_file_name)
141+
log.info("linking [%s] to [%s]" % (secondary_file_path, target))
142+
os.symlink(secondary_file_path, target)
143+
is_dir = os.path.isdir(os.path.realpath(secondary_file_path))
144+
secondary_files.append({"class": "File" if not is_dir else "Directory", "location": target})
145+
146+
raw_file_object["secondaryFiles"] = secondary_files
138147
path = new_input_path
139148

140-
raw_file_object = {"location": path,
141-
"class": "File"}
149+
raw_file_object["location"] = path
142150
set_basename_and_derived_properties(raw_file_object, str(dataset_wrapper.cwl_filename or dataset_wrapper.name))
143151
return raw_file_object
144152

lib/galaxy/tools/cwl/runtime_actions.py

Lines changed: 59 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,19 @@
88
load_job_proxy,
99
)
1010

11+
from .util import (
12+
SECONDARY_FILES_INDEX_PATH,
13+
STORE_SECONDARY_FILES_WITH_BASENAME,
14+
)
15+
16+
17+
def _possible_uri_to_path(location):
18+
if location.startswith("file://"):
19+
path = ref_resolver.uri_file_path(location)
20+
else:
21+
path = location
22+
return path
23+
1124

1225
def handle_outputs(job_directory=None):
1326
# Relocate dynamically collected files to pre-determined locations
@@ -30,25 +43,54 @@ def handle_outputs(job_directory=None):
3043

3144
def move_output_file(output, target_path, output_name=None):
3245
assert output["class"] == "File"
33-
output_path = ref_resolver.uri_file_path(output["location"])
46+
output_path = _possible_uri_to_path(output["location"])
3447
shutil.move(output_path, target_path)
3548

36-
for secondary_file in output.get("secondaryFiles", []):
37-
if output_name is None:
38-
raise NotImplementedError("secondaryFiles are unimplemented for dynamic list elements")
39-
40-
# TODO: handle nested files...
41-
secondary_file_path = ref_resolver.uri_file_path(secondary_file["location"])
42-
assert secondary_file_path.startswith(output_path)
43-
secondary_file_name = secondary_file_path[len(output_path):]
44-
secondary_files_dir = job_proxy.output_secondary_files_dir(
45-
output_name, create=True
46-
)
47-
extra_target = os.path.join(secondary_files_dir, secondary_file_name)
48-
shutil.move(
49-
secondary_file_path,
50-
extra_target,
51-
)
49+
secondary_files = output.get("secondaryFiles", [])
50+
if secondary_files:
51+
52+
order = []
53+
index_contents = {
54+
"order": order
55+
}
56+
57+
for secondary_file in secondary_files:
58+
if output_name is None:
59+
raise NotImplementedError("secondaryFiles are unimplemented for dynamic list elements")
60+
61+
# TODO: handle nested files...
62+
secondary_file_path = _possible_uri_to_path(secondary_file["location"])
63+
# assert secondary_file_path.startswith(output_path), "[%s] does not start with [%s]" % (secondary_file_path, output_path)
64+
secondary_file_basename = secondary_file["basename"]
65+
66+
if not STORE_SECONDARY_FILES_WITH_BASENAME:
67+
output_basename = output["basename"]
68+
prefix = ""
69+
while True:
70+
if secondary_file_basename.startswith(output_basename):
71+
secondary_file_name = prefix + secondary_file_basename[len(output_basename):]
72+
break
73+
prefix = "^%s" % prefix
74+
if "." not in output_basename:
75+
secondary_file_name = prefix + secondary_file_name
76+
break
77+
else:
78+
output_basename = output_basename.rsplit(".", 1)[0]
79+
else:
80+
secondary_file_name = secondary_file_basename
81+
# Convert to ^ format....
82+
secondary_files_dir = job_proxy.output_secondary_files_dir(
83+
output_name, create=True
84+
)
85+
extra_target = os.path.join(secondary_files_dir, secondary_file_name)
86+
shutil.move(
87+
secondary_file_path,
88+
extra_target,
89+
)
90+
order.append(secondary_file_name)
91+
92+
with open(os.path.join(secondary_files_dir, "..", SECONDARY_FILES_INDEX_PATH), "w") as f:
93+
json.dump(index_contents, f)
5294

5395
return {"cwl_filename": output["basename"]}
5496

lib/galaxy/tools/cwl/util.py

Lines changed: 63 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@
1212

1313
from six import iteritems, StringIO
1414

15+
STORE_SECONDARY_FILES_WITH_BASENAME = True
16+
SECONDARY_FILES_EXTRA_PREFIX = "__secondary_files__"
17+
SECONDARY_FILES_INDEX_PATH = "__secondary_files_index.json"
18+
1519

1620
def set_basename_and_derived_properties(properties, basename):
1721
properties["basename"] = basename
@@ -62,11 +66,11 @@ def galactic_job_json(
6266
datasets = []
6367
dataset_collections = []
6468

65-
def upload_file(file_path):
69+
def upload_file(file_path, secondary_files):
6670
if not os.path.isabs(file_path):
6771
file_path = os.path.join(test_data_directory, file_path)
6872
_ensure_file_exists(file_path)
69-
target = FileUploadTarget(file_path)
73+
target = FileUploadTarget(file_path, secondary_files)
7074
upload_response = upload_func(target)
7175
dataset = upload_response["outputs"][0]
7276
datasets.append((dataset, target))
@@ -125,7 +129,30 @@ def replacement_file(value):
125129
if file_path is None:
126130
return value
127131

128-
return upload_file(file_path)
132+
secondary_files = value.get("secondaryFiles", [])
133+
secondary_files_tar_path = None
134+
if secondary_files:
135+
tmp = tempfile.NamedTemporaryFile(delete=False)
136+
tf = tarfile.open(fileobj=tmp, mode='w:')
137+
order = []
138+
index_contents = {
139+
"order": order
140+
}
141+
for secondary_file in secondary_files:
142+
secondary_file_path = secondary_file.get("location", None) or value.get("path", None)
143+
assert secondary_file_path, "Invalid secondaryFile entry found [%s]" % secondary_file
144+
full_secondary_file_path = os.path.join(test_data_directory, secondary_file_path)
145+
basename = secondary_file.get("basename") or secondary_file_path
146+
order.append(basename)
147+
tf.add(full_secondary_file_path, os.path.join(SECONDARY_FILES_EXTRA_PREFIX, basename))
148+
tmp_index = tempfile.NamedTemporaryFile(delete=False)
149+
json.dump(index_contents, tmp_index)
150+
tmp_index.close()
151+
tf.add(tmp_index.name, SECONDARY_FILES_INDEX_PATH)
152+
tf.close()
153+
secondary_files_tar_path = tmp.name
154+
155+
return upload_file(file_path, secondary_files_tar_path)
129156

130157
def replacement_directory(value):
131158
file_path = value.get("location", None) or value.get("path", None)
@@ -196,8 +223,9 @@ def _ensure_file_exists(file_path):
196223

197224
class FileUploadTarget(object):
198225

199-
def __init__(self, path):
226+
def __init__(self, path, secondary_files=None):
200227
self.path = path
228+
self.secondary_files = secondary_files
201229

202230

203231
class ObjectUploadTarget(object):
@@ -257,31 +285,49 @@ def element_to_cwl_json(element):
257285
return output_to_cwl_json(element_output, get_metadata, get_dataset)
258286

259287
output_metadata = get_metadata(galaxy_output.history_content_type, galaxy_output.history_content_id)
288+
289+
def dataset_dict_to_json_content(dataset_dict):
290+
if "content" in dataset_dict:
291+
return json.loads(dataset_dict["content"])
292+
else:
293+
with open(dataset_dict["path"]) as f:
294+
return json.load(f)
295+
260296
if output_metadata["history_content_type"] == "dataset":
261297
ext = output_metadata["file_ext"]
262298
assert output_metadata["state"] == "ok"
263299
dataset_dict = get_dataset(output_metadata)
264300
if ext == "expression.json":
265-
if "content" in dataset_dict:
266-
return json.loads(dataset_dict["content"])
267-
else:
268-
with open(dataset_dict["path"]) as f:
269-
return json.load(f)
301+
return dataset_dict_to_json_content(dataset_dict)
270302
else:
271303
properties = output_properties(pseduo_location=pseduo_location, **dataset_dict)
272304
basename = properties["basename"]
273305
extra_files = get_extra_files(output_metadata)
306+
found_index = False
274307
for extra_file in extra_files:
275308
if extra_file["class"] == "File":
276309
path = extra_file["path"]
277-
if path.startswith("__secondary_files__/"):
278-
ec = get_dataset(output_metadata, filename=path)
279-
ec["basename"] = basename + os.path.basename(path)
280-
ec_properties = output_properties(pseduo_location=pseduo_location, **ec)
281-
if "secondaryFiles" not in properties:
282-
properties["secondaryFiles"] = []
283-
284-
properties["secondaryFiles"].append(ec_properties)
310+
if path == SECONDARY_FILES_INDEX_PATH:
311+
found_index = True
312+
313+
if found_index:
314+
ec = get_dataset(output_metadata, filename=SECONDARY_FILES_INDEX_PATH)
315+
index = dataset_dict_to_json_content(ec)
316+
for basename in index["order"]:
317+
for extra_file in extra_files:
318+
if extra_file["class"] == "File":
319+
path = extra_file["path"]
320+
if path == os.path.join(SECONDARY_FILES_EXTRA_PREFIX, basename):
321+
ec = get_dataset(output_metadata, filename=path)
322+
if not STORE_SECONDARY_FILES_WITH_BASENAME:
323+
ec["basename"] = basename + os.path.basename(path)
324+
else:
325+
ec["basename"] = os.path.basename(path)
326+
ec_properties = output_properties(pseduo_location=pseduo_location, **ec)
327+
if "secondaryFiles" not in properties:
328+
properties["secondaryFiles"] = []
329+
330+
properties["secondaryFiles"].append(ec_properties)
285331

286332
return properties
287333

lib/galaxy/tools/parameters/grouping.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,8 @@ def get_composite_dataset_name(self, context):
212212
dataset_name = context.get('files_metadata|base_name', None)
213213
if dataset_name is None:
214214
dataset_name = context.get('files_metadata', {}).get('base_name', None)
215+
if dataset_name is None:
216+
dataset_name = context.get("files")[0].get("NAME", None)
215217
if dataset_name is None:
216218
dataset_name = 'Uploaded Composite Dataset (%s)' % self.get_file_type(context)
217219
return dataset_name

lib/galaxy/workflow/modules.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1142,7 +1142,7 @@ def callback(input, prefixed_name, **kwargs):
11421142
replacement = {"src": "hda", "value": replacement}
11431143
elif isinstance(replacement, model.HistoryDatasetCollectionAssociation):
11441144
replacement = {"src": "hdca", "value": replacement}
1145-
else:
1145+
elif replacement is not NO_REPLACEMENT:
11461146
replacement = {"src": "json", "value": replacement}
11471147

11481148
log.info("replacement for [%s] is [%s]" % (prefixed_name, replacement))

test/api/test_workflows_cwl.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,24 @@ def test_simplest_wf(self):
4343
output = self.dataset_populator.get_history_dataset_content(self.history_id, hid=2)
4444
assert re.search(r"\s+4\s+9\s+47\s+", output)
4545

46+
def test_load_ids(self):
47+
workflow_id = self._load_workflow("v1.0/search.cwl#main")
48+
workflow_content = self._download_workflow(workflow_id)
49+
for step_index, step in workflow_content["steps"].items():
50+
if "tool_representation" in step:
51+
del step["tool_representation"]
52+
53+
print(workflow_content)
54+
steps = workflow_content["steps"]
55+
step_3 = steps["3"]
56+
step_4 = steps["4"]
57+
58+
assert step_3["label"] == "index", step_3
59+
assert step_4["label"] == "search", step_4
60+
61+
print(step_3)
62+
print(step_4)
63+
4664
def test_count_line1_v1(self):
4765
"""Test simple workflow v1.0/count-lines1-wf.cwl."""
4866
self._run_count_lines_wf("v1.0/count-lines1-wf.cwl")

test/base/populators.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -238,11 +238,24 @@ def upload_func(upload_target):
238238
with open(path, "rb") as f:
239239
content = f.read()
240240

241+
name = os.path.basename(path)
242+
243+
extra_inputs = dict()
244+
if upload_target.secondary_files:
245+
assert UPLOAD_VIA == "path"
246+
extra_inputs["files_1|url_paste"] = "file://%s" % upload_target.secondary_files
247+
extra_inputs["files_1|type"] = "upload_dataset"
248+
extra_inputs["files_1|auto_decompress"] = True
249+
extra_inputs["file_count"] = "2"
250+
extra_inputs["force_composite"] = "True"
251+
241252
return self.dataset_populator.new_dataset_request(
242253
history_id=history_id,
243-
content='content',
254+
content=content,
244255
file_type="auto",
245-
name=os.path.basename(path),
256+
name=name,
257+
auto_decompress=False,
258+
extra_inputs=extra_inputs,
246259
).json()
247260
elif isinstance(upload_target, DirectoryUploadTarget):
248261
path = upload_target.tar_path

0 commit comments

Comments
 (0)