Skip to content

Commit c73bb69

Browse files
committed
test
Signed-off-by: Salvatore Daniele <[email protected]>
1 parent 3a73ff2 commit c73bb69

File tree

2 files changed

+88
-69
lines changed

2 files changed

+88
-69
lines changed

extraConfigDpu.py

Lines changed: 83 additions & 69 deletions
Original file line number | Diff line number | Diff line change
@@ -310,75 +310,89 @@ def ExtraConfigDpuHost(cc: ClustersConfig, cfg: ExtraConfigArgs, futures: dict[s
310310
lh = host.LocalHost()
311311
client = K8sClient(cc.kubeconfig)
312312

313-
if cfg.rebuild_dpu_operators_images:
314-
registry = build_dpu_operator_images()
315-
else:
316-
logger.info("Will not rebuild dpu-operator images")
317-
registry = _ensure_local_registry_running(lh, delete_all=False)
318-
operator_image = f"{registry}/openshift-dpu-operator/cda-dpu-operator:latest"
319-
daemon_image = f"{registry}/openshift-dpu-operator/cda-dpu-daemon:latest"
320-
321-
# Need to trust the registry in OCP / Microshift
322-
logger.info("Ensuring local registry is trusted in OCP")
323-
reglocal.ocp_trust(client, reglocal.get_local_registry_base_directory(lh), reglocal.get_local_registry_hostname(lh), 5000)
324-
325-
h = host.Host(cc.workers[0].node)
326-
vendor_plugin = init_vendor_plugin(h)
327-
vendor_plugin.build_and_start(lh, client, registry)
328-
329-
start_dpu_operator(lh, client, operator_image, daemon_image)
330-
client.oc_run_or_die("wait --for=condition=Ready pod --all -n openshift-dpu-operator --timeout=2m")
331-
332-
def helper(h: host.Host, node: NodeConfig) -> Optional[host.Result]:
333-
# Temporary workaround, remove once 4.16 installations are working
334-
logger.info("Ensuring Rhel 9.4 kernel is installed")
335-
ensure_rhel_9_4_kernel_is_installed(h)
336-
# There is a bug with the idpf driver that causes the IPU to fail to be enumerated over PCIe on boot
337-
# As a result, we will need to trigger cold boots of the node until the device is available
338-
# TODO: Remove when no longer needed
339-
retries = 3
340-
h.ssh_connect("core")
341-
ret = h.run(f"test -d /sys/class/net/{cfg.dpu_net_interface}")
342-
while ret.returncode != 0:
343-
logger.error(f"{h.hostname()} does not have a network device {cfg.dpu_net_interface} cold booting node to try to recover")
344-
h.cold_boot()
345-
logger.info("Cold boot triggered, waiting for host to reboot")
346-
time.sleep(60)
347-
h.ssh_connect("core")
348-
retries = retries - 1
349-
if retries == 0:
350-
logger.error_and_exit(f"Failed to bring up IPU net device on {h.hostname()}")
351-
ret = h.run(f"test -d /sys/class/net/{cfg.dpu_net_interface}")
352-
353-
# Label the node
354-
logger.info(f"labeling node {h.hostname()} dpu=true")
355-
client.oc_run_or_die(f"label no {e.name} dpu=true")
356-
return None
357-
358-
executor = ThreadPoolExecutor(max_workers=len(cc.workers))
359-
f = []
360-
# Assuming that all workers have a DPU
361-
for e in cc.workers:
362-
logger.info(f"Calling helper function for node {e.node}")
363-
bmc = host.BMC.from_bmc(e.bmc, e.bmc_user, e.bmc_password)
364-
h = host.Host(e.node, bmc)
365-
f.append(executor.submit(helper, h, e))
366-
367-
for thread in f:
368-
logger.info(thread.result())
369-
370-
logger.info("Verified idpf is providing net-devs on DPU worker nodes")
371-
372-
# Create host nad
373-
# TODO: Remove when this is automatically created by the dpu operator
374-
logger.info("Creating dpu NAD")
375-
client.oc("delete -f manifests/dpu/dpu_nad.yaml")
376-
client.oc_run_or_die("create -f manifests/dpu/dpu_nad.yaml")
377-
# Deploy dpu daemon and wait for dpu pods to come up
378-
logger.info("Creating dpu operator config")
379-
client.oc_run_or_die(f"create -f {REPO_DIR}/examples/dpu.yaml")
380-
time.sleep(30)
381-
client.oc_run_or_die("wait --for=condition=Ready pod --all -n openshift-dpu-operator --timeout=5m")
313+
# if cfg.rebuild_dpu_operators_images:
314+
# registry = build_dpu_operator_images()
315+
# else:
316+
# logger.info("Will not rebuild dpu-operator images")
317+
# registry = _ensure_local_registry_running(lh, delete_all=False)
318+
# operator_image = f"{registry}/openshift-dpu-operator/cda-dpu-operator:latest"
319+
# daemon_image = f"{registry}/openshift-dpu-operator/cda-dpu-daemon:latest"
320+
321+
# # Need to trust the registry in OCP / Microshift
322+
# logger.info("Ensuring local registry is trusted in OCP")
323+
# reglocal.ocp_trust(client, reglocal.get_local_registry_base_directory(lh), reglocal.get_local_registry_hostname(lh), 5000)
324+
325+
326+
logger.info("creating test container")
327+
image = "alpine:latest"
328+
name = "ipu_host_test"
329+
cmd = f"podman pull {image}"
330+
lh.run_or_die(cmd)
331+
cmd = f"podman run -d --name {name} {image} sh -c 'while true; do sleep 1; done'"
332+
lh.run_or_die(cmd)
333+
334+
CONTAINER_NAME = "local-container-registry"
335+
#cmd = f"podman run -d --name {CONTAINER_NAME} -p 5000:5000 -v /root/.local-container-registry/data:/var/lib/registry:z -v /root/.local-container-registry/auth:/auth:z -v /root/.local-container-registry/certs:/certs:z -e REGISTRY_HTTP_TLS_CERTIFICATE=/certs/domain.crt -e REGISTRY_HTTP_TLS_KEY=/certs/domain.key -e REGISTRY_COMPATIBILITY_SCHEMA1_ENABLED=true --annotation=LOCAL_CONTAINER_REGISTRY_HOSTNAME=wsfd-advnetlab217.anl.eng.bos2.dc.redhat.com docker.io/library/registry:latest"
336+
cmd = f"podman run -d --name {CONTAINER_NAME} docker.io/library/registry:latest sh -c 'while true; do sleep 1; done'"
337+
lh.run_or_die(cmd)
338+
339+
# h = host.Host(cc.workers[0].node)
340+
# vendor_plugin = init_vendor_plugin(h)
341+
# vendor_plugin.build_and_start(lh, client, registry)
342+
343+
# start_dpu_operator(lh, client, operator_image, daemon_image)
344+
# client.oc_run_or_die("wait --for=condition=Ready pod --all -n openshift-dpu-operator --timeout=2m")
345+
346+
# def helper(h: host.Host, node: NodeConfig) -> Optional[host.Result]:
347+
# # Temporary workaround, remove once 4.16 installations are working
348+
# logger.info("Ensuring Rhel 9.4 kernel is installed")
349+
# ensure_rhel_9_4_kernel_is_installed(h)
350+
# # There is a bug with the idpf driver that causes the IPU to fail to be enumerated over PCIe on boot
351+
# # As a result, we will need to trigger cold boots of the node until the device is available
352+
# # TODO: Remove when no longer needed
353+
# retries = 3
354+
# h.ssh_connect("core")
355+
# ret = h.run(f"test -d /sys/class/net/{cfg.dpu_net_interface}")
356+
# while ret.returncode != 0:
357+
# logger.error(f"{h.hostname()} does not have a network device {cfg.dpu_net_interface} cold booting node to try to recover")
358+
# h.cold_boot()
359+
# logger.info("Cold boot triggered, waiting for host to reboot")
360+
# time.sleep(60)
361+
# h.ssh_connect("core")
362+
# retries = retries - 1
363+
# if retries == 0:
364+
# logger.error_and_exit(f"Failed to bring up IPU net device on {h.hostname()}")
365+
# ret = h.run(f"test -d /sys/class/net/{cfg.dpu_net_interface}")
366+
367+
# # Label the node
368+
# logger.info(f"labeling node {h.hostname()} dpu=true")
369+
# client.oc_run_or_die(f"label no {e.name} dpu=true")
370+
# return None
371+
372+
# executor = ThreadPoolExecutor(max_workers=len(cc.workers))
373+
# f = []
374+
# # Assuming that all workers have a DPU
375+
# for e in cc.workers:
376+
# logger.info(f"Calling helper function for node {e.node}")
377+
# bmc = host.BMC.from_bmc(e.bmc, e.bmc_user, e.bmc_password)
378+
# h = host.Host(e.node, bmc)
379+
# f.append(executor.submit(helper, h, e))
380+
381+
# for thread in f:
382+
# logger.info(thread.result())
383+
384+
# logger.info("Verified idpf is providing net-devs on DPU worker nodes")
385+
386+
# # Create host nad
387+
# # TODO: Remove when this is automatically created by the dpu operator
388+
# logger.info("Creating dpu NAD")
389+
# client.oc("delete -f manifests/dpu/dpu_nad.yaml")
390+
# client.oc_run_or_die("create -f manifests/dpu/dpu_nad.yaml")
391+
# # Deploy dpu daemon and wait for dpu pods to come up
392+
# logger.info("Creating dpu operator config")
393+
# client.oc_run_or_die(f"create -f {REPO_DIR}/examples/dpu.yaml")
394+
# time.sleep(30)
395+
# client.oc_run_or_die("wait --for=condition=Ready pod --all -n openshift-dpu-operator --timeout=5m")
382396
logger.info("Finished setting up dpu operator on host")
383397

384398

reglocal.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -110,6 +110,11 @@ def ensure_running(rsh: host.Host, *, delete_all: bool = False, listen_port: int
110110
)
111111
)
112112

113+
# Add logging to check the status of the container
114+
logger.info("Checking if the container is still running")
115+
status_ret = rsh.run(shlex.join(["podman", "ps", "-a", "--filter", f"name={CONTAINER_NAME}"]))
116+
logger.info(f"Container status: {status_ret.out}")
117+
113118
return dir_name, hostname, listen_port, ret.out.strip()
114119

115120

0 commit comments

Comments
 (0)