Skip to content

Auto Recover From CUDA error #204

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
48 changes: 46 additions & 2 deletions agent_scheduler/task_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -462,8 +462,30 @@ def __execute_ui_task(self, task_id: str, is_img2img: bool, *args):
if result[0] is None and hasattr(shared.state, "oom") and shared.state.oom:
res = OutOfMemoryError()
elif "CUDA out of memory" in result[2]:
res = OutOfMemoryError()
if getattr(shared.opts, "queue_recovery", True):
log.error("img2img: CUDA out of memory")
os._exit(1)
else:
res = OutOfMemoryError()
elif "A tensor with all NaNs was produced in Unet." in result[2]:
if getattr(shared.opts, "queue_recovery", True):
log.error("img2img: A tensor with all NaNs was produced in Unet.")
os._exit(1)
else:
log.error("img2img: A tensor with all NaNs was produced in Unet.")
elif "wildcard" in result[2]:
log.error("img2img: Dropped by DiffusionDefender")
elif "list index out of range" in result[2]:
log.error("img2img: list index out of range1")
elif "CUDA error" in result[2]:
#CUDA error: unknown error
#CUDA error: an illegal memory access was encountered
#CUDA error: misaligned address
log.error("img2img: CUDA error")
if getattr(shared.opts, "queue_recovery", True):
os._exit(1)
else:
log.error("img2img: else error")
res = result[1]
except Exception as e:
res = e
Expand All @@ -487,8 +509,30 @@ def __execute_api_task(self, task_id: str, is_img2img: bool, **kwargs):
res = result.info
except Exception as e:
if "CUDA out of memory" in str(e):
res = OutOfMemoryError()
if getattr(shared.opts, "queue_recovery", True):
log.error("txt2img: CUDA out of memory")
os._exit(1)
else:
res = OutOfMemoryError()
elif "A tensor with all NaNs was produced in Unet." in str(e):
if getattr(shared.opts, "queue_recovery", True):
log.error("txt2img: A tensor with all NaNs was produced in Unet.")
os._exit(1)
else:
log.error("txt2img: A tensor with all NaNs was produced in Unet.")
elif "wildcard" in str(e):
log.error("txt2img: Dropped by DiffusionDefender")
elif "list index out of range" in str(e):
log.error("txt2img: list index out of range")
elif "CUDA error" in str(e):
#CUDA error: unknown error
#CUDA error: an illegal memory access was encountered
#CUDA error: misaligned address
log.error("txt2img: CUDA error")
if getattr(shared.opts, "queue_recovery", True):
os._exit(1)
else:
log.error("txt2img: else")
res = e
finally:
progress.finish_task(task_id)
Expand Down
10 changes: 10 additions & 0 deletions scripts/task_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,6 +674,16 @@ def on_ui_settings():
section=section,
),
)
shared.opts.add_option(
"queue_recovery",
shared.OptionInfo(
True,
"Exit Automatic1111 on CUDA error. This is useful if your using webui-user-loop.bat to automaticly re-start Automatic111. (This applies to all CUDA error(s), not just Agent Scheduler Tasks.)",
gr.Checkbox,
{"interactive": True},
section=section,
),
)

def enqueue_keyboard_shortcut(disabled: bool, modifiers, key_code: str):
if disabled:
Expand Down
20 changes: 20 additions & 0 deletions webui-user-loop.bat
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
@echo off
rem Restart loop for the web UI: runs webui.bat, and whenever it returns
rem (for example after the process exits on a CUDA error) starts it again.

rem File whose contents are supplied to webui.bat as standard input (see the
rem call below). The path is relative, so it is resolved against the directory
rem selected by the "cd" further down.
set LOGFILE=batch.log

cls

rem Launcher settings. NOTE(review): presumably read by webui.bat / the
rem launcher it starts; empty values are assumed to mean "use the default" -
rem confirm against webui.bat.
set PYTHON=
set GIT=
set VENV_DIR=
set COMMANDLINE_ARGS=--autolaunch --update-check --xformers --api --theme dark
set XFORMERS_PACKAGE=xformers==0.0.20

rem Go two directories above this script's own location rather than above the
rem caller's current directory, so the loop works no matter where it is
rem started from. /d also switches drive when needed.
cd /d "%~dp0..\.."

rem Input redirection from a missing file makes cmd reject the whole command
rem ("The system cannot find the file specified"), which would leave the loop
rem spinning without ever starting webui.bat. Create an empty file up front.
if not exist "%LOGFILE%" type nul > "%LOGFILE%"

:start
echo "loop Start"
rem NOTE(review): stdin is redirected presumably so that any prompt inside
rem webui.bat reads from the file instead of blocking on the console - confirm.
call webui.bat < "%LOGFILE%"
echo "looped"
rem Short pause between restarts so a persistent startup failure does not turn
rem into a tight loop and the user has time to press Ctrl+C.
timeout /t 5 /nobreak > nul
goto start