Skip to content

Commit 368dd2f

Browse files
authored
Safely pass Prometheus key deletion errors since not every setup will use it (#585)
* Encased Prometheus metric deletion calls in utility function to notify about failures * Added function to inspect contents of the Prometheus gauges and counters that have been set up
1 parent 2f8fd7d commit 368dd2f

File tree

2 files changed

+91
-17
lines changed

2 files changed

+91
-17
lines changed

src/murfey/server/api/__init__.py

Lines changed: 70 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
Proposal,
2323
)
2424
from PIL import Image
25+
from prometheus_client import Counter, Gauge
2526
from pydantic import BaseModel
2627
from sqlalchemy import func
2728
from sqlalchemy.exc import OperationalError
@@ -50,7 +51,7 @@
5051
from murfey.server.api.spa import _cryolo_model_path
5152
from murfey.server.gain import Camera, prepare_eer_gain, prepare_gain
5253
from murfey.server.murfey_db import murfey_db
53-
from murfey.util import secure_path
54+
from murfey.util import safe_run, secure_path
5455
from murfey.util.config import MachineConfig, from_file, settings
5556
from murfey.util.db import (
5657
AutoProcProgram,
@@ -1616,33 +1617,59 @@ def remove_session_by_id(session_id: MurfeySessionID, db=murfey_db):
16161617
sessions_for_visit = db.exec(
16171618
select(Session).where(Session.visit == session.visit)
16181619
).all()
1620+
# Don't remove prometheus metrics if there are other sessions using them
16191621
if len(sessions_for_visit) == 1:
1620-
# Don't remove prometheus metrics if there are other sessions using them
1621-
try:
1622-
prom.monitoring_switch.remove(session.visit)
1623-
except KeyError:
1624-
pass
1622+
safe_run(
1623+
prom.monitoring_switch.remove,
1624+
args=(session.visit,),
1625+
label="monitoring_switch",
1626+
)
16251627
rsync_instances = db.exec(
16261628
select(RsyncInstance).where(RsyncInstance.session_id == session_id)
16271629
).all()
16281630
for ri in rsync_instances:
1629-
prom.seen_files.remove(ri.source, session.visit)
1630-
prom.transferred_files.remove(ri.source, session.visit)
1631-
prom.transferred_files_bytes.remove(ri.source, session.visit)
1632-
prom.seen_data_files.remove(ri.source, session.visit)
1633-
prom.transferred_data_files.remove(ri.source, session.visit)
1634-
prom.transferred_data_files_bytes.remove(ri.source, session.visit)
1631+
safe_run(
1632+
prom.seen_files.remove,
1633+
args=(ri.source, session.visit),
1634+
label="seen_files",
1635+
)
1636+
safe_run(
1637+
prom.transferred_files.remove,
1638+
args=(ri.source, session.visit),
1639+
label="transferred_files",
1640+
)
1641+
safe_run(
1642+
prom.transferred_files_bytes.remove,
1643+
args=(ri.source, session.visit),
1644+
label="transferred_files_bytes",
1645+
)
1646+
safe_run(
1647+
prom.seen_data_files.remove,
1648+
args=(ri.source, session.visit),
1649+
label="seen_data_files",
1650+
)
1651+
safe_run(
1652+
prom.transferred_data_files.remove,
1653+
args=(ri.source, session.visit),
1654+
label="transferred_data_files",
1655+
)
1656+
safe_run(
1657+
prom.transferred_data_files_bytes.remove,
1658+
args=(ri.source, session.visit),
1659+
label="transferred_data_file_bytes",
1660+
)
16351661
collected_ids = db.exec(
16361662
select(DataCollectionGroup, DataCollection, ProcessingJob)
16371663
.where(DataCollectionGroup.session_id == session_id)
16381664
.where(DataCollection.dcg_id == DataCollectionGroup.id)
16391665
.where(ProcessingJob.dc_id == DataCollection.id)
16401666
).all()
16411667
for c in collected_ids:
1642-
try:
1643-
prom.preprocessed_movies.remove(c[2].id)
1644-
except KeyError:
1645-
continue
1668+
safe_run(
1669+
prom.preprocessed_movies.remove,
1670+
args=(c[2].id,),
1671+
label="preprocessed_movies",
1672+
)
16461673
db.delete(session)
16471674
db.commit()
16481675
return
@@ -1954,3 +1981,30 @@ def update_current_gain_ref(
19541981
session.current_gain_ref = new_gain_ref.path
19551982
db.add(session)
19561983
db.commit()
1984+
1985+
1986+
@router.get("/prometheus/{metric_name}")
def inspect_prometheus_metrics(
    metric_name: str,
):
    """
    Debugging endpoint: report what is currently stored in one of the
    Prometheus counters or gauges defined as attributes of the 'prom' module.

    The attribute called 'metric_name' is looked up on 'prom'. If it is missing,
    or is not a Counter or Gauge, a LookupError is raised.

    Returns a dictionary in one of two shapes:
      * If the metric has a '_metrics' mapping (presumably one child per
        label combination), one entry per child keyed by its enumeration index.
        Each entry maps the metric's label names to that child's label values
        and adds a "value" key holding the child's current value.
      * Otherwise, a single-entry dictionary {"value": <current value>}.
    """

    # Look up the requested attribute; anything other than a Counter or Gauge
    # (including a missing attribute, which yields None) is rejected
    candidate = getattr(prom, metric_name, None)
    if not isinstance(candidate, (Counter, Gauge)):
        raise LookupError("No matching metric was found")

    # No child mapping: report the metric's own value
    if not hasattr(candidate, "_metrics"):
        return {"value": candidate._value.get()}

    # One entry per child: label names paired with label values, plus the value
    return {
        index: {
            **dict(zip(candidate._labelnames, label_values)),
            "value": child._value.get(),
        }
        for index, (label_values, child) in enumerate(candidate._metrics.items())
    }

src/murfey/util/__init__.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from pathlib import Path
55
from queue import Queue
66
from threading import Thread
7-
from typing import Optional
7+
from typing import Any, Callable, Optional
88
from uuid import uuid4
99

1010
from werkzeug.utils import secure_filename
@@ -132,3 +132,23 @@ def filter(self, record: logging.LogRecord) -> bool:
132132
if "." not in logger_name:
133133
return False
134134
logger_name = logger_name.rsplit(".", maxsplit=1)[0]
135+
136+
137+
def safe_run(
138+
func: Callable,
139+
args: list | tuple = [],
140+
kwargs: dict[str, Any] = {},
141+
label: str = "",
142+
):
143+
"""
144+
A wrapper to encase individual functions in try-except blocks so that a warning
145+
is raised if the function fails, but the process continues as normal otherwise.
146+
"""
147+
try:
148+
return func(*args, **kwargs)
149+
except Exception:
150+
logger.warning(
151+
f"Function {func.__name__!r} failed to run for object {label!r}",
152+
exc_info=True,
153+
)
154+
return None

0 commit comments

Comments
 (0)