Skip to content

Commit 915e5f1

Browse files
authored
feat(coordinator): Adding a builtin rule for checking pod status (#4553)
As titled.
1 parent f4b8fc8 commit 915e5f1

File tree

6 files changed

+92
-1
lines changed

6 files changed

+92
-1
lines changed

charts/graphscope-store/templates/portal/statefulset.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,14 @@ spec:
107107
value: {{ .Values.frontend.service.gremlinPort | quote }}
108108
- name: GROOT_CYPHER_PORT
109109
value: {{ .Values.frontend.service.cypherPort | quote }}
110+
- name: GROOT_FRONTEND_POD_SUFFIX
111+
value: {{ printf "%s-frontend" (include "graphscope-store.name" .) }}
112+
- name: GROOT_COORDINATOR_POD_SUFFIX
113+
value: {{ printf "%s-coordinator" (include "graphscope-store.name" .) }}
114+
- name: GROOT_PORTAL_POD_SUFFIX
115+
value: {{ printf "%s-portal" (include "graphscope-store.name" .) }}
116+
- name: GROOT_STORE_POD_SUFFIX
117+
value: {{ printf "%s-store" (include "graphscope-store.name" .) }}
110118
- name: INSTANCE_NAME
111119
value: {{ .Release.Name | quote }}
112120
- name: NAMESPACE

coordinator/gscoordinator/flex/core/alert/builtin_rules.py

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,43 @@ def run_alert(self):
119119
if not available:
120120
alert_message = self.generate_alert_message("-", message)
121121
self.alert(alert_message)
122+
123+
class PodAvailableAlert(AlertRule):
    """Builtin alert rule that raises an alert when the pod check fails.

    The check itself is delegated to ``get_client_wrapper().pod_available()``;
    this class only turns a failed or raising check into an alert message.
    """

    def __init__(
        self,
        name,
        severity,
        metric_type,
        conditions_description,
        frequency,
        message_collector,
        enable=True,
    ):
        """Forward every argument unchanged to ``AlertRule.__init__``."""
        super().__init__(
            name,
            severity,
            metric_type,
            conditions_description,
            frequency,
            message_collector,
            enable,
        )

    def run_alert(self):
        """Run the pod availability check once and alert on failure.

        This function needs to handle exception by itself: any ``Exception``
        raised by the check is converted into an alert whose message carries
        the exception text, instead of being propagated to the caller.
        """
        try:
            available = get_client_wrapper().pod_available()
            # Only used when the check returns a falsy value without raising.
            message = "Pod unavailable: unknown reason"
        except Exception as e:
            available = False
            message = "Pod unavailable: {0}".format(str(e))

        # Both names are bound on every path that reaches this point.
        # Non-``Exception`` errors (e.g. KeyboardInterrupt) propagate as-is
        # rather than being masked by an UnboundLocalError raised from a
        # ``finally`` clause.
        if available:
            return

        # No single failing component is passed along as the alert target
        # here (the failing pod name, if any, is part of the message),
        # so the target is set to "-".
        alert_message = self.generate_alert_message("-", message)
        self.alert(alert_message)
122159

123160

124161
def init_builtin_alert_rules(message_collector: AlertMessageCollector):
@@ -144,7 +181,17 @@ def init_builtin_alert_rules(message_collector: AlertMessageCollector):
144181
conditions_description="g.V().limit(1) failed",
145182
frequency=5,
146183
message_collector=message_collector,
147-
enable=True,
184+
enable=False, # GremlinServiceAvailableAlert is disabled by default
148185
)
149186
alert_rules[gremlin_service_available.id] = gremlin_service_available
187+
pod_available = PodAvailableAlert(
188+
name="PodAvailable",
189+
severity="emergency",
190+
metric_type="service",
191+
conditions_description="pod not available",
192+
frequency=3,
193+
message_collector=message_collector,
194+
enable=True,
195+
)
196+
alert_rules[pod_available.id] = pod_available
150197
return alert_rules

coordinator/gscoordinator/flex/core/client_wrapper.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -368,6 +368,8 @@ def get_storage_usage(self) -> GetStorageUsageResponse:
368368
def gremlin_service_available(self) -> bool:
369369
return self._client.gremlin_service_available()
370370

371+
def pod_available(self) -> bool:
    """Return the pod availability reported by the wrapped client."""
    status = self._client.pod_available()
    return status
371373

372374
client_wrapper = None
373375

coordinator/gscoordinator/flex/core/config.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,11 @@ def str_to_bool(s):
109109
except Exception as e:
110110
logger.warn("Invalid base64-encoded string found, use original value: %s", str(e))
111111
GROOT_FRONTEND_POD_SUFFIX = os.environ.get("GROOT_FRONTEND_POD_SUFFIX", "graphscope-store-frontend")
112+
GROOT_STORE_POD_SUFFIX = os.environ.get("GROOT_STORE_POD_SUFFIX", "graphscope-store-store")
113+
GROOT_COORDINATOR_POD_SUFFIX = os.environ.get("GROOT_COORDINATOR_POD_SUFFIX", "graphscope-store-coordinator")
114+
GROOT_PORTAL_POD_SUFFIX = os.environ.get("GROOT_PORTAL_POD_SUFFIX", "graphscope-store-portal")
115+
116+
112117

113118
# dataloading service for groot
114119
STUDIO_WRAPPER_ENDPOINT = os.environ.get("STUDIO_WRAPPER_ENDPOINT", None)

coordinator/gscoordinator/flex/core/insight/graph.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,14 @@
2929

3030
from gscoordinator.flex.core.config import CLUSTER_TYPE
3131
from gscoordinator.flex.core.config import CREATION_TIME
32+
from gscoordinator.flex.core.config import GROOT_COORDINATOR_POD_SUFFIX
3233
from gscoordinator.flex.core.config import GROOT_CYPHER_PORT
3334
from gscoordinator.flex.core.config import GROOT_FRONTEND_POD_SUFFIX
3435
from gscoordinator.flex.core.config import GROOT_GREMLIN_PORT
3536
from gscoordinator.flex.core.config import GROOT_GRPC_PORT
3637
from gscoordinator.flex.core.config import GROOT_PASSWORD
38+
from gscoordinator.flex.core.config import GROOT_PORTAL_POD_SUFFIX
39+
from gscoordinator.flex.core.config import GROOT_STORE_POD_SUFFIX
3740
from gscoordinator.flex.core.config import GROOT_USERNAME
3841
from gscoordinator.flex.core.config import INSTANCE_NAME
3942
from gscoordinator.flex.core.config import NAMESPACE
@@ -126,6 +129,28 @@ def _fetch_endpoints_impl(self):
126129
self._endpoints["cypher_endpoint"] = cypher_endpoint
127130
logger.info(f"Update frontend endpoints: {str(endpoints)}")
128131

132+
def pod_available(self):
    """Check that the instance's component pods are in the ``Running`` phase.

    Outside of a ``KUBERNETES`` cluster type the check is skipped and
    ``True`` is returned. Otherwise every pod listed in ``NAMESPACE`` whose
    name starts with ``<INSTANCE_NAME>-<component suffix>-`` (frontend,
    coordinator, store, portal) must report phase ``Running``.

    Returns:
        True when no matching pod is found in a non-running phase.

    Raises:
        RuntimeError: if the namespace contains no pod at all, or if a
            matching pod is in any phase other than ``Running``.
    """
    if CLUSTER_TYPE != "KUBERNETES":
        return True

    component_suffixes = (
        GROOT_FRONTEND_POD_SUFFIX,
        GROOT_COORDINATOR_POD_SUFFIX,
        GROOT_STORE_POD_SUFFIX,
        GROOT_PORTAL_POD_SUFFIX,
    )
    # str.startswith accepts a tuple, so one call covers every component.
    watched_prefixes = tuple(
        "{0}-{1}-".format(INSTANCE_NAME, suffix) for suffix in component_suffixes
    )

    pods = self._core_api.list_namespaced_pod(NAMESPACE).items
    if len(pods) == 0:
        raise RuntimeError("No pod found in namespace {0}".format(NAMESPACE))

    for pod in pods:
        pod_name = pod.metadata.name
        if not pod_name.startswith(watched_prefixes):
            # Pods that do not belong to this instance's components are ignored.
            continue
        if pod.status.phase != "Running":
            raise RuntimeError(
                "Pod {0} is not running, {1}".format(pod_name, pod.status.phase)
            )
    return True
152+
153+
129154
def __del__(self):
130155
self._conn.close()
131156

coordinator/gscoordinator/flex/core/insight/groot.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,10 @@ def gremlin_service_available(self) -> bool:
309309
except: # noqa: E722
310310
pass
311311
return True
312+
313+
def pod_available(self) -> bool:
    """Return the pod availability reported by the underlying graph object."""
    status = self._graph.pod_available()
    return status
315+
312316

313317

314318
def init_groot_client(config: Config):

0 commit comments

Comments
 (0)