Skip to content

Commit f63b4c9

Browse files
shijujose4mchehab
authored and committed
rasdaemon: Add support for the CXL memory module events
Add support to log and record the CXL memory module events. Signed-off-by: Shiju Jose <[email protected]> Signed-off-by: Mauro Carvalho Chehab <[email protected]>
1 parent 9a2f618 commit f63b4c9

File tree

8 files changed

+375
-0
lines changed

8 files changed

+375
-0
lines changed

ras-cxl-handler.c

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1016,3 +1016,159 @@ int ras_cxl_dram_event_handler(struct trace_seq *s,
10161016

10171017
return 0;
10181018
}
1019+
1020+
/*
1021+
* Memory Module Event Record - MMER
1022+
*
1023+
* CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45
1024+
*/
1025+
/*
 * Text for each Memory Module Event Record "event_type" value.
 * The handler passes this table, its size and the raw event_type field
 * to get_cxl_type_str() to obtain the string to print.
 */
static const char *cxl_dev_evt_type[] = {
	[0] = "Health Status Change",
	[1] = "Media Status Change",
	[2] = "Life Used Change",
	[3] = "Temperature Change",
	[4] = "Data Path Error",
	[5] = "LSA Error",
};
1033+
1034+
/*
1035+
* Device Health Information - DHI
1036+
*
1037+
* CXL rev 3.0 section 8.2.9.8.3.1; Table 8-100
1038+
*/
1039+
#define CXL_DHI_HS_MAINTENANCE_NEEDED BIT(0)
1040+
#define CXL_DHI_HS_PERFORMANCE_DEGRADED BIT(1)
1041+
#define CXL_DHI_HS_HW_REPLACEMENT_NEEDED BIT(2)
1042+
1043+
static const struct cxl_event_flags cxl_health_status[] = {
1044+
{ .bit = CXL_DHI_HS_MAINTENANCE_NEEDED, .flag = "MAINTENANCE_NEEDED" },
1045+
{ .bit = CXL_DHI_HS_PERFORMANCE_DEGRADED, .flag = "PERFORMANCE_DEGRADED" },
1046+
{ .bit = CXL_DHI_HS_HW_REPLACEMENT_NEEDED, .flag = "REPLACEMENT_NEEDED" },
1047+
};
1048+
1049+
/*
 * Text for each "media_status" value.  The handler passes this table,
 * its size and the raw media_status field to get_cxl_type_str().
 */
static const char *cxl_media_status[] = {
	[0] = "Normal",
	[1] = "Not Ready",
	[2] = "Write Persistency Lost",
	[3] = "All Data Lost",
	[4] = "Write Persistency Loss in the Event of Power Loss",
	[5] = "Write Persistency Loss in Event of Shutdown",
	[6] = "Write Persistency Loss Imminent",
	[7] = "All Data Loss in Event of Power Loss",
	[8] = "All Data loss in the Event of Shutdown",
	[9] = "All Data Loss Imminent",
};
1061+
1062+
/*
 * Names for the two-bit sub-fields of "add_status" (life used, device
 * temperature).  Only three values are named; any other value is left to
 * get_cxl_type_str() to handle.
 */
static const char *cxl_two_bit_status[] = {
	[0] = "Normal",
	[1] = "Warning",
	[2] = "Critical",
};
1067+
1068+
/*
 * Names for the one-bit sub-fields of "add_status" (corrected volatile /
 * persistent error count status).
 */
static const char *cxl_one_bit_status[] = {
	[0] = "Normal",
	[1] = "Warning",
};
1072+
1073+
/*
 * Extract the sub-fields packed into the "additional status" byte:
 *   bits 1:0 - life used status
 *   bits 3:2 - device temperature status
 *   bit  4   - corrected volatile error count status
 *   bit  5   - corrected persistent error count status
 *
 * The argument is parenthesized so that an expression such as (a | b) is
 * masked as a whole; '&' binds tighter than '|' and would otherwise apply
 * to the last operand only.
 */
#define CXL_DHI_AS_LIFE_USED(as)	((as) & 0x3)
#define CXL_DHI_AS_DEV_TEMP(as)		(((as) & 0xC) >> 2)
#define CXL_DHI_AS_COR_VOL_ERR_CNT(as)	(((as) & 0x10) >> 4)
#define CXL_DHI_AS_COR_PER_ERR_CNT(as)	(((as) & 0x20) >> 5)
1077+
1078+
int ras_cxl_memory_module_event_handler(struct trace_seq *s,
1079+
struct tep_record *record,
1080+
struct tep_event *event, void *context)
1081+
{
1082+
unsigned long long val;
1083+
struct ras_events *ras = context;
1084+
struct ras_cxl_memory_module_event ev;
1085+
1086+
memset(&ev, 0, sizeof(ev));
1087+
if (handle_ras_cxl_common_hdr(s, record, event, context, &ev.hdr) < 0)
1088+
return -1;
1089+
1090+
if (tep_get_field_val(s, event, "event_type", record, &val, 1) < 0)
1091+
return -1;
1092+
ev.event_type = val;
1093+
if (trace_seq_printf(s, "event_type:%s ", get_cxl_type_str(cxl_dev_evt_type,
1094+
ARRAY_SIZE(cxl_dev_evt_type), ev.event_type)) <= 0)
1095+
return -1;
1096+
1097+
if (tep_get_field_val(s, event, "health_status", record, &val, 1) < 0)
1098+
return -1;
1099+
ev.health_status = val;
1100+
if (trace_seq_printf(s, "health_status:") <= 0)
1101+
return -1;
1102+
if (decode_cxl_event_flags(s, ev.health_status, cxl_health_status,
1103+
ARRAY_SIZE(cxl_health_status)) < 0)
1104+
return -1;
1105+
1106+
if (tep_get_field_val(s, event, "media_status", record, &val, 1) < 0)
1107+
return -1;
1108+
ev.media_status = val;
1109+
if (trace_seq_printf(s, "media_status:%s ", get_cxl_type_str(cxl_media_status,
1110+
ARRAY_SIZE(cxl_media_status), ev.media_status)) <= 0)
1111+
return -1;
1112+
1113+
if (tep_get_field_val(s, event, "add_status", record, &val, 1) < 0)
1114+
return -1;
1115+
ev.add_status = val;
1116+
if (trace_seq_printf(s, "as_life_used:%s ", get_cxl_type_str(cxl_two_bit_status,
1117+
ARRAY_SIZE(cxl_two_bit_status),
1118+
CXL_DHI_AS_LIFE_USED(ev.add_status))) <= 0)
1119+
return -1;
1120+
if (trace_seq_printf(s, "as_dev_temp:%s ", get_cxl_type_str(cxl_two_bit_status,
1121+
ARRAY_SIZE(cxl_two_bit_status),
1122+
CXL_DHI_AS_DEV_TEMP(ev.add_status))) <= 0)
1123+
return -1;
1124+
if (trace_seq_printf(s, "as_cor_vol_err_cnt:%s ", get_cxl_type_str(cxl_one_bit_status,
1125+
ARRAY_SIZE(cxl_one_bit_status),
1126+
CXL_DHI_AS_COR_VOL_ERR_CNT(ev.add_status))) <= 0)
1127+
return -1;
1128+
if (trace_seq_printf(s, "as_cor_per_err_cnt:%s ", get_cxl_type_str(cxl_one_bit_status,
1129+
ARRAY_SIZE(cxl_one_bit_status),
1130+
CXL_DHI_AS_COR_PER_ERR_CNT(ev.add_status))) <= 0)
1131+
return -1;
1132+
1133+
if (tep_get_field_val(s, event, "life_used", record, &val, 1) < 0)
1134+
return -1;
1135+
ev.life_used = val;
1136+
if (trace_seq_printf(s, "life_used:%u ", ev.life_used) <= 0)
1137+
return -1;
1138+
1139+
if (tep_get_field_val(s, event, "device_temp", record, &val, 1) < 0)
1140+
return -1;
1141+
ev.device_temp = val;
1142+
if (trace_seq_printf(s, "device_temp:%u ", ev.device_temp) <= 0)
1143+
return -1;
1144+
1145+
if (tep_get_field_val(s, event, "dirty_shutdown_cnt", record, &val, 1) < 0)
1146+
return -1;
1147+
ev.dirty_shutdown_cnt = val;
1148+
if (trace_seq_printf(s, "dirty_shutdown_cnt:%u ", ev.dirty_shutdown_cnt) <= 0)
1149+
return -1;
1150+
1151+
if (tep_get_field_val(s, event, "cor_vol_err_cnt", record, &val, 1) < 0)
1152+
return -1;
1153+
ev.cor_vol_err_cnt = val;
1154+
if (trace_seq_printf(s, "cor_vol_err_cnt:%u ", ev.cor_vol_err_cnt) <= 0)
1155+
return -1;
1156+
1157+
if (tep_get_field_val(s, event, "cor_per_err_cnt", record, &val, 1) < 0)
1158+
return -1;
1159+
ev.cor_per_err_cnt = val;
1160+
if (trace_seq_printf(s, "cor_per_err_cnt:%u ", ev.cor_per_err_cnt) <= 0)
1161+
return -1;
1162+
1163+
/* Insert data into the SGBD */
1164+
#ifdef HAVE_SQLITE3
1165+
ras_store_cxl_memory_module_event(ras, &ev);
1166+
#endif
1167+
1168+
#ifdef HAVE_ABRT_REPORT
1169+
/* Report event to ABRT */
1170+
ras_report_cxl_memory_module_event(ras, &ev);
1171+
#endif
1172+
1173+
return 0;
1174+
}

ras-cxl-handler.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,4 +41,7 @@ int ras_cxl_general_media_event_handler(struct trace_seq *s,
4141
int ras_cxl_dram_event_handler(struct trace_seq *s,
4242
struct tep_record *record,
4343
struct tep_event *event, void *context);
44+
int ras_cxl_memory_module_event_handler(struct trace_seq *s,
45+
struct tep_record *record,
46+
struct tep_event *event, void *context);
4447
#endif

ras-events.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,7 @@ int toggle_ras_mc_event(int enable)
252252
rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_generic_event", enable);
253253
rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_general_media", enable);
254254
rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_dram", enable);
255+
rc |= __toggle_ras_mc_event(ras, "cxl", "cxl_memory_module", enable);
255256
#endif
256257

257258
free_ras:
@@ -1081,6 +1082,14 @@ int handle_ras_events(int record_events)
10811082
else
10821083
log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
10831084
"cxl", "cxl_dram");
1085+
1086+
rc = add_event_handler(ras, pevent, page_size, "cxl", "cxl_memory_module",
1087+
ras_cxl_memory_module_event_handler, NULL, CXL_MEMORY_MODULE_EVENT);
1088+
if (!rc)
1089+
num_events++;
1090+
else
1091+
log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
1092+
"cxl", "memory_module");
10841093
#endif
10851094

10861095
if (!num_events) {

ras-events.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ enum {
4646
CXL_GENERIC_EVENT,
4747
CXL_GENERAL_MEDIA_EVENT,
4848
CXL_DRAM_EVENT,
49+
CXL_MEMORY_MODULE_EVENT,
4950
NR_EVENTS
5051
};
5152

ras-record.c

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -992,6 +992,74 @@ int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *
992992

993993
return rc;
994994
}
995+
996+
/*
997+
* Table and functions to handle cxl:cxl_memory_module_event
998+
*/
999+
/*
 * Column layout of the cxl_memory_module_event table.
 *
 * NOTE(review): ras_store_cxl_memory_module_event() binds indexes 13..21,
 * which line up with event_type..add_status here when "id" is counted as
 * position 0 - presumably this order has to stay in sync with those binds;
 * confirm against ras_mc_prepare_stmt().
 */
static const struct db_fields cxl_memory_module_event_fields[] = {
	{ .name = "id",			.type = "INTEGER PRIMARY KEY" },
	{ .name = "timestamp",		.type = "TEXT" },
	{ .name = "memdev",		.type = "TEXT" },
	{ .name = "host",		.type = "TEXT" },
	{ .name = "serial",		.type = "INTEGER" },
	{ .name = "log_type",		.type = "TEXT" },
	{ .name = "hdr_uuid",		.type = "TEXT" },
	{ .name = "hdr_flags",		.type = "INTEGER" },
	{ .name = "hdr_handle",		.type = "INTEGER" },
	{ .name = "hdr_related_handle",	.type = "INTEGER" },
	{ .name = "hdr_ts",		.type = "TEXT" },
	{ .name = "hdr_length",		.type = "INTEGER" },
	{ .name = "hdr_maint_op_class",	.type = "INTEGER" },
	{ .name = "event_type",		.type = "INTEGER" },
	{ .name = "health_status",	.type = "INTEGER" },
	{ .name = "media_status",	.type = "INTEGER" },
	{ .name = "life_used",		.type = "INTEGER" },
	{ .name = "dirty_shutdown_cnt",	.type = "INTEGER" },
	{ .name = "cor_vol_err_cnt",	.type = "INTEGER" },
	{ .name = "cor_per_err_cnt",	.type = "INTEGER" },
	{ .name = "device_temp",	.type = "INTEGER" },
	{ .name = "add_status",		.type = "INTEGER" },
};
1023+
1024+
static const struct db_table_descriptor cxl_memory_module_event_tab = {
1025+
.name = "cxl_memory_module_event",
1026+
.fields = cxl_memory_module_event_fields,
1027+
.num_fields = ARRAY_SIZE(cxl_memory_module_event_fields),
1028+
};
1029+
1030+
int ras_store_cxl_memory_module_event(struct ras_events *ras, struct ras_cxl_memory_module_event *ev)
1031+
{
1032+
int rc;
1033+
struct sqlite3_priv *priv = ras->db_priv;
1034+
1035+
if (!priv || !priv->stmt_cxl_memory_module_event)
1036+
return 0;
1037+
log(TERM, LOG_INFO, "cxl_memory_module_event store: %p\n",
1038+
priv->stmt_cxl_memory_module_event);
1039+
1040+
ras_store_cxl_common_hdr(priv->stmt_cxl_memory_module_event, &ev->hdr);
1041+
sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 13, ev->event_type);
1042+
sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 14, ev->health_status);
1043+
sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 15, ev->media_status);
1044+
sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 16, ev->life_used);
1045+
sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 17, ev->dirty_shutdown_cnt);
1046+
sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 18, ev->cor_vol_err_cnt);
1047+
sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 19, ev->cor_per_err_cnt);
1048+
sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 20, ev->device_temp);
1049+
sqlite3_bind_int(priv->stmt_cxl_memory_module_event, 21, ev->add_status);
1050+
1051+
rc = sqlite3_step(priv->stmt_cxl_memory_module_event);
1052+
if (rc != SQLITE_OK && rc != SQLITE_DONE)
1053+
log(TERM, LOG_ERR,
1054+
"Failed to do stmt_cxl_memory_module_event step on sqlite: error = %d\n", rc);
1055+
rc = sqlite3_reset(priv->stmt_cxl_memory_module_event);
1056+
if (rc != SQLITE_OK && rc != SQLITE_DONE)
1057+
log(TERM, LOG_ERR,
1058+
"Failed reset stmt_cxl_memory_module_event on sqlite: error = %d\n", rc);
1059+
log(TERM, LOG_INFO, "register inserted at db\n");
1060+
1061+
return rc;
1062+
}
9951063
#endif
9961064

9971065
/*
@@ -1391,6 +1459,14 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras)
13911459
if (rc != SQLITE_OK)
13921460
goto error;
13931461
}
1462+
1463+
rc = ras_mc_create_table(priv, &cxl_memory_module_event_tab);
1464+
if (rc == SQLITE_OK) {
1465+
rc = ras_mc_prepare_stmt(priv, &priv->stmt_cxl_memory_module_event,
1466+
&cxl_memory_module_event_tab);
1467+
if (rc != SQLITE_OK)
1468+
goto error;
1469+
}
13941470
#endif
13951471

13961472
ras->db_priv = priv;
@@ -1568,6 +1644,14 @@ int ras_mc_event_closedb(unsigned int cpu, struct ras_events *ras)
15681644
"cpu %u: Failed to finalize cxl_dram_event sqlite: error = %d\n",
15691645
cpu, rc);
15701646
}
1647+
1648+
if (priv->stmt_cxl_memory_module_event) {
1649+
rc = sqlite3_finalize(priv->stmt_cxl_memory_module_event);
1650+
if (rc != SQLITE_OK)
1651+
log(TERM, LOG_ERR,
1652+
"cpu %u: Failed to finalize stmt_cxl_memory_module_event sqlite: error = %d\n",
1653+
cpu, rc);
1654+
}
15711655
#endif
15721656

15731657
rc = sqlite3_close_v2(db);

ras-record.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,19 @@ struct ras_cxl_dram_event {
218218
uint16_t validity_flags;
219219
};
220220

221+
struct ras_cxl_memory_module_event {
222+
struct ras_cxl_event_common_hdr hdr;
223+
uint8_t event_type;
224+
uint8_t health_status;
225+
uint8_t media_status;
226+
uint8_t life_used;
227+
uint32_t dirty_shutdown_cnt;
228+
uint32_t cor_vol_err_cnt;
229+
uint32_t cor_per_err_cnt;
230+
int16_t device_temp;
231+
uint8_t add_status;
232+
};
233+
221234
struct ras_mc_event;
222235
struct ras_aer_event;
223236
struct ras_extlog_event;
@@ -234,6 +247,7 @@ struct ras_cxl_overflow_event;
234247
struct ras_cxl_generic_event;
235248
struct ras_cxl_general_media_event;
236249
struct ras_cxl_dram_event;
250+
struct ras_cxl_memory_module_event;
237251

238252
#ifdef HAVE_SQLITE3
239253

@@ -274,6 +288,7 @@ struct sqlite3_priv {
274288
sqlite3_stmt *stmt_cxl_generic_event;
275289
sqlite3_stmt *stmt_cxl_general_media_event;
276290
sqlite3_stmt *stmt_cxl_dram_event;
291+
sqlite3_stmt *stmt_cxl_memory_module_event;
277292
#endif
278293
};
279294

@@ -309,6 +324,7 @@ int ras_store_cxl_overflow_event(struct ras_events *ras, struct ras_cxl_overflow
309324
int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev);
310325
int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev);
311326
int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev);
327+
int ras_store_cxl_memory_module_event(struct ras_events *ras, struct ras_cxl_memory_module_event *ev);
312328

313329
#else
314330
static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; };
@@ -329,6 +345,7 @@ static inline int ras_store_cxl_overflow_event(struct ras_events *ras, struct ra
329345
static inline int ras_store_cxl_generic_event(struct ras_events *ras, struct ras_cxl_generic_event *ev) { return 0; };
330346
static inline int ras_store_cxl_general_media_event(struct ras_events *ras, struct ras_cxl_general_media_event *ev) { return 0; };
331347
static inline int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *ev) { return 0; };
348+
/* Stub used when HAVE_SQLITE3 is not defined: stores nothing, returns 0. */
static inline int ras_store_cxl_memory_module_event(struct ras_events *ras,
						    struct ras_cxl_memory_module_event *ev)
{
	return 0;
}
332349

333350
#endif
334351

0 commit comments

Comments
 (0)