Skip to content

Commit d8fadc6

Browse files
authored
[QoS] Resolve an issue in the sequence where a referenced object removed and then the referencing object deleting and then re-adding (#2210)
- What I did Resolve an issue in the following scenario Suppose object a in table A references object b in table B. When a user is going to remove items in table A and B, the notifications can be received in the following order: The notification of removing object b Then the notification of removing object a And then the notification of re-adding object a The notification of re-adding object b. Object b can not be removed in the 1st step because it is still referenced by object a. In case the system is busy, the notification of removing a remains in m_toSync when the notification of re-adding it is coming, which results in both notifications being handled together and the reference to object b not being cleared. As a result, notification of removing b will never be handled and remain in m_toSync forever. Solution: Introduce a flag pending remove indicating whether an object is about to be removed but pending on some reference. pending remove is set once a DEL notification is received for an object with non-zero reference. When resolving references in step 3, a pending remove object will be skipped and the notification will remain in m_toSync. SET operation will not be carried out in case there is a pending remove flag on the object to be set. By doing so, when object a is re-added in step 3, it can not retrieve the dependent object b. And step 1 will be handled and drained successfully. - Why I did it Fix bug. - How I verified it Mock test and manually test (eg. config qos reload) Signed-off-by: Stephen Sun <[email protected]>
1 parent eaf7264 commit d8fadc6

File tree

8 files changed

+614
-75
lines changed

8 files changed

+614
-75
lines changed

orchagent/bufferorch.cpp

+14
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,11 @@ task_process_status BufferOrch::processBufferPool(KeyOpFieldsValuesTuple &tuple)
389389
{
390390
sai_object = (*(m_buffer_type_maps[map_type_name]))[object_name].m_saiObjectId;
391391
SWSS_LOG_DEBUG("found existing object:%s of type:%s", object_name.c_str(), map_type_name.c_str());
392+
if ((*(m_buffer_type_maps[map_type_name]))[object_name].m_pendingRemove && op == SET_COMMAND)
393+
{
394+
SWSS_LOG_NOTICE("Entry %s %s is pending remove, need retry", map_type_name.c_str(), object_name.c_str());
395+
return task_process_status::task_need_retry;
396+
}
392397
}
393398
SWSS_LOG_DEBUG("processing command:%s", op.c_str());
394399
if (object_name == "ingress_zero_pool")
@@ -565,6 +570,7 @@ task_process_status BufferOrch::processBufferPool(KeyOpFieldsValuesTuple &tuple)
565570
}
566571

567572
(*(m_buffer_type_maps[map_type_name]))[object_name].m_saiObjectId = sai_object;
573+
(*(m_buffer_type_maps[map_type_name]))[object_name].m_pendingRemove = false;
568574
// Here we take the PFC watchdog approach to update the COUNTERS_DB metadata (e.g., PFC_WD_DETECTION_TIME per queue)
569575
// at initialization (creation and registration phase)
570576
// Specifically, we push the buffer pool name to oid mapping upon the creation of the oid
@@ -579,6 +585,7 @@ task_process_status BufferOrch::processBufferPool(KeyOpFieldsValuesTuple &tuple)
579585
{
580586
auto hint = objectReferenceInfo(m_buffer_type_maps, map_type_name, object_name);
581587
SWSS_LOG_NOTICE("Can't remove object %s due to being referenced (%s)", object_name.c_str(), hint.c_str());
588+
(*(m_buffer_type_maps[map_type_name]))[object_name].m_pendingRemove = true;
582589

583590
return task_process_status::task_need_retry;
584591
}
@@ -647,6 +654,11 @@ task_process_status BufferOrch::processBufferProfile(KeyOpFieldsValuesTuple &tup
647654
{
648655
sai_object = (*(m_buffer_type_maps[map_type_name]))[object_name].m_saiObjectId;
649656
SWSS_LOG_DEBUG("found existing object:%s of type:%s", object_name.c_str(), map_type_name.c_str());
657+
if ((*(m_buffer_type_maps[map_type_name]))[object_name].m_pendingRemove && op == SET_COMMAND)
658+
{
659+
SWSS_LOG_NOTICE("Entry %s %s is pending remove, need retry", map_type_name.c_str(), object_name.c_str());
660+
return task_process_status::task_need_retry;
661+
}
650662
}
651663
SWSS_LOG_DEBUG("processing command:%s", op.c_str());
652664
if (op == SET_COMMAND)
@@ -787,6 +799,7 @@ task_process_status BufferOrch::processBufferProfile(KeyOpFieldsValuesTuple &tup
787799
}
788800
}
789801
(*(m_buffer_type_maps[map_type_name]))[object_name].m_saiObjectId = sai_object;
802+
(*(m_buffer_type_maps[map_type_name]))[object_name].m_pendingRemove = false;
790803
SWSS_LOG_NOTICE("Created buffer profile %s with type %s", object_name.c_str(), map_type_name.c_str());
791804
}
792805

@@ -799,6 +812,7 @@ task_process_status BufferOrch::processBufferProfile(KeyOpFieldsValuesTuple &tup
799812
{
800813
auto hint = objectReferenceInfo(m_buffer_type_maps, map_type_name, object_name);
801814
SWSS_LOG_NOTICE("Can't remove object %s due to being referenced (%s)", object_name.c_str(), hint.c_str());
815+
(*(m_buffer_type_maps[map_type_name]))[object_name].m_pendingRemove = true;
802816

803817
return task_process_status::task_need_retry;
804818
}

orchagent/orch.cpp

+5
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,11 @@ bool Orch::parseReference(type_map &type_maps, string &ref_in, const string &typ
357357
SWSS_LOG_INFO("map:%s does not contain object with name:%s\n", type_name.c_str(), ref_in.c_str());
358358
return false;
359359
}
360+
if (obj_it->second.m_pendingRemove)
361+
{
362+
SWSS_LOG_NOTICE("map:%s contains a pending removed object %s, skip\n", type_name.c_str(), ref_in.c_str());
363+
return false;
364+
}
360365
object_name = ref_in;
361366
SWSS_LOG_DEBUG("parsed: type_name:%s, object_name:%s", type_name.c_str(), object_name.c_str());
362367
return true;

orchagent/orch.h

+1
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ typedef struct
6666
// multiple objects being referenced are separated by ','
6767
std::map<std::string, std::string> m_objsReferencingByMe;
6868
sai_object_id_t m_saiObjectId;
69+
bool m_pendingRemove;
6970
} referenced_object;
7071

7172
typedef std::map<std::string, referenced_object> object_reference_map;

orchagent/qosorch.cpp

+14-72
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,11 @@ task_process_status QosMapHandler::processWorkItem(Consumer& consumer, KeyOpFiel
110110
if (QosOrch::getTypeMap()[qos_map_type_name]->find(qos_object_name) != QosOrch::getTypeMap()[qos_map_type_name]->end())
111111
{
112112
sai_object = (*(QosOrch::getTypeMap()[qos_map_type_name]))[qos_object_name].m_saiObjectId;
113+
if ((*(QosOrch::getTypeMap()[qos_map_type_name]))[qos_object_name].m_pendingRemove && op == SET_COMMAND)
114+
{
115+
SWSS_LOG_NOTICE("Entry %s %s is pending remove, need retry", qos_map_type_name.c_str(), qos_object_name.c_str());
116+
return task_process_status::task_need_retry;
117+
}
113118
}
114119
if (op == SET_COMMAND)
115120
{
@@ -138,6 +143,7 @@ task_process_status QosMapHandler::processWorkItem(Consumer& consumer, KeyOpFiel
138143
return task_process_status::task_failed;
139144
}
140145
(*(QosOrch::getTypeMap()[qos_map_type_name]))[qos_object_name].m_saiObjectId = sai_object;
146+
(*(QosOrch::getTypeMap()[qos_map_type_name]))[qos_object_name].m_pendingRemove = false;
141147
SWSS_LOG_NOTICE("Created [%s:%s]", qos_map_type_name.c_str(), qos_object_name.c_str());
142148
}
143149
freeAttribResources(attributes);
@@ -153,6 +159,7 @@ task_process_status QosMapHandler::processWorkItem(Consumer& consumer, KeyOpFiel
153159
{
154160
auto hint = gQosOrch->objectReferenceInfo(QosOrch::getTypeMap(), qos_map_type_name, qos_object_name);
155161
SWSS_LOG_NOTICE("Can't remove object %s due to being referenced (%s)", qos_object_name.c_str(), hint.c_str());
162+
(*(QosOrch::getTypeMap()[qos_map_type_name]))[qos_object_name].m_pendingRemove = true;
156163
return task_process_status::task_need_retry;
157164
}
158165
if (!removeQosItem(sai_object))
@@ -1121,6 +1128,11 @@ task_process_status QosOrch::handleSchedulerTable(Consumer& consumer, KeyOpField
11211128
SWSS_LOG_ERROR("Error sai_object must exist for key %s", qos_object_name.c_str());
11221129
return task_process_status::task_invalid_entry;
11231130
}
1131+
if ((*(m_qos_maps[qos_map_type_name]))[qos_object_name].m_pendingRemove && op == SET_COMMAND)
1132+
{
1133+
SWSS_LOG_NOTICE("Entry %s %s is pending remove, need retry", qos_map_type_name.c_str(), qos_object_name.c_str());
1134+
return task_process_status::task_need_retry;
1135+
}
11241136
}
11251137
if (op == SET_COMMAND)
11261138
{
@@ -1228,6 +1240,7 @@ task_process_status QosOrch::handleSchedulerTable(Consumer& consumer, KeyOpField
12281240
}
12291241
SWSS_LOG_NOTICE("Created [%s:%s]", qos_map_type_name.c_str(), qos_object_name.c_str());
12301242
(*(m_qos_maps[qos_map_type_name]))[qos_object_name].m_saiObjectId = sai_object;
1243+
(*(m_qos_maps[qos_map_type_name]))[qos_object_name].m_pendingRemove = false;
12311244
}
12321245
}
12331246
else if (op == DEL_COMMAND)
@@ -1241,6 +1254,7 @@ task_process_status QosOrch::handleSchedulerTable(Consumer& consumer, KeyOpField
12411254
{
12421255
auto hint = gQosOrch->objectReferenceInfo(QosOrch::getTypeMap(), qos_map_type_name, qos_object_name);
12431256
SWSS_LOG_NOTICE("Can't remove object %s due to being referenced (%s)", qos_object_name.c_str(), hint.c_str());
1257+
(*(m_qos_maps[qos_map_type_name]))[qos_object_name].m_pendingRemove = true;
12441258
return task_process_status::task_need_retry;
12451259
}
12461260
sai_status = sai_scheduler_api->remove_scheduler(sai_object);
@@ -1613,78 +1627,6 @@ task_process_status QosOrch::handleQueueTable(Consumer& consumer, KeyOpFieldsVal
16131627
return task_process_status::task_success;
16141628
}
16151629

1616-
bool QosOrch::applyMapToPort(Port &port, sai_attr_id_t attr_id, sai_object_id_t map_id)
1617-
{
1618-
SWSS_LOG_ENTER();
1619-
1620-
sai_attribute_t attr;
1621-
attr.id = attr_id;
1622-
attr.value.oid = map_id;
1623-
1624-
sai_status_t status = sai_port_api->set_port_attribute(port.m_port_id, &attr);
1625-
if (status != SAI_STATUS_SUCCESS)
1626-
{
1627-
SWSS_LOG_ERROR("Failed setting sai object:%" PRIx64 " for port:%s, status:%d", map_id, port.m_alias.c_str(), status);
1628-
task_process_status handle_status = handleSaiSetStatus(SAI_API_PORT, status);
1629-
if (handle_status != task_success)
1630-
{
1631-
return parseHandleSaiStatusFailure(handle_status);
1632-
}
1633-
}
1634-
return true;
1635-
}
1636-
1637-
task_process_status QosOrch::ResolveMapAndApplyToPort(
1638-
Port &port,
1639-
sai_port_attr_t port_attr,
1640-
string field_name,
1641-
KeyOpFieldsValuesTuple &tuple,
1642-
string op)
1643-
{
1644-
SWSS_LOG_ENTER();
1645-
1646-
sai_object_id_t sai_object = SAI_NULL_OBJECT_ID;
1647-
string object_name;
1648-
bool result;
1649-
ref_resolve_status resolve_result = resolveFieldRefValue(m_qos_maps, field_name,
1650-
qos_to_ref_table_map.at(field_name), tuple, sai_object, object_name);
1651-
if (ref_resolve_status::success == resolve_result)
1652-
{
1653-
if (op == SET_COMMAND)
1654-
{
1655-
result = applyMapToPort(port, port_attr, sai_object);
1656-
}
1657-
else if (op == DEL_COMMAND)
1658-
{
1659-
// NOTE: The map is un-bound from the port. But the map itself still exists.
1660-
result = applyMapToPort(port, port_attr, SAI_NULL_OBJECT_ID);
1661-
}
1662-
else
1663-
{
1664-
SWSS_LOG_ERROR("Unknown operation type %s", op.c_str());
1665-
return task_process_status::task_invalid_entry;
1666-
}
1667-
if (!result)
1668-
{
1669-
SWSS_LOG_ERROR("Failed setting field:%s to port:%s, line:%d", field_name.c_str(), port.m_alias.c_str(), __LINE__);
1670-
return task_process_status::task_failed;
1671-
}
1672-
SWSS_LOG_DEBUG("Applied field:%s to port:%s, line:%d", field_name.c_str(), port.m_alias.c_str(), __LINE__);
1673-
return task_process_status::task_success;
1674-
}
1675-
else if (resolve_result != ref_resolve_status::field_not_found)
1676-
{
1677-
if(ref_resolve_status::not_resolved == resolve_result)
1678-
{
1679-
SWSS_LOG_INFO("Missing or invalid %s reference", field_name.c_str());
1680-
return task_process_status::task_need_retry;
1681-
}
1682-
SWSS_LOG_ERROR("Resolving %s reference failed", field_name.c_str());
1683-
return task_process_status::task_failed;
1684-
}
1685-
return task_process_status::task_success;
1686-
}
1687-
16881630
task_process_status QosOrch::handlePortQosMapTable(Consumer& consumer, KeyOpFieldsValuesTuple &tuple)
16891631
{
16901632
SWSS_LOG_ENTER();

orchagent/qosorch.h

-3
Original file line numberDiff line numberDiff line change
@@ -180,11 +180,8 @@ class QosOrch : public Orch
180180

181181
sai_object_id_t getSchedulerGroup(const Port &port, const sai_object_id_t queue_id);
182182

183-
bool applyMapToPort(Port &port, sai_attr_id_t attr_id, sai_object_id_t sai_dscp_to_tc_map);
184183
bool applySchedulerToQueueSchedulerGroup(Port &port, size_t queue_ind, sai_object_id_t scheduler_profile_id);
185184
bool applyWredProfileToQueue(Port &port, size_t queue_ind, sai_object_id_t sai_wred_profile);
186-
task_process_status ResolveMapAndApplyToPort(Port &port,sai_port_attr_t port_attr,
187-
string field_name, KeyOpFieldsValuesTuple &tuple, string op);
188185

189186
private:
190187
qos_table_handler_map m_qos_handler_map;

tests/mock_tests/Makefile.am

+1
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ tests_SOURCES = aclorch_ut.cpp \
2525
portsorch_ut.cpp \
2626
routeorch_ut.cpp \
2727
qosorch_ut.cpp \
28+
bufferorch_ut.cpp \
2829
saispy_ut.cpp \
2930
consumer_ut.cpp \
3031
ut_saihelper.cpp \

0 commit comments

Comments
 (0)