
Commit 908e0c6

[Dynamic buffer calc] Bug fix: Remove PGs from an administratively down port. (#1652)
Remove PGs from an administratively down port.

- Introduce a new state, PORT_ADMIN_DOWN, which represents a port that is administratively down.
- Remove all PGs when the port is shut down and re-add all configured PGs when the port is brought up.
- Only record the new value, without touching BUFFER_PG_TABLE, if any of the following events arrive while a port is administratively down: the port's MTU, speed, or cable length is updated, a new PG is added to the port, or an existing PG is removed from the port.
- Optimize the port event handling flow, since refreshPriorityGroupsForPort should be called only once even when more than one field is updated.
- Optimize the Lua plugin that calculates the buffer pool size accordingly.

Signed-off-by: Stephen Sun [email protected]

How I verified it: run regression and vs tests.
1 parent 1382f7a commit 908e0c6
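
One detail worth calling out from the reworked Lua plugin: the number of priorities or queues an item covers is now derived purely from the trailing index range in the BUFFER_PG/BUFFER_QUEUE key, instead of being gated on the port's admin status. Below is a minimal standalone sketch of that range arithmetic, mirroring the parsing in iterate_all_items in the diff further down (the helper name count_items_in_key is illustrative, not part of the commit):

-- Sketch: count how many priorities/queues a BUFFER_PG/BUFFER_QUEUE key covers
local function count_items_in_key(key)
    local range = string.match(key, "Ethernet%d+:([^%s]+)$")
    if string.len(range) == 1 then
        return 1    -- single index, e.g. '...:Ethernet0:3'
    end
    -- inclusive range of single-digit indices, e.g. '0-2' -> 3, '3-4' -> 2
    return 1 + tonumber(string.sub(range, -1)) - tonumber(string.sub(range, 1, 1))
end

print(count_items_in_key('BUFFER_QUEUE_TABLE:Ethernet0:0-2'))  -- 3 queues
print(count_items_in_key('BUFFER_PG_TABLE:Ethernet0:3-4'))     -- 2 priorities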

File tree

4 files changed: +373 −147 lines


cfgmgr/buffer_pool_mellanox.lua (+34 −39)
@@ -12,7 +12,7 @@ local lossypg_400g = 0
 local result = {}
 local profiles = {}
 
-local count_up_port = 0
+local total_port = 0
 
 local mgmt_pool_size = 256 * 1024
 local egress_mirror_headroom = 10 * 1024
@@ -30,56 +30,46 @@ end
 
 local function iterate_all_items(all_items)
     table.sort(all_items)
-    local prev_port = "None"
     local port
-    local is_up
     local fvpairs
-    local status
-    local admin_down_ports = 0
     for i = 1, #all_items, 1 do
-        -- Check whether the port on which pg or tc hosts is admin down
+        -- Count the number of priorities or queues in each BUFFER_PG or BUFFER_QUEUE item
+        -- For example, there are:
+        --     3 queues in 'BUFFER_QUEUE_TABLE:Ethernet0:0-2'
+        --     2 priorities in 'BUFFER_PG_TABLE:Ethernet0:3-4'
         port = string.match(all_items[i], "Ethernet%d+")
         if port ~= nil then
-            if prev_port ~= port then
-                status = redis.call('HGET', 'PORT_TABLE:'..port, 'admin_status')
-                prev_port = port
-                if status == "down" then
-                    is_up = false
-                else
-                    is_up = true
-                end
+            local range = string.match(all_items[i], "Ethernet%d+:([^%s]+)$")
+            local profile = redis.call('HGET', all_items[i], 'profile')
+            local index = find_profile(profile)
+            if index == 0 then
+                -- Indicate an error in case the referenced profile hasn't been inserted or has been removed
+                -- It's possible when the orchagent is busy
+                -- The buffermgrd will take care of it and retry later
+                return 1
             end
-            if is_up == true then
-                local range = string.match(all_items[i], "Ethernet%d+:([^%s]+)$")
-                local profile = redis.call('HGET', all_items[i], 'profile')
-                local index = find_profile(profile)
-                local size
-                if string.len(range) == 1 then
-                    size = 1
-                else
-                    size = 1 + tonumber(string.sub(range, -1)) - tonumber(string.sub(range, 1, 1))
-                end
-                profiles[index][2] = profiles[index][2] + size
-                local speed = redis.call('HGET', 'PORT_TABLE:'..port, 'speed')
-                if speed == '400000' and profile == '[BUFFER_PROFILE_TABLE:ingress_lossy_profile]' then
-                    lossypg_400g = lossypg_400g + size
-                end
+            local size
+            if string.len(range) == 1 then
+                size = 1
+            else
+                size = 1 + tonumber(string.sub(range, -1)) - tonumber(string.sub(range, 1, 1))
+            end
+            profiles[index][2] = profiles[index][2] + size
+            local speed = redis.call('HGET', 'PORT_TABLE:'..port, 'speed')
+            if speed == '400000' and profile == '[BUFFER_PROFILE_TABLE:ingress_lossy_profile]' then
+                lossypg_400g = lossypg_400g + size
             end
         end
     end
+    return 0
 end
 
 -- Connect to CONFIG_DB
 redis.call('SELECT', config_db)
 
 local ports_table = redis.call('KEYS', 'PORT|*')
 
-for i = 1, #ports_table do
-    local status = redis.call('HGET', ports_table[i], 'admin_status')
-    if status == "up" then
-        count_up_port = count_up_port + 1
-    end
-end
+total_port = #ports_table
 
 local egress_lossless_pool_size = redis.call('HGET', 'BUFFER_POOL|egress_lossless_pool', 'size')
 
@@ -114,8 +104,12 @@ end
 local all_pgs = redis.call('KEYS', 'BUFFER_PG*')
 local all_tcs = redis.call('KEYS', 'BUFFER_QUEUE*')
 
-iterate_all_items(all_pgs)
-iterate_all_items(all_tcs)
+local fail_count = 0
+fail_count = fail_count + iterate_all_items(all_pgs)
+fail_count = fail_count + iterate_all_items(all_tcs)
+if fail_count > 0 then
+    return {}
+end
 
 local statistics = {}
 
@@ -130,7 +124,7 @@ for i = 1, #profiles, 1 do
         size = size + lossypg_reserved
     end
     if profiles[i][1] == "BUFFER_PROFILE_TABLE:egress_lossy_profile" then
-        profiles[i][2] = count_up_port
+        profiles[i][2] = total_port
     end
     if size ~= 0 then
         if shp_enabled and shp_size == 0 then
@@ -152,7 +146,7 @@ local lossypg_extra_for_400g = (lossypg_reserved_400g - lossypg_reserved) * loss
 accumulative_occupied_buffer = accumulative_occupied_buffer + lossypg_extra_for_400g
 
 -- Accumulate sizes for egress mirror and management pool
-local accumulative_egress_mirror_overhead = count_up_port * egress_mirror_headroom
+local accumulative_egress_mirror_overhead = total_port * egress_mirror_headroom
 accumulative_occupied_buffer = accumulative_occupied_buffer + accumulative_egress_mirror_overhead + mgmt_pool_size
 
 -- Fetch mmu_size
@@ -240,5 +234,6 @@ table.insert(result, "debug:egress_mirror:" .. accumulative_egress_mirror_overhe
 table.insert(result, "debug:shp_enabled:" .. tostring(shp_enabled))
 table.insert(result, "debug:shp_size:" .. shp_size)
 table.insert(result, "debug:accumulative xoff:" .. accumulative_xoff)
+table.insert(result, "debug:total port:" .. total_port)
 
 return result
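
As a rough sanity check of the two lines that switched from count_up_port to total_port: every configured port now contributes egress mirror headroom to the occupied-buffer estimate, whether or not it is admin up. A tiny back-of-the-envelope sketch using the constants from the script (the 32-port count is only an example, not from the commit):

-- Hypothetical 32-port system; in the script total_port = #redis.call('KEYS', 'PORT|*')
local total_port = 32
local mgmt_pool_size = 256 * 1024
local egress_mirror_headroom = 10 * 1024

local accumulative_egress_mirror_overhead = total_port * egress_mirror_headroom
print(accumulative_egress_mirror_overhead)                   -- 327680 bytes (320 KB)
print(accumulative_egress_mirror_overhead + mgmt_pool_size)  -- 589824 bytes added to the occupied buffer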
