Commit 6c88e47

[Dynamic Buffer Calc][Mellanox] Bug fixes and enhancements for the lua plugins for buffer pool calculation and headroom checking (sonic-net#1781)
What I did

Bug fixes for buffer pool calculation and headroom checking on Mellanox platforms:
- Test the number of lanes instead of the speed when determining whether special handling is required for a port. For speeds other than 400G, e.g. 100G, it is possible that some 100G ports have 8 lanes while others have 4 lanes, which means they cannot share the same buffer profile. A suffix _8lane is introduced to indicate this, as in pg_lossless_100000_5m_8lane_profile. (A runnable sketch of the lane check follows this description.)
- Take the private headroom into account when calculating the buffer pool size.
- Take deviation into account when checking the headroom against the per-port limit, to avoid an inaccurate result in a rare case.
- Use a hash table to record the reference count of a profile in the Lua plugin.

Signed-off-by: Stephen Sun [email protected]

How I verified it

Run regression and manually test.

Details if related

- Test the number of lanes instead of the speed when determining whether special handling (double headroom size) is required for a port. Originally this was determined by testing whether the port's speed is 400G, but that is not accurate: a user can configure a port with 8 lanes to 100G, and special handling is still required even though the port is not 400G, so the check needs to be adjusted. The variable names are updated accordingly: xxx_400g => xxx_8lanes.
- Take deviation into account when checking the headroom against the per-port limit to avoid an inaccurate result in a rare case. Some deviations make the accumulative headroom slightly larger than the quantity calculated by the buffer manager, and this must be taken into account when calculating the accumulative headroom.
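The lane check mentioned above, as a minimal self-contained sketch (illustration only, not part of the diff below; it mirrors the is_port_with_8lanes helper added in cfgmgr/buffer_check_headroom_mellanox.lua, and the sample lanes strings are hypothetical):

-- The lane count is the number of commas in the CONFIG_DB PORT|<port>
-- 'lanes' field plus one; 8-lane ports need the doubled headroom handling.
local function is_port_with_8lanes(lanes)
    local number_of_lanes = 0
    if lanes then
        -- string.gsub returns the rewritten string and the number of matches
        local _
        _, number_of_lanes = string.gsub(lanes, ",", ",")
        number_of_lanes = number_of_lanes + 1
    end
    return number_of_lanes == 8
end

-- Hypothetical lanes strings for illustration only
print(is_port_with_8lanes("0,1,2,3,4,5,6,7"))  -- true:  8 lanes, gets the _8lane profile
print(is_port_with_8lanes("0,1,2,3"))          -- false: 4 lanes, regular profile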
1 parent e86b900 commit 6c88e47

6 files changed: +190 -82 lines

cfgmgr/buffer_check_headroom_mellanox.lua (+29 -6)
@@ -7,10 +7,25 @@ local port = KEYS[1]
 local input_profile_name = ARGV[1]
 local input_profile_size = ARGV[2]
 local new_pg = ARGV[3]
-local accumulative_size = 0
+
+local function is_port_with_8lanes(lanes)
+    -- On Spectrum 3, ports with 8 lanes have doubled pipeline latency
+    local number_of_lanes = 0
+    if lanes then
+        local _
+        _, number_of_lanes = string.gsub(lanes, ",", ",")
+        number_of_lanes = number_of_lanes + 1
+    end
+    return number_of_lanes == 8
+end
+
+-- Initialize the accumulative size with 4096
+-- This is to absorb the possible deviation
+local accumulative_size = 4096
 
 local appl_db = "0"
 local state_db = "6"
+local config_db = "4"
 
 local ret_true = {}
 local ret = {}
@@ -20,7 +35,13 @@ table.insert(ret_true, "result:true")
 
 default_ret = ret_true
 
-local speed = redis.call('HGET', 'PORT|' .. port, 'speed')
+-- Connect to CONFIG_DB
+redis.call('SELECT', config_db)
+
+local lanes
+
+-- We need to know whether it's a 8-lane port because it has extra pipeline latency
+lanes = redis.call('HGET', 'PORT|' .. port, 'lanes')
 
 -- Fetch the threshold from STATE_DB
 redis.call('SELECT', state_db)
@@ -31,11 +52,12 @@ if max_headroom_size == nil then
 end
 
 local asic_keys = redis.call('KEYS', 'ASIC_TABLE*')
-local pipeline_delay = tonumber(redis.call('HGET', asic_keys[1], 'pipeline_latency'))
-if speed == 400000 then
-    pipeline_delay = pipeline_delay * 2 - 1
+local pipeline_latency = tonumber(redis.call('HGET', asic_keys[1], 'pipeline_latency'))
+if is_port_with_8lanes(lanes) then
+    -- The pipeline latency should be adjusted accordingly for ports with 2 buffer units
+    pipeline_latency = pipeline_latency * 2 - 1
 end
-accumulative_size = accumulative_size + 2 * pipeline_delay * 1024
+accumulative_size = accumulative_size + 2 * pipeline_latency * 1024
 
 -- Fetch all keys in BUFFER_PG according to the port
 redis.call('SELECT', appl_db)
@@ -95,6 +117,7 @@ end
 
 if max_headroom_size > accumulative_size then
     table.insert(ret, "result:true")
+    table.insert(ret, "debug:Accumulative headroom on port " .. accumulative_size .. ", the maximum available headroom " .. max_headroom_size)
 else
     table.insert(ret, "result:false")
    table.insert(ret, "debug:Accumulative headroom on port " .. accumulative_size .. " exceeds the maximum available headroom which is " .. max_headroom_size)

cfgmgr/buffer_headroom_mellanox.lua (+5 -3)
@@ -3,6 +3,7 @@
 -- ARGV[2] - cable length
 -- ARGV[3] - port mtu
 -- ARGV[4] - gearbox delay
+-- ARGV[5] - lane count of the ports on which the profile will be applied
 
 -- parameters retried from databases:
 -- From CONFIG_DB.LOSSLESS_TRAFFIC_PATTERN
@@ -26,6 +27,7 @@ local port_speed = tonumber(ARGV[1])
 local cable_length = tonumber(string.sub(ARGV[2], 1, -2))
 local port_mtu = tonumber(ARGV[3])
 local gearbox_delay = tonumber(ARGV[4])
+local is_8lane = (ARGV[5] == "8")
 
 local appl_db = "0"
 local config_db = "4"
@@ -100,9 +102,9 @@ local xon_value
 local headroom_size
 local speed_overhead
 
--- Adjustment for 400G
-if port_speed == 400000 then
-    pipeline_latency = 37 * 1024
+-- Adjustment for 8-lane port
+if is_8lane ~= nil and is_8lane then
+    pipeline_latency = pipeline_latency * 2 - 1024
     speed_overhead = port_mtu
 else
     speed_overhead = 0
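For context, a small arithmetic check (illustration only, not part of the commit): assuming a base pipeline latency of 19 KB, as the constants removed from buffer_pool_mellanox.lua suggest, the new 8-lane adjustment reproduces the old hard-coded 400G value of 37 KB.

local pipeline_latency = 19 * 1024             -- assumed base value, in bytes
local adjusted = pipeline_latency * 2 - 1024   -- 38912 - 1024 = 37888
assert(adjusted == 37 * 1024)                  -- matches the former 400G constant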

cfgmgr/buffer_pool_mellanox.lua (+109 -60)
Original file line numberDiff line numberDiff line change
@@ -5,34 +5,31 @@ local appl_db = "0"
55
local config_db = "4"
66
local state_db = "6"
77

8-
local lossypg_reserved = 19 * 1024
9-
local lossypg_reserved_400g = 37 * 1024
10-
-- Number of 400G ports
11-
local port_count_400g = 0
12-
-- Number of lossy PG on 400G ports
13-
local lossypg_400g = 0
8+
-- Number of ports with 8 lanes (whose pipeline latency should be doubled)
9+
local port_count_8lanes = 0
10+
-- Number of lossy PG on ports with 8 lanes
11+
local lossypg_8lanes = 0
12+
13+
-- Private headrom
14+
local private_headroom = 10 * 1024
1415

1516
local result = {}
1617
local profiles = {}
18+
local lossless_profiles = {}
1719

1820
local total_port = 0
1921

2022
local mgmt_pool_size = 256 * 1024
2123
local egress_mirror_headroom = 10 * 1024
2224

23-
local function find_profile(ref)
24-
-- Remove the surrounding square bracket and the find in the list
25-
local name = string.sub(ref, 2, -2)
26-
for i = 1, #profiles, 1 do
27-
if profiles[i][1] == name then
28-
return i
29-
end
30-
end
31-
return 0
32-
end
25+
-- The set of ports with 8 lanes
26+
local port_set_8lanes = {}
27+
-- Number of ports with lossless profiles
28+
local lossless_port_count = 0
3329

34-
local function iterate_all_items(all_items)
30+
local function iterate_all_items(all_items, check_lossless)
3531
table.sort(all_items)
32+
local lossless_ports = {}
3633
local port
3734
local fvpairs
3835
for i = 1, #all_items, 1 do
@@ -43,9 +40,13 @@ local function iterate_all_items(all_items)
4340
port = string.match(all_items[i], "Ethernet%d+")
4441
if port ~= nil then
4542
local range = string.match(all_items[i], "Ethernet%d+:([^%s]+)$")
46-
local profile = redis.call('HGET', all_items[i], 'profile')
47-
local index = find_profile(profile)
48-
if index == 0 then
43+
local profile_name = redis.call('HGET', all_items[i], 'profile')
44+
if not profile_name then
45+
return 1
46+
end
47+
profile_name = string.sub(profile_name, 2, -2)
48+
local profile_ref_count = profiles[profile_name]
49+
if profile_ref_count == nil then
4950
-- Indicate an error in case the referenced profile hasn't been inserted or has been removed
5051
-- It's possible when the orchagent is busy
5152
-- The buffermgrd will take care of it and retry later
@@ -57,13 +58,15 @@ local function iterate_all_items(all_items)
5758
else
5859
size = 1 + tonumber(string.sub(range, -1)) - tonumber(string.sub(range, 1, 1))
5960
end
60-
profiles[index][2] = profiles[index][2] + size
61-
local speed = redis.call('HGET', 'PORT_TABLE:'..port, 'speed')
62-
if speed == '400000' then
63-
if profile == '[BUFFER_PROFILE_TABLE:ingress_lossy_profile]' then
64-
lossypg_400g = lossypg_400g + size
61+
profiles[profile_name] = profile_ref_count + size
62+
if port_set_8lanes[port] and profile_name == 'BUFFER_PROFILE_TABLE:ingress_lossy_profile' then
63+
lossypg_8lanes = lossypg_8lanes + size
64+
end
65+
if check_lossless and lossless_profiles[profile_name] then
66+
if lossless_ports[port] == nil then
67+
lossless_port_count = lossless_port_count + 1
68+
lossless_ports[port] = true
6569
end
66-
port_count_400g = port_count_400g + 1
6770
end
6871
end
6972
end
@@ -77,6 +80,27 @@ local ports_table = redis.call('KEYS', 'PORT|*')
7780

7881
total_port = #ports_table
7982

83+
-- Initialize the port_set_8lanes set
84+
local lanes
85+
local number_of_lanes
86+
local port
87+
for i = 1, total_port, 1 do
88+
-- Load lanes from PORT table
89+
lanes = redis.call('HGET', ports_table[i], 'lanes')
90+
if lanes then
91+
local _
92+
_, number_of_lanes = string.gsub(lanes, ",", ",")
93+
number_of_lanes = number_of_lanes + 1
94+
port = string.sub(ports_table[i], 6, -1)
95+
if (number_of_lanes == 8) then
96+
port_set_8lanes[port] = true
97+
port_count_8lanes = port_count_8lanes + 1
98+
else
99+
port_set_8lanes[port] = false
100+
end
101+
end
102+
end
103+
80104
local egress_lossless_pool_size = redis.call('HGET', 'BUFFER_POOL|egress_lossless_pool', 'size')
81105

82106
-- Whether shared headroom pool is enabled?
@@ -97,22 +121,45 @@ else
97121
shp_size = 0
98122
end
99123

124+
-- Fetch mmu_size
125+
redis.call('SELECT', state_db)
126+
local mmu_size = tonumber(redis.call('HGET', 'BUFFER_MAX_PARAM_TABLE|global', 'mmu_size'))
127+
if mmu_size == nil then
128+
mmu_size = tonumber(egress_lossless_pool_size)
129+
end
130+
local asic_keys = redis.call('KEYS', 'ASIC_TABLE*')
131+
local cell_size = tonumber(redis.call('HGET', asic_keys[1], 'cell_size'))
132+
local pipeline_latency = tonumber(redis.call('HGET', asic_keys[1], 'pipeline_latency'))
133+
134+
local lossypg_reserved = pipeline_latency * 1024
135+
local lossypg_reserved_8lanes = (2 * pipeline_latency - 1) * 1024
136+
137+
-- Align mmu_size at cell size boundary, otherwise the sdk will complain and the syncd will fail
138+
local number_of_cells = math.floor(mmu_size / cell_size)
139+
local ceiling_mmu_size = number_of_cells * cell_size
140+
100141
-- Switch to APPL_DB
101142
redis.call('SELECT', appl_db)
102143

103144
-- Fetch names of all profiles and insert them into the look up table
104145
local all_profiles = redis.call('KEYS', 'BUFFER_PROFILE*')
105146
for i = 1, #all_profiles, 1 do
106-
table.insert(profiles, {all_profiles[i], 0})
147+
if all_profiles[i] ~= "BUFFER_PROFILE_TABLE_KEY_SET" and all_profiles[i] ~= "BUFFER_PROFILE_TABLE_DEL_SET" then
148+
local xoff = redis.call('HGET', all_profiles[i], 'xoff')
149+
if xoff then
150+
lossless_profiles[all_profiles[i]] = true
151+
end
152+
profiles[all_profiles[i]] = 0
153+
end
107154
end
108155

109156
-- Fetch all the PGs
110157
local all_pgs = redis.call('KEYS', 'BUFFER_PG*')
111158
local all_tcs = redis.call('KEYS', 'BUFFER_QUEUE*')
112159

113160
local fail_count = 0
114-
fail_count = fail_count + iterate_all_items(all_pgs)
115-
fail_count = fail_count + iterate_all_items(all_tcs)
161+
fail_count = fail_count + iterate_all_items(all_pgs, true)
162+
fail_count = fail_count + iterate_all_items(all_tcs, false)
116163
if fail_count > 0 then
117164
return {}
118165
end
@@ -122,56 +169,55 @@ local statistics = {}
122169
-- Fetch sizes of all of the profiles, accumulate them
123170
local accumulative_occupied_buffer = 0
124171
local accumulative_xoff = 0
125-
for i = 1, #profiles, 1 do
126-
if profiles[i][1] ~= "BUFFER_PROFILE_TABLE_KEY_SET" and profiles[i][1] ~= "BUFFER_PROFILE_TABLE_DEL_SET" then
127-
local size = tonumber(redis.call('HGET', profiles[i][1], 'size'))
172+
173+
for name in pairs(profiles) do
174+
if name ~= "BUFFER_PROFILE_TABLE_KEY_SET" and name ~= "BUFFER_PROFILE_TABLE_DEL_SET" then
175+
local size = tonumber(redis.call('HGET', name, 'size'))
128176
if size ~= nil then
129-
if profiles[i][1] == "BUFFER_PROFILE_TABLE:ingress_lossy_profile" then
177+
if name == "BUFFER_PROFILE_TABLE:ingress_lossy_profile" then
130178
size = size + lossypg_reserved
131179
end
132-
if profiles[i][1] == "BUFFER_PROFILE_TABLE:egress_lossy_profile" then
133-
profiles[i][2] = total_port
180+
if name == "BUFFER_PROFILE_TABLE:egress_lossy_profile" then
181+
profiles[name] = total_port
134182
end
135183
if size ~= 0 then
136184
if shp_enabled and shp_size == 0 then
137-
local xon = tonumber(redis.call('HGET', profiles[i][1], 'xon'))
138-
local xoff = tonumber(redis.call('HGET', profiles[i][1], 'xoff'))
185+
local xon = tonumber(redis.call('HGET', name, 'xon'))
186+
local xoff = tonumber(redis.call('HGET', name, 'xoff'))
139187
if xon ~= nil and xoff ~= nil and xon + xoff > size then
140-
accumulative_xoff = accumulative_xoff + (xon + xoff - size) * profiles[i][2]
188+
accumulative_xoff = accumulative_xoff + (xon + xoff - size) * profiles[name]
141189
end
142190
end
143-
accumulative_occupied_buffer = accumulative_occupied_buffer + size * profiles[i][2]
191+
accumulative_occupied_buffer = accumulative_occupied_buffer + size * profiles[name]
144192
end
145-
table.insert(statistics, {profiles[i][1], size, profiles[i][2]})
193+
table.insert(statistics, {name, size, profiles[name]})
146194
end
147195
end
148196
end
149197

150-
-- Extra lossy xon buffer for 400G port
151-
local lossypg_extra_for_400g = (lossypg_reserved_400g - lossypg_reserved) * lossypg_400g
152-
accumulative_occupied_buffer = accumulative_occupied_buffer + lossypg_extra_for_400g
198+
-- Extra lossy xon buffer for ports with 8 lanes
199+
local lossypg_extra_for_8lanes = (lossypg_reserved_8lanes - lossypg_reserved) * lossypg_8lanes
200+
accumulative_occupied_buffer = accumulative_occupied_buffer + lossypg_extra_for_8lanes
201+
202+
-- Accumulate sizes for private headrooms
203+
local accumulative_private_headroom = 0
204+
if shp_enabled then
205+
accumulative_private_headroom = lossless_port_count * private_headroom
206+
accumulative_occupied_buffer = accumulative_occupied_buffer + accumulative_private_headroom
207+
accumulative_xoff = accumulative_xoff - accumulative_private_headroom
208+
if accumulative_xoff < 0 then
209+
accumulative_xoff = 0
210+
end
211+
end
153212

154213
-- Accumulate sizes for management PGs
155-
local accumulative_management_pg = (total_port - port_count_400g) * lossypg_reserved + port_count_400g * lossypg_reserved_400g
214+
local accumulative_management_pg = (total_port - port_count_8lanes) * lossypg_reserved + port_count_8lanes * lossypg_reserved_8lanes
156215
accumulative_occupied_buffer = accumulative_occupied_buffer + accumulative_management_pg
157216

158217
-- Accumulate sizes for egress mirror and management pool
159218
local accumulative_egress_mirror_overhead = total_port * egress_mirror_headroom
160219
accumulative_occupied_buffer = accumulative_occupied_buffer + accumulative_egress_mirror_overhead + mgmt_pool_size
161220

162-
-- Fetch mmu_size
163-
redis.call('SELECT', state_db)
164-
local mmu_size = tonumber(redis.call('HGET', 'BUFFER_MAX_PARAM_TABLE|global', 'mmu_size'))
165-
if mmu_size == nil then
166-
mmu_size = tonumber(egress_lossless_pool_size)
167-
end
168-
local asic_keys = redis.call('KEYS', 'ASIC_TABLE*')
169-
local cell_size = tonumber(redis.call('HGET', asic_keys[1], 'cell_size'))
170-
171-
-- Align mmu_size at cell size boundary, otherwise the sdk will complain and the syncd will fail
172-
local number_of_cells = math.floor(mmu_size / cell_size)
173-
local ceiling_mmu_size = number_of_cells * cell_size
174-
175221
-- Switch to CONFIG_DB
176222
redis.call('SELECT', config_db)
177223

@@ -238,13 +284,16 @@ table.insert(result, "debug:accumulative size:" .. accumulative_occupied_buffer)
238284
for i = 1, #statistics do
239285
table.insert(result, "debug:" .. statistics[i][1] .. ":" .. statistics[i][2] .. ":" .. statistics[i][3])
240286
end
241-
table.insert(result, "debug:extra_400g:" .. (lossypg_reserved_400g - lossypg_reserved) .. ":" .. lossypg_400g .. ":" .. port_count_400g)
287+
table.insert(result, "debug:extra_8lanes:" .. (lossypg_reserved_8lanes - lossypg_reserved) .. ":" .. lossypg_8lanes .. ":" .. port_count_8lanes)
242288
table.insert(result, "debug:mgmt_pool:" .. mgmt_pool_size)
289+
if shp_enabled then
290+
table.insert(result, "debug:accumulative_private_headroom:" .. accumulative_private_headroom)
291+
table.insert(result, "debug:accumulative xoff:" .. accumulative_xoff)
292+
end
243293
table.insert(result, "debug:accumulative_mgmt_pg:" .. accumulative_management_pg)
244294
table.insert(result, "debug:egress_mirror:" .. accumulative_egress_mirror_overhead)
245295
table.insert(result, "debug:shp_enabled:" .. tostring(shp_enabled))
246296
table.insert(result, "debug:shp_size:" .. shp_size)
247-
table.insert(result, "debug:accumulative xoff:" .. accumulative_xoff)
248-
table.insert(result, "debug:total port:" .. total_port)
297+
table.insert(result, "debug:total port:" .. total_port .. " ports with 8 lanes:" .. port_count_8lanes)
249298

250299
return result
