Skip to content

Commit 7a7a796

Browse files
committed
WIP
Signed-off-by: Daniel Noland <[email protected]>
1 parent f5de0e9 commit 7a7a796

File tree

20 files changed

+4272
-72
lines changed

20 files changed

+4272
-72
lines changed

Cargo.lock

Lines changed: 199 additions & 27 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,11 +66,14 @@ mac_address = { version = "1.1.8", default-features = false, features = [] }
6666
mio = { version = "1.0.4", default-features = false, features = [] }
6767
multi_index_map = { version = "0.15.0", default-features = false, features = [] }
6868
netdev = { version = "0.35.2", default-features = false, features = [] }
69+
netlink-packet-route = { version = "0.24.0", default-features = false, features = [] }
6970
nix = { version = "0.30.1", default-features = false, features = ["socket"] }
7071
ordermap = { version = "0.5.7", default-features = false, features = [] }
7172
pretty_assertions = { version = "1.4.1", default-features = false, features = ["std"] }
7273
procfs = { version = "0.17.0", default-features = false, features = [] }
73-
rtnetlink = { version = "0.17.0", default-features = false, features = [] }
74+
#rtnetlink = { git = "https://github.com/daniel-noland/rtnetlink.git", branch = "hh/tc-actions", default-features = false, features = [] }
75+
#rtnetlink = { version = "0.17.0", default-features = false, features = [] }
76+
rtnetlink = { path = "../rtnetlink", default-features = false, features = [] }
7477
rustyline = { version = "16.0.0", default-features = false, features = [] }
7578
serde = { version = "1.0.219", default-features = false, features = [] }
7679
serde_yml = { version = "0.0.12", default-features = false, features = [] }

interface-manager/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ bolero = { workspace = true, optional = true, default-features = false, features
1919
derive_builder = { workspace = true, default-features = false, features = ["default"] }
2020
futures = { workspace = true, features = ["default"] }
2121
multi_index_map = { workspace = true, features = ["serde"] }
22+
pci-ids = { version = "0.2.5", default-features = false, features = [] }
23+
pci-info = { version = "0.3.2", features = ["pci_subclass_debug_strings", "default", "pci_interface_func_debug_strings", "pci_class_debug_strings"] }
2224
rtnetlink = { workspace = true, features = ["default", "tokio"] }
2325
serde = { workspace = true, features = ["std"] }
2426
thiserror = { workspace = true, features = ["std"] }

interface-manager/src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ use std::marker::PhantomData;
1919
use std::sync::Arc;
2020

2121
pub mod interface;
22+
pub mod physical;
23+
pub mod tc;
2224

2325
use rtnetlink::Handle;
2426

interface-manager/src/physical/mod.rs

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// Copyright Open Network Fabric Authors
3+
4+
/// Network card vendors that this module recognises by name.
///
/// The enum is `#[repr(u16)]` and each variant's discriminant is the vendor's
/// numeric id (these are the PCI vendor ids of the respective companies), so
/// `vendor as u16` yields the id directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u16)]
pub enum KnownNetworkCardVendor {
    /// Intel Corporation (`0x8086`).
    Intel = 0x8086,
    /// Mellanox Technologies (`0x15b3`).
    Mellanox = 0x15b3,
}
9+
10+
impl From<KnownNetworkCardVendor> for String {
11+
fn from(value: KnownNetworkCardVendor) -> Self {
12+
match value {
13+
KnownNetworkCardVendor::Intel => "Intel Corporation".to_string(),
14+
KnownNetworkCardVendor::Mellanox => "Mellanox Technologies".to_string(),
15+
}
16+
}
17+
}
18+
19+
mod vendor {
20+
use crate::physical::KnownNetworkCardVendor;
21+
pub(super) const MELLANOX: u16 = KnownNetworkCardVendor::Mellanox as u16;
22+
pub(super) const INTEL: u16 = KnownNetworkCardVendor::Intel as u16;
23+
}
24+
25+
pub enum NetworkCardVendor {
26+
Known(KnownNetworkCardVendor),
27+
Unknown(UnknownNetworkCardVendor),
28+
}
29+
30+
/// Vendor id that does not correspond to any [`KnownNetworkCardVendor`].
///
/// The wrapped id is kept so that callers can still log or compare it; read it
/// back with [`UnknownNetworkCardVendor::vendor_id`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(transparent)]
pub struct UnknownNetworkCardVendor(u16);

impl UnknownNetworkCardVendor {
    /// Returns the raw numeric vendor id carried by this value.
    #[must_use]
    pub const fn vendor_id(self) -> u16 {
        self.0
    }
}
32+
33+
/// Extracts the numeric vendor id, i.e. the enum's `#[repr(u16)]` discriminant.
impl From<KnownNetworkCardVendor> for u16 {
    fn from(vendor: KnownNetworkCardVendor) -> u16 {
        vendor as u16
    }
}
38+
39+
impl From<u16> for NetworkCardVendor {
40+
fn from(value: u16) -> Self {
41+
match value {
42+
vendor::INTEL => NetworkCardVendor::Known(KnownNetworkCardVendor::Intel),
43+
vendor::MELLANOX => NetworkCardVendor::Known(KnownNetworkCardVendor::Mellanox),
44+
_ => NetworkCardVendor::Unknown(UnknownNetworkCardVendor(value)),
45+
}
46+
}
47+
}
48+
49+
#[cfg(test)]
mod test {
    use pci_ids::Device;
    use pci_info::PciInfo;
    use pci_info::pci_enums::PciDeviceInterfaceFunc;
    use std::fs;

    /// Exploratory smoke test: walks the PCI bus of the host and prints what it
    /// finds for every ethernet controller.
    ///
    /// It makes no assertions about the devices themselves; it fails only if
    /// PCI enumeration fails outright.
    #[test]
    fn pci_test() {
        // Enumerate the devices on the PCI bus using the default enumerator
        // for the current platform. A fatal enumeration failure is treated as
        // a test failure.
        let info = PciInfo::enumerate_pci().expect("failed to enumerate the PCI bus");

        // The collection holds both devices and per-device errors, because
        // enumeration can fail partially. Per-device errors are skipped here.
        for pci_device in info.iter().filter_map(Result::ok) {
            let is_ethernet = matches!(
                pci_device.device_iface(),
                Ok(PciDeviceInterfaceFunc::NetworkController_Ethernet_Default)
            );
            if !is_ethernet {
                continue;
            }

            // Devices present in the PCI id database are simply dumped.
            if let Some(device) =
                Device::from_vid_pid(pci_device.vendor_id(), pci_device.device_id())
            {
                println!("device: {device:#?}");
                continue;
            }

            println!(
                "Unknown device: {:#x}:{:#x}",
                pci_device.vendor_id(),
                pci_device.device_id()
            );
            let location = match pci_device.location() {
                Ok(location) => location,
                Err(err) => {
                    eprintln!("{err}");
                    continue;
                }
            };
            // NOTE(review): `physfn` is presumably the sysfs link from an
            // SR-IOV virtual function to its parent physical function --
            // confirm against the kernel's sysfs-bus-pci ABI documentation.
            match fs::read_link(format!("/sys/bus/pci/devices/{location}/physfn")) {
                Ok(physfn) => {
                    // Print the target without its leading "../". If the link
                    // has some other shape, print it unchanged rather than
                    // panicking.
                    let parent = physfn.strip_prefix("../").unwrap_or(physfn.as_path());
                    println!("physfn: {}", parent.to_string_lossy());
                }
                Err(err) => eprintln!("{err}"),
            }
        }
    }
}

interface-manager/src/tc/README.md

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
# The linux TC subsystem
2+
3+
TC is the linux traffic control system.
4+
It is a complex beast, but it is also very powerful.
5+
I have tried to provide a high-level overview of the parts of `tc` which we need here.
6+
7+
```mermaid
8+
---
9+
title: Linux Traffic Control (tc) Entity Relationship diagram
10+
---
11+
erDiagram
12+
netdev ||--|{ qdisc : ""
13+
block |o--o{ qdisc : "ingress and/or egress"
14+
filter ||--o{ match : ""
15+
filter }o--o{ action : ""
16+
block ||--o{ chain : ""
17+
chain ||--o{ filter : ""
18+
```
19+
20+
## What are these things?
21+
22+
### qdisc
23+
24+
**A `qdisc` is short for queuing discipline**.
25+
There are many different algorithms for queueing packets, but the non-trivial case pretty much exclusively focuses on _the order in which packets **egress**_ a network card.
26+
27+
`qdisc` also applies to ingress traffic, but in a much more trivial sense.
28+
You can't really control the order or timing in which packets arrive, so the ingress `qdisc` is mostly just a place to attach [filters], [chains], and (optionally) [blocks].
29+
30+
In general, `qdisc`s are quite complex and powerful, but we only need the "trivial" case (ingress) for our current design.
31+
Our ingress qdisc of choice is called `clsact`; it is basically FIFO, but with the ability to attach (potentially offloaded) [filter] rules.
32+
33+
### filter
34+
35+
A filter is basically a tuple of
36+
37+
1. match criteria; a list of selectors which decide if this filter applies to a given packet.
38+
2. a list of actions; some set of operations to perform if the match criteria apply.
39+
3. a priority; a number to disambiguate which filter applies if multiple filters match. In linux, tc filters with a numerically _lower_ priority value are matched first.
40+
41+
Filters exist within a [chain], and chains live on network interfaces or [blocks] of network interfaces.
42+
43+
Some examples of filters include matching on ARP, or IPv4 packets with source ip in the range `192.168.5.0/24`.
44+
Match criteria can be combined to form complex filters.
45+
In such cases, a packet must match all listed criteria to trigger the [action]s.
46+
47+
### action
48+
49+
An action is some type of manipulation or event that may occur when a packet matches a [filter]'s criteria.
50+
51+
Examples of actions include
52+
53+
* dropping packets,
54+
* editing source or destination ip addresses,
55+
* pushing or popping VLAN headers,
56+
* redirecting the packet to the ingress or egress pipeline of another network device,
57+
* encapsulating the packet in a VXLAN packet,
58+
* or mirroring (copying) the packet to another network device.
59+
60+
Actions are reference counted and **may be attached to more than one filter.**
61+
More specifically, each action has four important variables associated with it
62+
63+
1. The action `kind`. `kind` is a static string which identifies the family of the action. Examples include `mirred`, which covers packet redirect and mirroring; `gact`, which includes actions like `drop` or `jump`; and `tunnel_key`, which includes encapsulation and decapsulation actions. Many other families of actions exist in the linux `tc` subsystem.
64+
2. The action `index`. `index` is a unique (_per `kind`_) identifier to track the specific instance of a given action. For example, you might create a `gact` drop action with index 17. You may then attach that drop action to several different filters by referencing that index on filter creation. If you reuse an action in this way then all the filters associated with that action will update the hit counters for that action. Reuse of actions in this way may also save resources in the network card (assuming the matches and action can be offloaded).
65+
3. The action `bindcnt`. This is the number of active filters which reference this action.
66+
4. The action `refcnt`. This is the reference count of the action. This number is either equal to the `bindcnt` or is equal to the `bindcnt` plus one.
67+
68+
If you create an action as part of a filter creation command, then the `refcnt` and the `bindcnt` numbers will be equal.
69+
If you create an action without attaching it to a filter, you need to set the `refcnt` to one (in which case the `bindcnt` will be zero automatically).
70+
71+
Linux will automatically remove any action if and only if the `bindcnt` _and_ `refcnt` are equal to zero.
72+
"Deleting" an action sets its `refcnt` to its `bindcnt`.
73+
Note that this means you need to both delete an action _and_ detach that action from all active filters before linux will actually remove the action.
74+
This is a good thing!
75+
It prevents you from kicking the legs out from under your own network card by deleting an action (some actions make no sense without their accompanying actions).
76+
77+
### chain
78+
79+
A `chain` is basically a list of filters sorted by priority, with the numerically lowest priority value evaluated first.
80+
A network device or [block] may have multiple chains attached to it.
81+
Packets start processing on the device or [block]'s chain 0, and may or may not move to another chain (depending on which actions they encounter).
82+
`jump` and `goto chain` are among the possible [actions].
83+
84+
### block
85+
86+
A `block` is basically a list of [qdisc]s on different network interfaces.
87+
88+
A [filter] may be attached to either a [qdisc] or a `block` of qdiscs.
89+
This is useful in our case!
90+
91+
Imagine you have 10 network interfaces which are all part of the same underlying physical switch ASIC.
92+
If you configure `tc` [filter]s in the most basic way, you may need to install the same rule to the ASIC 10 times.
93+
94+
This is bad for a few major reasons:
95+
96+
1. It is pointlessly complex. You would need to keep track of all 10 rules. You need to create it 10 times, delete it 10 times, update it 10 times, track the counters 10 times, and so on.
97+
2. It is wasteful of resources in the ASIC. Installing the rule into the hardware once saves TCAM or SRAM in the ASIC (both of which are in limited supply).
98+
3. It is more prone to race conditions. Updating the rule in one place is usually much easier to sequence when transitioning between rule sets.

0 commit comments

Comments
 (0)