Skip to content

Commit d14a76a

Browse files
committed
internal/exec/stages/disks: prevent races with udev
The "udevadm settle" command used to wait for udev to process the disk changes and recreate the entries under /dev was still prone to races where udev didn't get notified yet of the final event to wait for. This caused the boot with a btrfs root filesystem created by Ignition to fail almost every time on certain hardware. Issue tagged events and wait for them to be processed by udev. This is actually meanigful in all stages not only for the other parts of the initramfs which may be surprised by sudden device nodes disappearing shortly like the case was with systemd's fsck service but also for the inter-stage dependencies which currently are using the waiter for systemd device units but that doesn't really prevent from races with udev device node recreation. Thus, these changes are complementary to the existing waiter which mainly has the purpose to wait for unmodified devices. For newly created RAIDs we can wait for the new node to be available as udev will not recreate it.
1 parent 980f5b4 commit d14a76a

File tree

5 files changed

+86
-29
lines changed

5 files changed

+86
-29
lines changed

internal/exec/stages/disks/disks.go

+34-29
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"errors"
2323
"fmt"
2424
"os/exec"
25+
"path/filepath"
2526

2627
"github.com/coreos/ignition/v2/config/v3_4_experimental/types"
2728
"github.com/coreos/ignition/v2/internal/distro"
@@ -105,35 +106,39 @@ func (s stage) Run(config types.Config) error {
105106
return fmt.Errorf("failed to create filesystems: %v", err)
106107
}
107108

108-
// udevd registers an IN_CLOSE_WRITE inotify watch on block device
109-
// nodes, and synthesizes udev "change" events when the watch fires.
110-
// mkfs.btrfs triggers multiple such events, the first of which
111-
// occurs while there is no recognizable filesystem on the
112-
// partition. Thus, if an existing partition is reformatted as
113-
// btrfs while keeping the same filesystem label, there will be a
114-
// synthesized uevent that deletes the /dev/disk/by-label symlink
115-
// and a second one that restores it. If we didn't account for this,
116-
// a systemd unit that depended on the by-label symlink (e.g.
117-
// systemd-fsck-root.service) could have the symlink deleted out
118-
// from under it.
119-
//
120-
// There's no way to fix this completely. We can't wait for the
121-
// restoring uevent to propagate, since we can't determine which
122-
// specific uevents were triggered by the mkfs. We can wait for
123-
// udev to settle, though it's conceivable that the deleting uevent
124-
// has already been processed and the restoring uevent is still
125-
// sitting in the inotify queue. In practice the uevent queue will
126-
// be the slow one, so this should be good enough.
127-
//
128-
// Test case: boot failure in coreos.ignition.*.btrfsroot kola test.
129-
//
130-
// Additionally, partitioning (and possibly creating raid) suffers
131-
// the same problem. To be safe, always settle.
132-
if _, err := s.Logger.LogCmd(
133-
exec.Command(distro.UdevadmCmd(), "settle"),
134-
"waiting for udev to settle",
135-
); err != nil {
136-
return fmt.Errorf("udevadm settle failed: %v", err)
109+
return nil
110+
}
111+
112+
// waitForUdev triggers a tagged event and waits for it to bubble up
113+
// again. This ensures that udev processed the device changes.
114+
// The requirement is that the used device path exists and itself is
115+
// not recreated by udev seeing the changes done. Thus, resolve a
116+
// /dev/disk/by-something/X symlink before performing the device
117+
// changes (i.e., pass /run/ignition/dev_aliases/X) and, e.g., don't
118+
// call it for a partition but the full disk if you modified the
119+
// partition table.
120+
func (s stage) waitForUdev(dev string) error {
121+
// Resolve the original /dev/ABC entry because udevadm wants
122+
// this as argument instead of a symlink like
123+
// /run/ignition/dev_aliases/X.
124+
devPath, err := filepath.EvalSymlinks(dev)
125+
if err != nil {
126+
return fmt.Errorf("failed to resolve device alias %q: %v", dev, err)
127+
}
128+
// By triggering our own event and waiting for it we know that udev
129+
// will have processed the device changes, a bare "udevadm settle"
130+
// is prone to races with the inotify queue. We expect the /dev/DISK
131+
// entry to exist because this function is either called for the full
132+
// disk and only the /dev/DISKpX partition entries will change, or the
133+
// function is called for a partition where the contents changed and
134+
// nothing causes the kernel/udev to reread the partition table and
135+
// recreate the /dev/DISKpX entries. If that was the case best we could
136+
// do here is to add a retry loop (and relax the function comment).
137+
_, err = s.Logger.LogCmd(
138+
exec.Command(distro.UdevadmCmd(), "trigger", "--settle",
139+
devPath), "waiting for triggered uevent")
140+
if err != nil {
141+
return fmt.Errorf("udevadm trigger failed: %v", err)
137142
}
138143

139144
return nil

internal/exec/stages/disks/filesystems.go

+23
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,29 @@ func (s stage) createFilesystem(fs types.Filesystem) error {
210210
return fmt.Errorf("mkfs failed: %v", err)
211211
}
212212

213+
// udevd registers an IN_CLOSE_WRITE inotify watch on block device
214+
// nodes, and synthesizes udev "change" events when the watch fires.
215+
// mkfs.btrfs triggers multiple such events, the first of which
216+
// occurs while there is no recognizable filesystem on the
217+
// partition. Thus, if an existing partition is reformatted as
218+
// btrfs while keeping the same filesystem label, there will be a
219+
// synthesized uevent that deletes the /dev/disk/by-label symlink
220+
// and a second one that restores it. If we didn't account for this,
221+
// a systemd unit that depended on the by-label symlink (e.g.
222+
// systemd-fsck-root.service) could have the symlink deleted out
223+
// from under it.
224+
// Trigger a tagged uevent and wait for it because a bare "udevadm
225+
// settle" does not guarantee that all changes were processed
226+
// because it's conceivable that only the deleting uevent has
227+
// already been processed (or none!) while the restoring uevent
228+
// is still sitting in the inotify queue. By triggering our own
229+
// event and waiting for it we know that udev will have processed
230+
// the device changes.
231+
// Test case: boot failure in coreos.ignition.*.btrfsroot kola test.
232+
if err := s.waitForUdev(devAlias); err != nil {
233+
return fmt.Errorf("createFilesystems: %v", err)
234+
}
235+
213236
return nil
214237
}
215238

internal/exec/stages/disks/luks.go

+8
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,14 @@ func (s *stage) createLuks(config types.Config) error {
338338
}
339339
delete(s.State.LuksPersistKeyFiles, luks.Name)
340340
}
341+
342+
// It's best to wait here for the /dev/disk/by-*/X entries to be
343+
// (re)created, not only for other parts of the initramfs but
344+
// also because s.waitOnDevices() can still race with udev's
345+
// disk entry recreation.
346+
if err := s.waitForUdev(devAlias); err != nil {
347+
return fmt.Errorf("createLuks: %v", err)
348+
}
341349
}
342350

343351
return nil

internal/exec/stages/disks/partitions.go

+9
Original file line numberDiff line numberDiff line change
@@ -394,5 +394,14 @@ func (s stage) partitionDisk(dev types.Disk, devAlias string) error {
394394
if err := op.Commit(); err != nil {
395395
return fmt.Errorf("commit failure: %v", err)
396396
}
397+
398+
// It's best to wait here for the /dev/ABC entries to be
399+
// (re)created, not only for other parts of the initramfs but
400+
// also because s.waitOnDevices() can still race with udev's
401+
// partition entry recreation.
402+
if err := s.waitForUdev(devAlias); err != nil {
403+
return fmt.Errorf("createPartitions: %v", err)
404+
}
405+
397406
return nil
398407
}

internal/exec/stages/disks/raid.go

+12
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ package disks
2222
import (
2323
"fmt"
2424
"os/exec"
25+
"strings"
2526

2627
"github.com/coreos/ignition/v2/config/v3_4_experimental/types"
2728
"github.com/coreos/ignition/v2/internal/distro"
@@ -78,6 +79,17 @@ func (s stage) createRaids(config types.Config) error {
7879
); err != nil {
7980
return fmt.Errorf("mdadm failed: %v", err)
8081
}
82+
83+
devName := md.Name
84+
if !strings.HasPrefix(devName, "/dev") {
85+
devName = "/dev/md/" + md.Name
86+
}
87+
// Wait for the created device node to show up, no udev
88+
// race prevention required because this node did not
89+
// exist before.
90+
if err := s.waitOnDevices([]string{devName}, "raids"); err != nil {
91+
return err
92+
}
8193
}
8294

8395
return nil

0 commit comments

Comments
 (0)