Skip to content
This repository was archived by the owner on Dec 13, 2018. It is now read-only.

Add checkpoint/restore functionality to nsinit and libcontainer #204

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions namespaces/checkpoint.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
package namespaces

import (
"fmt"
"log"
"os/exec"
"strconv"

"github.com/docker/libcontainer"
)

const (
// CheckpointLog is the log file name handed to criu via "-o" when
// Checkpoint runs "criu dump".
CheckpointLog = "dump.log"
// RestoreLog is the log file name handed to criu via "-o" when
// Restore runs "criu restore".
RestoreLog = "restore.log"
)

// Checkpoint the specified container using the criu(8) utility.
func Checkpoint(criuBinary string, container *libcontainer.Config, imageDir string, initPid int, verbose bool) error {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you please add some comments to the exported functions?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

// Prepare command line arguments.
args := []string{
"dump", "-v4",
"-D", imageDir, "-o", CheckpointLog,
"--root", container.RootFs,
"--manage-cgroups", "--evasive-devices",
"-t", strconv.Itoa(initPid),
}
for _, mountpoint := range container.MountConfig.Mounts {
args = append(args, "--ext-mount-map", fmt.Sprintf("%s:%s", mountpoint.Destination, mountpoint.Destination))
}

// Execute criu to checkpoint.
if verbose {
log.Printf("Running CRIU: %s %v\n", criuBinary, args)
}
output, err := exec.Command(criuBinary, args...).CombinedOutput()
if verbose && len(output) > 0 {
log.Printf("%s\n", output)
}
return err
}
34 changes: 34 additions & 0 deletions namespaces/restore.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package namespaces

import (
"fmt"
"log"
"os/exec"

"github.com/docker/libcontainer"
)

// Restore brings back a previously checkpointed container by running
// "criu restore -d" through the binary named by criuBinary.
//
// imageDir is passed to criu as its image directory ("-D") and
// RestoreLog as its log file name ("-o"). The container's RootFs is
// passed as "--root", and every entry of container.MountConfig.Mounts is
// declared as an external mount mapping its destination to its source.
//
// When verbose is true the full command line and any combined
// stdout/stderr output produced by criu are written to the standard
// logger. The error returned by running criu is handed back unmodified.
func Restore(criuBinary string, container *libcontainer.Config, imageDir string, verbose bool) error {
	// Assemble the criu command line piece by piece.
	argv := []string{"restore", "-d", "-v4"}
	argv = append(argv, "-D", imageDir, "-o", RestoreLog)
	argv = append(argv, "--root", container.RootFs)
	argv = append(argv, "--manage-cgroups", "--evasive-devices")
	for _, m := range container.MountConfig.Mounts {
		extMount := fmt.Sprintf("%s:%s", m.Destination, m.Source)
		argv = append(argv, "--ext-mount-map", extMount)
	}

	if verbose {
		log.Printf("Running CRIU: %s %v\n", criuBinary, argv)
	}

	// Run criu and capture whatever it printed on stdout and stderr.
	criu := exec.Command(criuBinary, argv...)
	out, err := criu.CombinedOutput()
	if verbose && len(out) != 0 {
		log.Printf("%s\n", out)
	}
	return err
}
108 changes: 108 additions & 0 deletions nsinit/checkpoint.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
package main

import (
"log"
"os"
"path/filepath"

"github.com/codegangsta/cli"
"github.com/docker/libcontainer"
)

var checkpointDescription string = `Checkpoint a process tree running in a container with the criu(8)
utility.

The container ID is determined by one of the following three methods
in the order described:

1. Command line argument
2. Last pathname component of the data_path environment variable
3. Last pathname component of the current working directory

If method 1 is used, the libcontainer's home directory must be
specified in the LIBCONTAINER_DIR environment variable.

Nsinit expects to find the container's container.json and state.json
files in the container subdirectory of the libcontainer's home
directory.

The user has to specify an image home directory for criu(8) by setting
the CRIU_IMG_HOME_DIR environment variable. Within this directory,
a container's image files will be saved in <container_id>/criu_img
subdirectory.

Returns 0 on success. On error, prints an error message and returns
a non-zero code.

ENVIRONMENT:
CRIU_BINARY criu binary to execute, if not set "criu" is assumed
CRIU_IMG_HOME_DIR criu image home directory
LIBCONTAINER_DIR libcontainer home directory
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: It is not clear what this should point to from the description.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The EXAMPLE: section right below this section shows the typical value for LIBCONTAINER_DIR. It makes life easy for the user because its value doesn't change from container to container. In contrast, data_path is set to the actual container.json file and the user has to repeatedly set it for each container.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A slight correction to my comment above: data_path is set to the directory where container.json file is. The pathname to this directory includes the container ID which changes from container to container. So the preferred way of doing checkpoint and restore (least amount of typing) is to set LIBCONTAINER_DIR once and provide container ID on nsinit command line. In this case, the user can be in any directory.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

awesome.

data_path directory pathname of container.json (no trailing /)
log pathname where to log

EXAMPLE:
# export LIBCONTAINER_DIR=/var/lib/docker/execdriver/native
# export CRIU_IMG_HOME_DIR=/var/lib/docker/containers
# docker ps -lq --no-trunc
281ab0098269e515e3f81661c3cd6272abb640cf352efc64b3b98cc2470f3944
# nsinit checkpoint 281ab0098269e515e3f81661c3cd6272abb640cf352efc64b3b98cc2470f3944
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right now nsinit works off of the cwd. Is there a reason why we cannot use that same flow with the checkpoint and restore commands instead of passing a docker container id?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Checkpoint does actually work off of cwd (without the need to set LIBCONTAINER_DIR or specifying container ID), but restore will not work because Docker removes the container directory after checkpoint (container has exited). IOW, there is no cwd for restore.

checkpoint succeeded
# `

// checkpointCommand describes the "checkpoint" subcommand: it is handled
// by checkpointAction, documented by checkpointDescription, and accepts a
// boolean --verbose / -v flag.
var checkpointCommand = cli.Command{
	Name:        "checkpoint",
	Usage:       "checkpoint a container",
	Description: checkpointDescription,
	Flags: []cli.Flag{
		cli.BoolFlag{
			Name:  "verbose, v",
			Usage: "enable verbose mode",
		},
	},
	Action: checkpointAction,
}

func checkpointAction(context *cli.Context) {
if len(context.Args()) > 1 {
log.Fatal("Too many command line arguments\n")
}

// Get container ID and set dataPath if needed.
containerId := getContainerId(context)

// Load the container.json file to verify that we have
// a valid container.
container, err := loadConfig()
if err != nil {
log.Fatal(err)
}

// Get the init PID of the process tree from state.json.
state, err := libcontainer.GetState(dataPath)
if err != nil {
log.Fatal(err)
}
initPid := state.InitPid
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we want to verify we didn't get a 0? Which would be an unitialized InitPid

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do. If we go to CRIU RPC with pid == 0 it will dump the requester :)

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added a check against initPid of 0. If/when we use CRIU RPC, will revisit the issue.

if initPid == 0 {
log.Fatal("Container's init PID is uninitialized\n")
}

// Create an image directory for this container (which
// may already exist from a previous checkpoint).
imageDir := getImageDir(containerId)
err = os.MkdirAll(imageDir, 0700)
if err != nil {
log.Fatal(err)
}

// Copy container.json in the criu image directory for
// later use during restore.
copyFile(filepath.Join(dataPath, "container.json"), filepath.Join(imageDir, "container.json"))
if err != nil {
log.Fatal(err)
}

// Run criu and exit on error because our caller doesn't take return value.
err = runCriu(context, container, containerId, imageDir, initPid)
if err != nil {
log.Fatal(err)
}
}
99 changes: 99 additions & 0 deletions nsinit/cr_common.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
package main

import (
"fmt"
"log"
"os"
"path/filepath"

"github.com/codegangsta/cli"
"github.com/docker/libcontainer"
"github.com/docker/libcontainer/namespaces"
)

// getContainerId returns the ID of the container the current command
// operates on and, as a side effect, may set the package-level dataPath.
//
// The ID is determined by the first of the following that applies:
//
//  1. The single command line argument. dataPath is then (re)set to
//     <libcontainerDir>/<ID>; it is fatal if libcontainerDir is empty.
//  2. The last pathname component of dataPath, if dataPath is non-empty.
//  3. The last pathname component of the current working directory,
//     which also becomes dataPath. This is refused for the "restore"
//     command.
//
// Any failure terminates the process through log.Fatal.
func getContainerId(context *cli.Context) string {
	if len(context.Args()) == 1 {
		// dataPath may have been initialized from data_path if
		// in environment (if it was set). But if a container
		// ID is specified on the command line, dataPath will be
		// reinitialized here from LIBCONTAINER_DIR environment
		// variable and the specified container ID.
		containerId := context.Args()[0]
		if libcontainerDir == "" {
			log.Fatal("LIBCONTAINER_DIR not set")
		}
		dataPath = filepath.Join(libcontainerDir, containerId)
		return containerId
	}

	if dataPath == "" {
		// If the container has been checkpointed, the directory
		// where its container.json file existed has been removed.
		// So for restore, we cannot get the container ID from
		// the cwd pathname.
		if context.Command.Name == "restore" {
			log.Fatal("Specify container ID as an argument or set data_path env var")
		}

		cwd, err := os.Getwd()
		if err != nil {
			log.Fatal(err)
		}
		dataPath = cwd
	}

	// Extract container ID from the pathname. filepath.Base never
	// returns an empty string: it yields "." for an empty path and a
	// single separator for a path made only of separators. Those
	// results, as well as "..", do not name a container and would be
	// collapsed or escape the parent when joined into other paths, so
	// they are rejected explicitly.
	containerId := filepath.Base(dataPath)
	switch containerId {
	case ".", "..", string(filepath.Separator):
		log.Fatal("Cannot determine container ID")
	}

	return containerId
}

// getImageDir returns the directory in which CRIU image files for the
// given container are stored and looked up:
// $CRIU_IMG_HOME_DIR/<containerId>/criu_img.
//
// The process is terminated through log.Fatal when CRIU_IMG_HOME_DIR is
// unset or empty.
func getImageDir(containerId string) string {
	if home := os.Getenv("CRIU_IMG_HOME_DIR"); home != "" {
		return filepath.Join(home, containerId, "criu_img")
	}
	log.Fatal("CRIU_IMG_HOME_DIR not set")
	return "" // not reached: log.Fatal exits the process
}

// runCriu is the shared back end of the checkpoint and restore commands.
//
// The criu executable is taken from the CRIU_BINARY environment variable
// and defaults to "criu". If the invoking command is named "checkpoint",
// namespaces.Checkpoint is called with initPid; for any other command
// name namespaces.Restore is called and initPid is ignored. The
// containerId parameter is not used by this function.
//
// With the "verbose" flag set, a success or failure line is printed to
// standard output, and on failure the path of the criu log file inside
// imageDir is printed as well. The error from the namespaces call is
// returned in all cases.
func runCriu(context *cli.Context, container *libcontainer.Config, containerId, imageDir string, initPid int) error {
	verbose := context.Bool("verbose")
	cmd := context.Command.Name

	binary := "criu"
	if fromEnv := os.Getenv("CRIU_BINARY"); fromEnv != "" {
		binary = fromEnv
	}

	// Pick the operation and remember which log file it writes, so the
	// failure hint below can point at it.
	var (
		err     error
		logName string
	)
	switch cmd {
	case "checkpoint":
		logName = namespaces.CheckpointLog
		err = namespaces.Checkpoint(binary, container, imageDir, initPid, verbose)
	default:
		logName = namespaces.RestoreLog
		err = namespaces.Restore(binary, container, imageDir, verbose)
	}

	switch {
	case !verbose:
		// Quiet mode: report nothing, just hand the error back.
	case err == nil:
		fmt.Printf("%s succeeded\n", cmd)
	default:
		fmt.Printf("%s failed: %s\n", cmd, err)
		fmt.Printf("Cause of failure may be in %s\n", filepath.Join(imageDir, logName))
	}
	return err
}
7 changes: 4 additions & 3 deletions nsinit/init.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,10 @@ import (
)

var (
dataPath = os.Getenv("data_path")
console = os.Getenv("console")
rawPipeFd = os.Getenv("pipe")
dataPath = os.Getenv("data_path")
console = os.Getenv("console")
rawPipeFd = os.Getenv("pipe")
libcontainerDir = os.Getenv("LIBCONTAINER_DIR")

initCommand = cli.Command{
Name: "init",
Expand Down
4 changes: 3 additions & 1 deletion nsinit/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ func main() {
app := cli.NewApp()

app.Name = "nsinit"
app.Version = "0.1"
app.Version = "0.2"
app.Author = "libcontainer maintainers"
app.Flags = []cli.Flag{
cli.StringFlag{Name: "nspid"},
Expand All @@ -59,6 +59,8 @@ func main() {
configCommand,
pauseCommand,
unpauseCommand,
checkpointCommand,
restoreCommand,
}

if err := app.Run(os.Args); err != nil {
Expand Down
Loading