Skip to content

Commit

Permalink
Merge pull request #268 from flatcar-linux/tormath1/em
Browse files Browse the repository at this point in the history
equinix-metal: recycle existing instances
  • Loading branch information
tormath1 authored Dec 17, 2021
2 parents 890a4ef + 4fb3794 commit 54f6daa
Show file tree
Hide file tree
Showing 7 changed files with 133 additions and 30 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- BPF test with DNS gadget from Inspektor Gadget ([#260](https://github.com/flatcar-linux/mantle/pull/260))
- BPF execsnoop test ([#233](https://github.com/flatcar-linux/mantle/pull/233))
- plume: Enable arm64 board uploads for the Stable channel ([#266](https://github.com/flatcar-linux/mantle/pull/266))
- A way to reuse Equinix Metal devices during tests ([#268](https://github.com/flatcar-linux/mantle/pull/268))

### Changed
- `lsblk --json` output handling ([#244](https://github.com/flatcar-linux/mantle/pull/244))
Expand Down
2 changes: 1 addition & 1 deletion cmd/ore/packet/create-device.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ func runCreateDevice(cmd *cobra.Command, args []string) error {
os.Exit(1)
}

device, err := API.CreateDevice(hostname, conf, nil)
device, err := API.CreateOrUpdateDevice(hostname, conf, nil, "")
if err != nil {
fmt.Fprintf(os.Stderr, "Couldn't create device: %v\n", err)
os.Exit(1)
Expand Down
79 changes: 63 additions & 16 deletions platform/api/packet/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ func (a *API) PreflightCheck() error {
}

// console is optional, and is closed on error or when the device is deleted.
func (a *API) CreateDevice(hostname string, conf *conf.Conf, console Console) (*packngo.Device, error) {
func (a *API) CreateOrUpdateDevice(hostname string, conf *conf.Conf, console Console, id string) (*packngo.Device, error) {
consoleStarted := false
defer func() {
if console != nil && !consoleStarted {
Expand Down Expand Up @@ -212,7 +212,7 @@ func (a *API) CreateDevice(hostname string, conf *conf.Conf, console Console) (*
}
defer a.bucket.Delete(context.TODO(), ipxeScriptName)

device, err := a.createDevice(hostname, ipxeScriptURL)
device, err := a.createDevice(hostname, ipxeScriptURL, id)
if err != nil {
return nil, fmt.Errorf("couldn't create device: %v", err)
}
Expand Down Expand Up @@ -249,6 +249,15 @@ func (a *API) CreateDevice(hostname string, conf *conf.Conf, console Console) (*
return nil, fmt.Errorf("timed out waiting for flatcar-install: %v", err)
}

// TCP discard service has been reached so `flatcar-install` is done.
// We can deactivate `PXE` boot to avoid bootlooping.
alwaysPXE := false
if _, _, err = a.c.Devices.Update(deviceID, &packngo.DeviceUpdateRequest{
AlwaysPXE: &alwaysPXE,
}); err != nil {
return nil, fmt.Errorf("unable to deactivate PXE boot: %v", err)
}

plog.Debugf("Finished installation of device: %q", deviceID)

return device, nil
Expand Down Expand Up @@ -478,22 +487,58 @@ boot`, a.opts.InstallerImageKernelURL, userdataURL, linuxConsole[a.opts.Board],
}

// device creation seems a bit flaky, so try a few times
func (a *API) createDevice(hostname, ipxeScriptURL string) (device *packngo.Device, err error) {
func (a *API) createDevice(hostname, ipxeScriptURL, id string) (*packngo.Device, error) {
var err error

// We force a PXE boot in order to fetch the
// new configuration and to avoid booting from a mis-installed Flatcar.
alwaysPXE := true

for tries := apiRetries; tries >= 0; tries-- {
var response *packngo.Response
device, response, err = a.c.Devices.Create(&packngo.DeviceCreateRequest{
ProjectID: a.opts.Project,
Facility: []string{a.opts.Facility},
Plan: a.opts.Plan,
BillingCycle: "hourly",
Hostname: hostname,
OS: "custom_ipxe",
IPXEScriptURL: ipxeScriptURL,
Tags: []string{"mantle"},
})
var (
device *packngo.Device
response *packngo.Response
)

if id != "" {
plog.Infof("Recycling instance: %s", id)
device, response, err = a.c.Devices.Update(id, &packngo.DeviceUpdateRequest{
AlwaysPXE: &alwaysPXE,
IPXEScriptURL: &ipxeScriptURL,
Hostname: &hostname,
})
if err != nil {
err = fmt.Errorf("updating device: %w", err)
continue
}

// we reboot the instance to apply the changes.
response, err = a.c.Devices.Reboot(id)
if err != nil {
err = fmt.Errorf("rebooting device: %w", err)
continue
}

plog.Infof("device rebooted: %s", id)
} else {
plog.Infof("Recycling is not possible, creating a new instance")
device, response, err = a.c.Devices.Create(&packngo.DeviceCreateRequest{
ProjectID: a.opts.Project,
Facility: []string{a.opts.Facility},
Plan: a.opts.Plan,
BillingCycle: "hourly",
Hostname: hostname,
OS: "custom_ipxe",
IPXEScriptURL: ipxeScriptURL,
Tags: []string{"mantle"},
AlwaysPXE: alwaysPXE,
})
}

if err == nil || response.StatusCode != 500 {
return
return device, err
}

plog.Debugf("Retrying to create device after failure: %q %q %q \n", device, response, err)
if device != nil && device.ID != "" {
a.DeleteDevice(device.ID)
Expand All @@ -502,7 +547,8 @@ func (a *API) createDevice(hostname, ipxeScriptURL string) (device *packngo.Devi
time.Sleep(apiRetryInterval)
}
}
return

return nil, fmt.Errorf("reached maximum number of retries to create/update a device: %w", err)
}

func (a *API) startConsole(deviceID string, console Console) error {
Expand Down Expand Up @@ -544,6 +590,7 @@ func (a *API) startConsole(deviceID string, console Console) error {
}
go func() {
err := runner()

if err != nil {
ready <- err
}
Expand Down
27 changes: 19 additions & 8 deletions platform/machine/packet/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,16 @@ func (pc *cluster) NewMachine(userdata *conf.UserData) (platform.Machine, error)
pcons = cons
}

// CreateDevice unconditionally closes console when done with it
device, err = pc.flight.api.CreateDevice(vmname, conf, pcons)
var id string
select {
case i := <-pc.flight.devicesPool:
id = i
default:
id = ""
}

// CreateOrUpdateDevice unconditionally closes console when done with it
device, err = pc.flight.api.CreateOrUpdateDevice(vmname, conf, pcons, id)
if err != nil {
continue // provisioning error
}
Expand All @@ -90,37 +98,40 @@ func (pc *cluster) NewMachine(userdata *conf.UserData) (platform.Machine, error)
mach.publicIP = pc.flight.api.GetDeviceAddress(device, 4, true)
mach.privateIP = pc.flight.api.GetDeviceAddress(device, 4, false)
if mach.publicIP == "" || mach.privateIP == "" {
mach.Destroy()
pc.flight.api.DeleteDevice(mach.ID())
err = fmt.Errorf("couldn't find IP addresses for device")
continue // provisioning error
}

// Warning: the assumption is that within one test a machine doesn't get reused
// (if a test would create, destroy and create a machine explicitly)
// otherwise the console file names will clash
dir := filepath.Join(pc.RuntimeConf().OutputDir, mach.ID())
if err = os.Mkdir(dir, 0777); err != nil {
mach.Destroy()
pc.flight.api.DeleteDevice(mach.ID())
return nil, err
}

if cons != nil {
if err = os.Rename(consolePath, filepath.Join(dir, "console.txt")); err != nil {
mach.Destroy()
pc.flight.api.DeleteDevice(mach.ID())
return nil, err
}
}

confPath := filepath.Join(dir, "user-data")
if err = conf.WriteFile(confPath); err != nil {
mach.Destroy()
pc.flight.api.DeleteDevice(mach.ID())
return nil, err
}

if mach.journal, err = platform.NewJournal(dir); err != nil {
mach.Destroy()
pc.flight.api.DeleteDevice(mach.ID())
return nil, err
}

if err = platform.StartMachine(mach, mach.journal); err != nil {
mach.Destroy()
pc.flight.api.DeleteDevice(mach.ID())
continue // provisioning error
}

Expand Down
22 changes: 21 additions & 1 deletion platform/machine/packet/console.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ package packet

import (
"bytes"
"fmt"
"os"

"golang.org/x/crypto/ssh"
Expand All @@ -26,10 +27,29 @@ type console struct {
f *os.File
buf bytes.Buffer
done chan interface{}
ssh *ssh.Client
}

func (c *console) SSHClient(ip, user string) (*ssh.Client, error) {
return c.pc.UserSSHClient(ip, user)
client, err := c.pc.UserSSHClient(ip, user)
if err != nil {
return nil, fmt.Errorf("getting SSH client: %w", err)
}

c.ssh = client
return client, nil
}

// CloseSSH closes the SSH client recorded by the last successful SSHClient
// call, if any.
//
// It is safe to call on a nil console and when no client was ever opened;
// both cases return nil. The stored client is cleared before returning, so a
// repeated call is a no-op instead of a second Close on the same connection.
func (c *console) CloseSSH() error {
	if c == nil || c.ssh == nil {
		return nil
	}

	// Drop the reference first: whether or not Close succeeds, this client
	// must not be closed a second time.
	client := c.ssh
	c.ssh = nil

	if err := client.Close(); err != nil {
		return fmt.Errorf("closing SSH client: %w", err)
	}

	return nil
}

func (c *console) Write(p []byte) (int, error) {
Expand Down
18 changes: 16 additions & 2 deletions platform/machine/packet/flight.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ type flight struct {
*platform.BaseFlight
api *packet.API
sshKeyID string
// devicesPool holds the devices available
// to be recycled by EM in order to minimize the
// number of created devices.
devicesPool chan string
}

func NewFlight(opts *packet.Options) (platform.Flight, error) {
Expand All @@ -49,8 +53,9 @@ func NewFlight(opts *packet.Options) (platform.Flight, error) {
}

pf := &flight{
BaseFlight: bf,
api: api,
BaseFlight: bf,
api: api,
devicesPool: make(chan string, 1000),
}

keys, err := pf.Keys()
Expand Down Expand Up @@ -93,5 +98,14 @@ func (pf *flight) Destroy() {
}
}

// Before deleting the instances from the devices pool,
// we close it in order to avoid a deadlock: ranging over an open channel blocks forever once it is empty.
close(pf.devicesPool)
for id := range pf.devicesPool {
if err := pf.api.DeleteDevice(id); err != nil {
plog.Errorf("deleting device %s: %v", id, err)
}
}

pf.BaseFlight.Destroy()
}
14 changes: 12 additions & 2 deletions platform/machine/packet/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,18 @@ func (pm *machine) Reboot() error {
}

func (pm *machine) Destroy() {
if err := pm.cluster.flight.api.DeleteDevice(pm.ID()); err != nil {
plog.Errorf("Error terminating device %v: %v", pm.ID(), err)
// Instead of actually deleting the device,
// we add it to the devices pool in order to mark it
// as "ready to be used" by other tests.
id := pm.ID()
pm.cluster.flight.devicesPool <- id
plog.Infof("device %s added to the pool", id)

// The serial console SSH client needs to be manually closed in order to prevent the program from
// freezing on the `done` channel in the console's `Output()` method.
plog.Infof("closing %s serial console SSH client", id)
if err := pm.console.CloseSSH(); err != nil {
plog.Errorf("closing serial console SSH client: %v", err)
}

if pm.journal != nil {
Expand Down

0 comments on commit 54f6daa

Please sign in to comment.