Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: clean up stopped instances #6030

Merged
merged 4 commits into from
Apr 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 18 additions & 90 deletions .github/spot-runner-action/dist/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -200,15 +200,6 @@ class Ec2Instance {
}
});
}
// async runInstances(params: RunInstancesRequest) {
// const client = await this.getEc2Client();
// try {
// return (await client.runInstances(params).promise()).Instances;
// } catch (error) {
// core.error(`Failed to create instance(s)`);
// throw error;
// }
// }
getSubnetAzId() {
var _a;
return __awaiter(this, void 0, void 0, function* () {
Expand Down Expand Up @@ -329,82 +320,12 @@ class Ec2Instance {
DefaultTargetCapacityType: useOnDemand ? "on-demand" : "spot",
},
};
// const config: SpotFleetRequestConfigData = {
// IamFleetRole:
// "arn:aws:iam::278380418400:role/aws-ec2-spot-fleet-tagging-role",
// TargetCapacity: 1,
// // We always ask for 1 instance, but might ask for 100% on demand or spot
// OnDemandTargetCapacity: useOnDemand ? 1 : 0,
// TerminateInstancesWithExpiration: true,
// Type: "request",
// LaunchSpecifications:
// };
// const params: RequestSpotFleetRequest = {
// SpotFleetRequestConfig: config,
// };
const client = yield this.getEc2Client();
const fleet = yield client.createFleet(createFleetRequest).promise();
const instances = ((fleet === null || fleet === void 0 ? void 0 : fleet.Instances) || [])[0] || {};
return (instances.InstanceIds || [])[0];
});
}
// async getOnDemandInstanceConfiguration(
// ec2SpotInstanceStrategy: string
// ): Promise<RunInstancesRequest> {
// const userData = new UserData(this.config);
// const params: RunInstancesRequest = {
// ImageId: this.config.ec2AmiId,
// InstanceInitiatedShutdownBehavior: "terminate",
// InstanceMarketOptions: {},
// InstanceType: "",
// MaxCount: 1,
// MinCount: 1,
// SecurityGroupIds: [this.config.ec2SecurityGroupId],
// SubnetId: this.config.ec2SubnetId,
// KeyName: this.config.ec2KeyName,
// Placement: {
// AvailabilityZone: await this.getSubnetAz(),
// },
// TagSpecifications: [
// {
// ResourceType: "instance",
// Tags: this.tags,
// },
// ],
// // <aztec>parity with build-system
// BlockDeviceMappings: [
// {
// DeviceName: "/dev/sda1",
// Ebs: {
// VolumeSize: 32,
// },
// },
// ],
// // parity with build-system</aztec>
// UserData: await userData.getUserData(),
// };
// switch (ec2SpotInstanceStrategy.toLowerCase()) {
// case "besteffort":
// case "spotonly": {
// params.InstanceMarketOptions = {
// MarketType: "spot",
// SpotOptions: {
// InstanceInterruptionBehavior: "terminate",
// SpotInstanceType: "one-time",
// },
// };
// break;
// }
// case "none": {
// params.InstanceMarketOptions = {};
// break;
// }
// default: {
// throw new TypeError("Invalid value for ec2_spot_instance_strategy");
// }
// }
// return params;
// }
getInstanceStatus(instanceId) {
return __awaiter(this, void 0, void 0, function* () {
const client = yield this.getEc2Client();
Expand All @@ -420,7 +341,7 @@ class Ec2Instance {
}
});
}
getInstancesForTags() {
getInstancesForTags(instanceStatus) {
return __awaiter(this, void 0, void 0, function* () {
const client = yield this.getEc2Client();
const filters = [
Expand All @@ -438,6 +359,10 @@ class Ec2Instance {
for (const reservation of (yield client.describeInstances(params).promise()).Reservations || []) {
instances = instances.concat(reservation.Instances || []);
}
if (instanceStatus) {
// Keep only instances whose state name matches the requested instanceStatus
instances = instances.filter((instance) => { var _a; return ((_a = instance === null || instance === void 0 ? void 0 : instance.State) === null || _a === void 0 ? void 0 : _a.Name) === instanceStatus; });
}
return instances;
}
catch (error) {
Expand Down Expand Up @@ -716,9 +641,8 @@ function pollSpotStatus(config, ec2Client, ghClient) {
return __awaiter(this, void 0, void 0, function* () {
// 12 iters x 10000 ms = 2 minutes
for (let iter = 0; iter < 12; iter++) {
const instances = yield ec2Client.getInstancesForTags();
const hasInstance = instances.filter((i) => { var _a; return ((_a = i.State) === null || _a === void 0 ? void 0 : _a.Name) === "running"; }).length > 0;
if (!hasInstance) {
const instances = yield ec2Client.getInstancesForTags("running");
if (instances.length <= 0) {
// we need to start an instance
return "none";
}
Expand All @@ -742,14 +666,18 @@ function start() {
return __awaiter(this, void 0, void 0, function* () {
const config = new config_1.ActionConfig();
if (config.subaction === "stop") {
yield stop();
yield terminate();
return;
}
else if (config.subaction === "restart") {
yield stop();
yield terminate();
// then we make a fresh instance
}
else if (config.subaction !== "start") {
else if (config.subaction === "start") {
// Terminate any leftover instances in the "stopped" state before starting
yield terminate("stopped");
}
else {
throw new Error("Unexpected subaction: " + config.subaction);
}
// subaction is 'start' or 'restart'
Expand All @@ -765,7 +693,7 @@ function start() {
if (config.subaction === "restart") {
throw new Error("Taking down spot we just started. This seems wrong, erroring out.");
}
yield stop();
yield terminate();
}
var ec2SpotStrategies;
switch (config.ec2SpotInstanceStrategy) {
Expand Down Expand Up @@ -831,14 +759,14 @@ function start() {
}
});
}
function stop() {
function terminate(instanceStatus) {
return __awaiter(this, void 0, void 0, function* () {
try {
core.info("Starting instance cleanup");
const config = new config_1.ActionConfig();
const ec2Client = new ec2_1.Ec2Instance(config);
const ghClient = new github_1.GithubClient(config);
const instances = yield ec2Client.getInstancesForTags();
const instances = yield ec2Client.getInstancesForTags(instanceStatus);
yield ec2Client.terminateInstances(instances.map((i) => i.InstanceId));
core.info("Clearing previously installed runners");
const result = yield ghClient.removeRunnersWithLabels([config.githubJobId]);
Expand All @@ -860,7 +788,7 @@ function stop() {
start();
}
catch (error) {
stop();
terminate();
(0, utils_1.assertIsError)(error);
core.error(error);
core.setFailed(error.message);
Expand Down
93 changes: 7 additions & 86 deletions .github/spot-runner-action/src/ec2.ts
Original file line number Diff line number Diff line change
Expand Up @@ -117,17 +117,6 @@ export class Ec2Instance {
}
}

// async runInstances(params: RunInstancesRequest) {
// const client = await this.getEc2Client();

// try {
// return (await client.runInstances(params).promise()).Instances;
// } catch (error) {
// core.error(`Failed to create instance(s)`);
// throw error;
// }
// }

async getSubnetAzId() {
const client = await this.getEc2Client();
try {
Expand Down Expand Up @@ -252,86 +241,12 @@ export class Ec2Instance {
DefaultTargetCapacityType: useOnDemand ? "on-demand" : "spot",
},
};
// const config: SpotFleetRequestConfigData = {
// IamFleetRole:
// "arn:aws:iam::278380418400:role/aws-ec2-spot-fleet-tagging-role",
// TargetCapacity: 1,
// // We always ask for 1 instance, but might ask for 100% on demand or spot
// OnDemandTargetCapacity: useOnDemand ? 1 : 0,
// TerminateInstancesWithExpiration: true,
// Type: "request",
// LaunchSpecifications:
// };
// const params: RequestSpotFleetRequest = {
// SpotFleetRequestConfig: config,
// };
const client = await this.getEc2Client();
const fleet = await client.createFleet(createFleetRequest).promise();
const instances: CreateFleetInstance = (fleet?.Instances || [])[0] || {};
return (instances.InstanceIds || [])[0];
}

// async getOnDemandInstanceConfiguration(
// ec2SpotInstanceStrategy: string
// ): Promise<RunInstancesRequest> {
// const userData = new UserData(this.config);

// const params: RunInstancesRequest = {
// ImageId: this.config.ec2AmiId,
// InstanceInitiatedShutdownBehavior: "terminate",
// InstanceMarketOptions: {},
// InstanceType: "",
// MaxCount: 1,
// MinCount: 1,
// SecurityGroupIds: [this.config.ec2SecurityGroupId],
// SubnetId: this.config.ec2SubnetId,
// KeyName: this.config.ec2KeyName,
// Placement: {
// AvailabilityZone: await this.getSubnetAz(),
// },
// TagSpecifications: [
// {
// ResourceType: "instance",
// Tags: this.tags,
// },
// ],
// // <aztec>parity with build-system
// BlockDeviceMappings: [
// {
// DeviceName: "/dev/sda1",
// Ebs: {
// VolumeSize: 32,
// },
// },
// ],
// // parity with build-system</aztec>
// UserData: await userData.getUserData(),
// };

// switch (ec2SpotInstanceStrategy.toLowerCase()) {
// case "besteffort":
// case "spotonly": {
// params.InstanceMarketOptions = {
// MarketType: "spot",
// SpotOptions: {
// InstanceInterruptionBehavior: "terminate",
// SpotInstanceType: "one-time",
// },
// };
// break;
// }
// case "none": {
// params.InstanceMarketOptions = {};
// break;
// }
// default: {
// throw new TypeError("Invalid value for ec2_spot_instance_strategy");
// }
// }

// return params;
// }

async getInstanceStatus(instanceId: string) {
const client = await this.getEc2Client();
try {
Expand All @@ -347,7 +262,7 @@ export class Ec2Instance {
}
}

async getInstancesForTags(): Promise<AWS.EC2.Instance[]> {
async getInstancesForTags(instanceStatus?: string): Promise<AWS.EC2.Instance[]> {
const client = await this.getEc2Client();
const filters: FilterInterface[] = [
{
Expand All @@ -367,6 +282,12 @@ export class Ec2Instance {
).Reservations || []) {
instances = instances.concat(reservation.Instances || []);
}
if (instanceStatus) {
// Keep only instances whose state name matches the requested instanceStatus
instances = instances.filter(
(instance) => instance?.State?.Name === instanceStatus
);
}
return instances;
} catch (error) {
core.error(
Expand Down
23 changes: 12 additions & 11 deletions .github/spot-runner-action/src/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,8 @@ async function pollSpotStatus(
): Promise<"usable" | "unusable" | "none"> {
// 12 iters x 10000 ms = 2 minutes
for (let iter = 0; iter < 12; iter++) {
const instances = await ec2Client.getInstancesForTags();
const hasInstance =
instances.filter((i) => i.State?.Name === "running").length > 0;
if (!hasInstance) {
const instances = await ec2Client.getInstancesForTags("running");
if (instances.length <= 0) {
// we need to start an instance
return "none";
}
Expand All @@ -38,12 +36,15 @@ async function pollSpotStatus(
async function start() {
const config = new ActionConfig();
if (config.subaction === "stop") {
await stop();
await terminate();
return;
} else if (config.subaction === "restart") {
await stop();
await terminate();
// then we make a fresh instance
} else if (config.subaction !== "start") {
} else if (config.subaction === "start") {
// Terminate any leftover instances in the "stopped" state before starting
await terminate("stopped");
} else {
throw new Error("Unexpected subaction: " + config.subaction);
}
// subaction is 'start' or 'restart'
Expand All @@ -65,7 +66,7 @@ async function start() {
"Taking down spot we just started. This seems wrong, erroring out."
);
}
await stop();
await terminate();
}

var ec2SpotStrategies: string[];
Expand Down Expand Up @@ -138,13 +139,13 @@ async function start() {
}
}

async function stop() {
async function terminate(instanceStatus?: string) {
try {
core.info("Starting instance cleanup");
const config = new ActionConfig();
const ec2Client = new Ec2Instance(config);
const ghClient = new GithubClient(config);
const instances = await ec2Client.getInstancesForTags();
const instances = await ec2Client.getInstancesForTags(instanceStatus);
await ec2Client.terminateInstances(instances.map((i) => i.InstanceId!));
core.info("Clearing previously installed runners");
const result = await ghClient.removeRunnersWithLabels([config.githubJobId]);
Expand All @@ -164,7 +165,7 @@ async function stop() {
try {
start();
} catch (error) {
stop();
terminate();
assertIsError(error);
core.error(error);
core.setFailed(error.message);
Expand Down
6 changes: 5 additions & 1 deletion scripts/attach_ebs_cache.sh
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,12 @@ while [ "$(aws ec2 describe-volumes \
elapsed_time=$((elapsed_time + WAIT_INTERVAL))
done

# Attach volume to the instance
# First, make sure this is detached from any instances stuck in stopping state
aws ec2 detach-volume \
--region $REGION \
--volume-id $VOLUME_ID || true

# Attach volume to the instance
aws ec2 attach-volume \
--region $REGION \
--volume-id $VOLUME_ID \
Expand Down
Loading