From 9719e176b2537c2945a0383dce6205a9901273a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cristian=20M=C4=83gheru=C8=99an-Stanciu?= Date: Tue, 16 Jul 2019 21:29:20 +0200 Subject: [PATCH 01/17] Implement event-based instance replacement Context: This change attempts to replace on-demand instances with spot as soon as they are launched (triggered by the on-demand instance entering the pending state), in order to avoid running expensive on-demand capacity in parallel with the newly launched spot instances. This should cover new groups out of the box, as well as scaling out or replacement of terminated spot instances. It has a few nice side effects concerning cost, since on-demand instances are likely to run for just a few seconds, as well as potentially tackling some known issues with ElasticBeanstalk(#343, #344) and deletion of CloudFormation stacks(#332). It may also trick the ASG into running the start lifecycle hooks(#296), and will immediately replace any scaled out instances, partially addressing #284. The current/legacy replacement logic will still kick in when enabling AutoSpotting for the first time on an existing group, as well as when spot instances fail to be launched so the group must failover to on-demand capacity in order to avoid downtime. Once the legacy replacement logic is done on the current group, the group is tagged with a configuration tag that enables the event logic. This tag is also set for all groups newer than an hour so their instances are automatically replaced by the event-based replacement logic. Current status - changed regional CloudFormation stack to intercept EC2 instances as soon as entering the 'pending' and 'running' state - implement handling code for these events, that launches a replacement spot instance for each on-demand instance currently in pending state and performs the usual instance swapping logic when a spot instance enters the running state, without waiting for it to exit its grace period. 
- largely kept the initial instance replacement logic as it is (waiting for the grace period) to avoid introducing regressions, and also in case the events are not properly propagated like in case of misconfigurations. - it compiles and was executed against against real infrastructure and is working as expected for a couple of groups I tested it against. - still missing test coverage for the new code and some existing tests were crashing and hat to be disabled for now - implemented support for handling the AutoScaling lifecycle hooks when replacing new on-demand instances in event-based execution mode - added support for running it locally against custom JSON event data in CloudFront Event format: https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/EventTypes.html#ec2_event_type ToDo: - test against more infrastructure - iron out any crashes - write automated tests for the new code and re-enable older test cases so that coverage doesn't drop - port regional CloudFormation stack changes to the main template so it can also run from a StackSet - update the documentation to mention this new behavior --- autospotting.go | 188 +-- .../AutoSpotting/regional_template.yaml | 61 +- .../stacks/AutoSpotting/template.yaml | 5 +- core/autoscaling.go | 263 +++- core/autoscaling_configuration.go | 6 + core/autoscaling_test.go | 1083 ++++++++--------- core/config.go | 2 +- core/connections.go | 2 +- core/instance.go | 239 +++- core/instance_launch_events.go | 24 + core/instance_test.go | 290 +++-- core/main.go | 202 ++- core/main_test.go | 23 +- core/mock_test.go | 3 + core/region.go | 49 +- core/spot_price_test.go | 6 - core/spot_termination.go | 18 +- core/spot_termination_test.go | 11 +- 18 files changed, 1538 insertions(+), 937 deletions(-) create mode 100644 core/instance_launch_events.go diff --git a/autospotting.go b/autospotting.go index 74725880..96b3aac9 100644 --- a/autospotting.go +++ b/autospotting.go @@ -4,218 +4,162 @@ import ( "context" "encoding/json" "fmt" 
+ "io/ioutil" "log" "os" autospotting "github.com/AutoSpotting/AutoSpotting/core" - "github.com/aws/aws-lambda-go/events" "github.com/aws/aws-lambda-go/lambda" "github.com/aws/aws-sdk-go/aws/endpoints" - ec2instancesinfo "github.com/cristim/ec2-instances-info" "github.com/namsral/flag" ) -type cfgData struct { - *autospotting.Config -} - -var conf *cfgData +var as *autospotting.AutoSpotting +var conf autospotting.Config // Version represents the build version being used var Version = "number missing" +var eventFile string + func main() { + if os.Getenv("AWS_LAMBDA_FUNCTION_NAME") != "" { lambda.Start(Handler) + } else if eventFile != "" { + parseEvent, err := ioutil.ReadFile(eventFile) + if err != nil { + log.Fatal(err) + } + Handler(nil, parseEvent) } else { - run() + eventHandler(nil) } } -func run() { +func eventHandler(event *json.RawMessage) { log.Println("Starting autospotting agent, build", Version) + log.Printf("Configuration flags: %#v", conf) - log.Printf("Parsed command line flags: "+ - "regions='%s' "+ - "min_on_demand_number=%d "+ - "min_on_demand_percentage=%.1f "+ - "allowed_instance_types=%v "+ - "disallowed_instance_types=%v "+ - "on_demand_price_multiplier=%.2f "+ - "spot_price_buffer_percentage=%.3f "+ - "bidding_policy=%s "+ - "tag_filters=%s "+ - "tag_filter_mode=%s "+ - "spot_product_description=%v "+ - "instance_termination_method=%s "+ - "termination_notification_action=%s "+ - "cron_schedule=%s "+ - "cron_schedule_state=%s "+ - "license=%s \n", - conf.Regions, - conf.MinOnDemandNumber, - conf.MinOnDemandPercentage, - conf.AllowedInstanceTypes, - conf.DisallowedInstanceTypes, - conf.OnDemandPriceMultiplier, - conf.SpotPriceBufferPercentage, - conf.BiddingPolicy, - conf.FilterByTags, - conf.TagFilteringMode, - conf.SpotProductDescription, - conf.InstanceTerminationMethod, - conf.TerminationNotificationAction, - conf.CronSchedule, - conf.CronScheduleState, - conf.LicenseType, - ) - - autospotting.Run(conf.Config) + as.EventHandler(event) 
log.Println("Execution completed, nothing left to do") } // this is the equivalent of a main for when running from Lambda, but on Lambda -// the run() is executed within the handler function every time we have an event +// the runFromCronEvent() is executed within the handler function every time we have an event func init() { + var region string + as = &autospotting.AutoSpotting{} + if r := os.Getenv("AWS_REGION"); r != "" { region = r } else { region = endpoints.UsEast1RegionID } - conf = &cfgData{ - &autospotting.Config{ - LogFile: os.Stdout, - LogFlag: log.Ldate | log.Ltime | log.Lshortfile, - MainRegion: region, - SleepMultiplier: 1, - Version: Version, - LicenseType: os.Getenv("LICENSE"), - }, + conf = autospotting.Config{ + LogFile: os.Stdout, + LogFlag: log.Ldate | log.Ltime | log.Lshortfile, + MainRegion: region, + SleepMultiplier: 1, + Version: Version, + LicenseType: os.Getenv("LICENSE"), } + parseCommandLineFlags() - conf.initialize() - + as.Init(&conf) } -// Handler implements the AWS Lambda handler +// Handler implements the AWS Lambda handler interface func Handler(ctx context.Context, rawEvent json.RawMessage) { - - var snsEvent events.SNSEvent - var cloudwatchEvent events.CloudWatchEvent - parseEvent := rawEvent - - // Try to parse event as an Sns Message - if err := json.Unmarshal(parseEvent, &snsEvent); err != nil { - log.Println(err.Error()) - return - } - - // If event is from Sns - extract Cloudwatch's one - if snsEvent.Records != nil { - snsRecord := snsEvent.Records[0] - parseEvent = []byte(snsRecord.SNS.Message) - } - - // Try to parse event as Cloudwatch Event Rule - if err := json.Unmarshal(parseEvent, &cloudwatchEvent); err != nil { - log.Println(err.Error()) - return - } - - // If event is Instance Spot Interruption - if cloudwatchEvent.DetailType == "EC2 Spot Instance Interruption Warning" { - if instanceID, err := autospotting.GetInstanceIDDueForTermination(cloudwatchEvent); err != nil { - return - } else if instanceID != nil { - 
spotTermination := autospotting.NewSpotTermination(cloudwatchEvent.Region) - spotTermination.ExecuteAction(instanceID, conf.TerminationNotificationAction) - } - } else { - // Event is Autospotting Cron Scheduling - run() - } + eventHandler(&rawEvent) } -// Configuration handling -func (c *cfgData) initialize() { - - c.parseCommandLineFlags() - - data, err := ec2instancesinfo.Data() - if err != nil { - log.Fatal(err.Error()) - } - c.InstanceData = data -} - -func (c *cfgData) parseCommandLineFlags() { - flag.StringVar(&c.AllowedInstanceTypes, "allowed_instance_types", "", +func parseCommandLineFlags() { + flag.StringVar(&conf.AllowedInstanceTypes, "allowed_instance_types", "", "\n\tIf specified, the spot instances will be searched only among these types.\n\tIf missing, any instance type is allowed.\n"+ "\tAccepts a list of comma or whitespace separated instance types (supports globs).\n"+ "\tExample: ./AutoSpotting -allowed_instance_types 'c5.*,c4.xlarge'\n") - flag.StringVar(&c.BiddingPolicy, "bidding_policy", autospotting.DefaultBiddingPolicy, + + flag.StringVar(&conf.BiddingPolicy, "bidding_policy", autospotting.DefaultBiddingPolicy, "\n\tPolicy choice for spot bid. 
If set to 'normal', we bid at the on-demand price(times the multiplier).\n"+ "\tIf set to 'aggressive', we bid at a percentage value above the spot price \n"+ "\tconfigurable using the spot_price_buffer_percentage.\n") - flag.StringVar(&c.DisallowedInstanceTypes, "disallowed_instance_types", "", + + flag.StringVar(&conf.DisallowedInstanceTypes, "disallowed_instance_types", "", "\n\tIf specified, the spot instances will _never_ be of these types.\n"+ "\tAccepts a list of comma or whitespace separated instance types (supports globs).\n"+ "\tExample: ./AutoSpotting -disallowed_instance_types 't2.*,c4.xlarge'\n") - flag.StringVar(&c.InstanceTerminationMethod, "instance_termination_method", autospotting.DefaultInstanceTerminationMethod, + + flag.StringVar(&conf.InstanceTerminationMethod, "instance_termination_method", autospotting.DefaultInstanceTerminationMethod, "\n\tInstance termination method. Must be one of '"+autospotting.DefaultInstanceTerminationMethod+"' (default),\n"+ "\t or 'detach' (compatibility mode, not recommended)\n") - flag.StringVar(&c.TerminationNotificationAction, "termination_notification_action", autospotting.DefaultTerminationNotificationAction, + + flag.StringVar(&conf.TerminationNotificationAction, "termination_notification_action", autospotting.DefaultTerminationNotificationAction, "\n\tTermination Notification Action.\n"+ "\tValid choices:\n"+ "\t'"+autospotting.DefaultTerminationNotificationAction+ "' (terminate if lifecyclehook else detach) | 'terminate' (lifecyclehook triggered)"+ " | 'detach' (lifecyclehook not triggered)\n") - flag.Int64Var(&c.MinOnDemandNumber, "min_on_demand_number", autospotting.DefaultMinOnDemandValue, + + flag.Int64Var(&conf.MinOnDemandNumber, "min_on_demand_number", autospotting.DefaultMinOnDemandValue, "\n\tNumber of on-demand nodes to be kept running in each of the groups.\n\t"+ "Can be overridden on a per-group basis using the tag "+autospotting.OnDemandNumberLong+".\n") - 
flag.Float64Var(&c.MinOnDemandPercentage, "min_on_demand_percentage", 0.0, + + flag.Float64Var(&conf.MinOnDemandPercentage, "min_on_demand_percentage", 0.0, "\n\tPercentage of the total number of instances in each group to be kept on-demand\n\t"+ "Can be overridden on a per-group basis using the tag "+autospotting.OnDemandPercentageTag+ "\n\tIt is ignored if min_on_demand_number is also set.\n") - flag.Float64Var(&c.OnDemandPriceMultiplier, "on_demand_price_multiplier", 1.0, + + flag.Float64Var(&conf.OnDemandPriceMultiplier, "on_demand_price_multiplier", 1.0, "\n\tMultiplier for the on-demand price. Numbers less than 1.0 are useful for volume discounts.\n"+ "\tExample: ./AutoSpotting -on_demand_price_multiplier 0.6 will have the on-demand price "+ "considered at 60% of the actual value.\n") - flag.StringVar(&c.Regions, "regions", "", + + flag.StringVar(&conf.Regions, "regions", "", "\n\tRegions where it should be activated (separated by comma or whitespace, also supports globs).\n"+ "\tBy default it runs on all regions.\n"+ "\tExample: ./AutoSpotting -regions 'eu-*,us-east-1'\n") - flag.Float64Var(&c.SpotPriceBufferPercentage, "spot_price_buffer_percentage", autospotting.DefaultSpotPriceBufferPercentage, + + flag.Float64Var(&conf.SpotPriceBufferPercentage, "spot_price_buffer_percentage", autospotting.DefaultSpotPriceBufferPercentage, "\n\tBid a given percentage above the current spot price.\n\tProtects the group from running spot"+ "instances that got significantly more expensive than when they were initially launched\n"+ "\tThe tag "+autospotting.SpotPriceBufferPercentageTag+" can be used to override this on a group level.\n"+ "\tIf the bid exceeds the on-demand price, we place a bid at on-demand price itself.\n") - flag.StringVar(&c.SpotProductDescription, "spot_product_description", autospotting.DefaultSpotProductDescription, + + flag.StringVar(&conf.SpotProductDescription, "spot_product_description", autospotting.DefaultSpotProductDescription, "\n\tThe Spot 
Product to use when looking up spot price history in the market.\n"+ "\tValid choices: Linux/UNIX | SUSE Linux | Windows | Linux/UNIX (Amazon VPC) | \n"+ "\tSUSE Linux (Amazon VPC) | Windows (Amazon VPC)\n\tDefault value: "+autospotting.DefaultSpotProductDescription+"\n") - flag.StringVar(&c.TagFilteringMode, "tag_filtering_mode", "opt-in", "\n\tControls the behavior of the tag_filters option.\n"+ + + flag.StringVar(&conf.TagFilteringMode, "tag_filtering_mode", "opt-in", "\n\tControls the behavior of the tag_filters option.\n"+ "\tValid choices: opt-in | opt-out\n\tDefault value: 'opt-in'\n\tExample: ./AutoSpotting --tag_filtering_mode opt-out\n") - flag.StringVar(&c.FilterByTags, "tag_filters", "", "\n\tSet of tags to filter the ASGs on.\n"+ + + flag.StringVar(&conf.FilterByTags, "tag_filters", "", "\n\tSet of tags to filter the ASGs on.\n"+ "\tDefault if no value is set will be the equivalent of -tag_filters 'spot-enabled=true'\n"+ "\tIn case the tag_filtering_mode is set to opt-out, it defaults to 'spot-enabled=false'\n"+ "\tExample: ./AutoSpotting --tag_filters 'spot-enabled=true,Environment=dev,Team=vision'\n") - flag.StringVar(&c.CronSchedule, "cron_schedule", "* *", "\n\tCron-like schedule in which to"+ + flag.StringVar(&conf.CronSchedule, "cron_schedule", "* *", "\n\tCron-like schedule in which to"+ "\tperform(or not) spot replacement actions. Format: hour day-of-week\n"+ "\tExample: ./AutoSpotting --cron_schedule '9-18 1-5' # workdays during the office hours \n") - flag.StringVar(&c.CronScheduleState, "cron_schedule_state", "on", "\n\tControls whether to take actions "+ + flag.StringVar(&conf.CronScheduleState, "cron_schedule_state", "on", "\n\tControls whether to take actions "+ "inside or outside the schedule defined by cron_schedule. 
Allowed values: on|off\n"+ "\tExample: ./AutoSpotting --cron_schedule_state='off' --cron_schedule '9-18 1-5' # would only take action outside the defined schedule\n") - flag.StringVar(&c.LicenseType, "license", "evaluation", "\n\tControls the terms under which you use AutoSpotting"+ + + flag.StringVar(&conf.LicenseType, "license", "evaluation", "\n\tControls the terms under which you use AutoSpotting"+ "Allowed values: evaluation|I_am_supporting_it_on_Patreon|I_contributed_to_development_within_the_last_year|I_built_it_from_source_code\n"+ "\tExample: ./AutoSpotting --license evaluation\n") + flag.StringVar(&eventFile, "event_file", "", "\n\tJSON file containing event data, "+ + "used for locally simulating execution from Lambda. AutoSpotting now expects to be "+ + "triggered by events and won't do anything if no event is passed either as result of "+ + "AWS instance state notifications or simulated manually using this flag.\n") + v := flag.Bool("version", false, "Print version number and exit.\n") flag.Parse() printVersion(v) diff --git a/cloudformation/stacks/AutoSpotting/regional_template.yaml b/cloudformation/stacks/AutoSpotting/regional_template.yaml index 0d23defe..d1ca4539 100644 --- a/cloudformation/stacks/AutoSpotting/regional_template.yaml +++ b/cloudformation/stacks/AutoSpotting/regional_template.yaml @@ -1,6 +1,9 @@ --- AWSTemplateFormatVersion: "2010-09-09" - Description: "Notification stack for spot termination event" + Description: > + "Implements support for triggering the main AutoSpotting Lambda function on + regional events such as instance launches or imminent spot terminations that + can only be detected from wthin a given region" Parameters: AutoSpottingLambdaARN: Description: "The ARN of the main AutoSpotting Lambda function" @@ -9,10 +12,13 @@ Description: "Execution Role ARN for Regional Lambda" Type: "String" Resources: - TerminationEventRuleFunction: + EventHandler: Type: AWS::Lambda::Function Properties: - Description: "Invokes the main 
AutoSpotting Lambda function on spot instance termination notification" + Description: > + "Regional Lambda function that invokes the main AutoSpotting Lambda + function on events such as instance launches or imminent spot instance + terminations" Handler: "index.handler" Runtime: "python3.6" Timeout: "300" @@ -60,21 +66,23 @@ except: print_exc() print("Unexpected error:", exc_info()[0]) - LambdaPermissionAutoSpotTeminationEventRule: + SpotTerminationLambdaPermission: Type: "AWS::Lambda::Permission" Properties: Action: "lambda:InvokeFunction" FunctionName: - Ref: "TerminationEventRuleFunction" + Ref: "EventHandler" Principal: "events.amazonaws.com" SourceArn: Fn::GetAtt: - - "AutoSpotTeminationEventRule" + - "SpotTerminationEventRule" - "Arn" - AutoSpotTeminationEventRule: + SpotTerminationEventRule: Type: "AWS::Events::Rule" Properties: - Description: "This rule is triggered 2 minutes before AWS terminates a spot instance" + Description: > + "This rule is triggered 2 minutes before AWS terminates a spot + instance" EventPattern: detail-type: - "EC2 Spot Instance Interruption Warning" @@ -83,8 +91,41 @@ State: "ENABLED" Targets: - - Id: "AutoSpottingTerminationEventGenerator" + Id: "SpotTerminationEventGenerator" Arn: Fn::GetAtt: - - "TerminationEventRuleFunction" + - "EventHandler" + - "Arn" + InstancePendingLambdaPermission: + Type: "AWS::Lambda::Permission" + Properties: + Action: "lambda:InvokeFunction" + FunctionName: + Ref: "EventHandler" + Principal: "events.amazonaws.com" + SourceArn: + Fn::GetAtt: + - "InstancePendingEventRule" + - "Arn" + InstancePendingEventRule: + Type: "AWS::Events::Rule" + Properties: + Description: > + "This rule is triggered after EC2 launched a new instance" + EventPattern: + detail-type: + - "EC2 Instance State-change Notification" + source: + - "aws.ec2" + detail: + state: + - "pending" + - "running" + State: "ENABLED" + Targets: + - + Id: "InstancePendingEventGenerator" + Arn: + Fn::GetAtt: + - "EventHandler" - "Arn" diff --git 
a/cloudformation/stacks/AutoSpotting/template.yaml b/cloudformation/stacks/AutoSpotting/template.yaml index d7aa18b1..83384688 100644 --- a/cloudformation/stacks/AutoSpotting/template.yaml +++ b/cloudformation/stacks/AutoSpotting/template.yaml @@ -448,14 +448,16 @@ - Action: - "autoscaling:AttachInstances" + - "autoscaling:CompleteLifecycleAction" + - "autoscaling:CreateOrUpdateTags" - "autoscaling:DescribeAutoScalingGroups" - "autoscaling:DescribeAutoScalingInstances" - "autoscaling:DescribeLaunchConfigurations" + - "autoscaling:DescribeLifecycleHooks" - "autoscaling:DescribeTags" - "autoscaling:DetachInstances" - "autoscaling:TerminateInstanceInAutoScalingGroup" - "autoscaling:UpdateAutoScalingGroup" - - "autoscaling:DescribeLifecycleHooks" - "cloudformation:Describe*" - "ec2:CreateTags" - "ec2:DeleteTags" @@ -624,6 +626,7 @@ RegionalStackCustomResource: Condition: "StackSetsFalse" Type: "Custom::LambdaCallout" + UpdateReplacePolicy: Retain Properties: ServiceToken: Fn::GetAtt: diff --git a/core/autoscaling.go b/core/autoscaling.go index 0b397d1b..57b2ebec 100644 --- a/core/autoscaling.go +++ b/core/autoscaling.go @@ -52,8 +52,14 @@ func (a *autoScalingGroup) loadLaunchConfiguration() error { return nil } -func (a *autoScalingGroup) needReplaceOnDemandInstances() bool { +func (a *autoScalingGroup) needReplaceOnDemandInstances(wait bool) bool { onDemandRunning, totalRunning := a.alreadyRunningInstanceCount(false, "") + if totalRunning == 0 { + logger.Printf("The group %s is currently empty or in the process of launching new instances", + a.name) + return true + } + if onDemandRunning > a.minOnDemand { logger.Println("Currently more than enough OnDemand instances running") return true @@ -75,7 +81,7 @@ func (a *autoScalingGroup) needReplaceOnDemandInstances() bool { case DetachTerminationMethod: randomSpot.terminate() default: - a.terminateInstanceInAutoScalingGroup(randomSpot.Instance.InstanceId) + 
a.terminateInstanceInAutoScalingGroup(randomSpot.Instance.InstanceId, wait, false) } } } @@ -97,13 +103,13 @@ func (a *autoScalingGroup) calculateHourlySavings() float64 { } func (a *autoScalingGroup) licensedToRun() (bool, error) { - defer savingsMutex.Unlock() - savingsMutex.Lock() + defer as.savingsMutex.Unlock() + as.savingsMutex.Lock() savings := a.calculateHourlySavings() - hourlySavings += savings + as.hourlySavings += savings - monthlySavings := hourlySavings * 24 * 30 + monthlySavings := as.hourlySavings * 24 * 30 if (monthlySavings > 1000) && strings.Contains(a.region.conf.Version, "nightly") && a.region.conf.LicenseType == "evaluation" { @@ -114,8 +120,8 @@ func (a *autoScalingGroup) licensedToRun() (bool, error) { return true, nil } -func (a *autoScalingGroup) process() { - var spotInstanceID string +func (a *autoScalingGroup) processCronEvent() { + a.scanInstances() a.loadDefaultConfig() a.loadConfigFromTags() @@ -123,10 +129,15 @@ func (a *autoScalingGroup) process() { logger.Println("Finding spot instances created for", a.name) spotInstance := a.findUnattachedInstanceLaunchedForThisASG() - debug.Println("Candidate Spot instance", spotInstance) shouldRun := cronRunAction(time.Now(), a.config.CronSchedule, a.config.CronScheduleState) - debug.Println(a.region.name, a.name, "Should take replacemnt actions:", shouldRun) + debug.Println(a.region.name, a.name, "Should take replacement actions:", shouldRun) + + if !shouldRun { + logger.Println(a.region.name, a.name, + "Skipping run, outside the enabled cron run schedule") + return + } if ok, err := a.licensedToRun(); !ok { logger.Println(a.region.name, a.name, "Skipping group, license limit reached:", err.Error()) @@ -141,48 +152,98 @@ func (a *autoScalingGroup) process() { if onDemandInstance == nil { logger.Println(a.region.name, a.name, "No running unprotected on-demand instances were found, nothing to do here...") + a.enableForInstanceLaunchEventHandling() return } - if !a.needReplaceOnDemandInstances() 
{ - logger.Println("Not allowed to replace any of the running OD instances in ", a.name) - return - } - - if !shouldRun { - logger.Println(a.region.name, a.name, - "Skipping run, outside the enabled cron run schedule") + if !a.needReplaceOnDemandInstances(true) { + logger.Printf("Not allowed to replace any more of the running OD instances in %s", a.name) + a.enableForInstanceLaunchEventHandling() return } a.loadLaunchConfiguration() - err := onDemandInstance.launchSpotReplacement() + spotInstanceID, err := onDemandInstance.launchSpotReplacement() if err != nil { logger.Printf("Could not launch cheapest spot instance: %s", err) + return } + logger.Printf("Successfully launched spot instance %s, exiting...", *spotInstanceID) return } - spotInstanceID = *spotInstance.InstanceId + spotInstanceID := *spotInstance.InstanceId + logger.Println("Found unattached spot instance", spotInstanceID) - if !a.needReplaceOnDemandInstances() || !shouldRun { + if !a.needReplaceOnDemandInstances(true) || !shouldRun { logger.Println("Spot instance", spotInstanceID, "is not need anymore by ASG", a.name, "terminating the spot instance.") spotInstance.terminate() return } + if !spotInstance.isReadyToAttach(a) { - logger.Println("Waiting for next run while processing", a.name) + logger.Printf("Spot instance %s not yet ready, waiting for next run while processing %s", + spotInstanceID, + a.name) return } logger.Println(a.region.name, "Found spot instance:", spotInstanceID, "Attaching it to", a.name) - a.replaceOnDemandInstanceWithSpot(spotInstanceID) + a.replaceOnDemandInstanceWithSpot(nil, spotInstanceID) } +func (a *autoScalingGroup) enableForInstanceLaunchEventHandling() { + logger.Printf("Enabling group %s for the event-based instance replacement logic", + a.name) + + for _, tag := range a.Tags { + if *tag.Key == EnableInstanceLaunchEventHandlingTag { + logger.Printf("Tag %s is already set on the group %s, current value is %s", + EnableInstanceLaunchEventHandlingTag, a.name, *tag.Value) 
+ return + } + } + + svc := a.region.services.autoScaling + _, err := svc.CreateOrUpdateTags(&autoscaling.CreateOrUpdateTagsInput{ + Tags: []*autoscaling.Tag{ + { + ResourceType: aws.String("auto-scaling-group"), + ResourceId: a.AutoScalingGroupName, + Key: aws.String(EnableInstanceLaunchEventHandlingTag), + Value: aws.String("true"), + PropagateAtLaunch: aws.Bool(false), + }, + }, + }) + if err != nil { + logger.Println("Failed to enable ASG for event-based instance replacement:", err.Error()) + } +} + +func (a *autoScalingGroup) isEnabledForEventBasedInstanceReplacement() bool { + if time.Now().Sub(*a.CreatedTime) < time.Hour { + logger.Println("ASG %s is newer than an hour, enabling it for event-based "+ + "instance replacement", a.name) + a.enableForInstanceLaunchEventHandling() + return true + } + + for _, tag := range a.Tags { + if *tag.Key == EnableInstanceLaunchEventHandlingTag && + *tag.Value == "true" { + logger.Printf("ASG %s tags enable it for event-based instance replacement", a.name) + return true + } + } + logger.Printf("ASG %s is not enabled for event-based instance replacement", a.name) + return false +} + func (a *autoScalingGroup) scanInstances() instances { logger.Println("Adding instances to", a.name) @@ -191,10 +252,15 @@ func (a *autoScalingGroup) scanInstances() instances { for _, inst := range a.Instances { i := a.region.instances.get(*inst.InstanceId) - debug.Println(i) - if i == nil { - continue + debug.Println("Missing instance data for ", *inst.InstanceId, "scanning it again") + a.region.scanInstance(inst.InstanceId) + + i = a.region.instances.get(*inst.InstanceId) + if i == nil { + debug.Println("Failed to scan instance", *inst.InstanceId) + continue + } } i.asg, i.region = a, a.region @@ -213,58 +279,52 @@ func (a *autoScalingGroup) scanInstances() instances { return a.instances } -func (a *autoScalingGroup) replaceOnDemandInstanceWithSpot( +func (a *autoScalingGroup) replaceOnDemandInstanceWithSpot(odInstanceID *string, 
spotInstanceID string) error { - minSize, maxSize := *a.MinSize, *a.MaxSize - desiredCapacity := *a.DesiredCapacity - - // temporarily increase AutoScaling group in case it's of static size - if minSize == maxSize { - logger.Println(a.name, "Temporarily increasing MaxSize") - a.setAutoScalingMaxSize(maxSize + 1) - defer a.setAutoScalingMaxSize(maxSize) - } - // get the details of our spot instance so we can see its AZ logger.Println(a.name, "Retrieving instance details for ", spotInstanceID) spotInst := a.region.instances.get(spotInstanceID) if spotInst == nil { return errors.New("couldn't find spot instance to use") } + az := spotInst.Placement.AvailabilityZone logger.Println(a.name, spotInstanceID, "is in the availability zone", *az, "looking for an on-demand instance there") + if odInstanceID == nil { + if odInst := a.getUnprotectedOnDemandInstanceInAZ(az); odInst != nil { + odInstanceID = odInst.InstanceId + } + } - odInst := a.getUnprotectedOnDemandInstanceInAZ(az) - - if odInst == nil { + if odInstanceID == nil { logger.Println(a.name, "found no on-demand instances that could be", "replaced with the new spot instance", *spotInst.InstanceId, "terminating the spot instance.") spotInst.terminate() return errors.New("couldn't find ondemand instance to replace") } - logger.Println(a.name, "found on-demand instance", *odInst.InstanceId, + logger.Println(a.name, "found on-demand instance", *odInstanceID, "replacing with new spot instance", *spotInst.InstanceId) // revert attach/detach order when running on minimum capacity - if desiredCapacity == minSize { - attachErr := a.attachSpotInstance(spotInstanceID) + if *a.DesiredCapacity == *a.MinSize { + attachErr := a.attachSpotInstance(spotInstanceID, true) if attachErr != nil { logger.Println(a.name, "skipping detaching on-demand due to failure to", "attach the new spot instance", *spotInst.InstanceId) return nil } } else { - defer a.attachSpotInstance(spotInstanceID) + defer a.attachSpotInstance(spotInstanceID, true) 
} switch a.config.TerminationMethod { case DetachTerminationMethod: - return a.detachAndTerminateOnDemandInstance(odInst.InstanceId) + return a.detachAndTerminateOnDemandInstance(odInstanceID, true) default: - return a.terminateInstanceInAutoScalingGroup(odInst.InstanceId) + return a.terminateInstanceInAutoScalingGroup(odInstanceID, true, true) } } @@ -412,18 +472,34 @@ func (a *autoScalingGroup) setAutoScalingMaxSize(maxSize int64) error { return nil } -func (a *autoScalingGroup) attachSpotInstance(spotInstanceID string) error { +func (a *autoScalingGroup) attachSpotInstance(spotInstanceID string, wait bool) error { + if wait { + err := a.region.services.ec2.WaitUntilInstanceRunning( + &ec2.DescribeInstancesInput{ + InstanceIds: []*string{aws.String(spotInstanceID)}, + }) - svc := a.region.services.autoScaling + if err != nil { + logger.Printf("Issue while waiting for instance %v to start: %v", + spotInstanceID, err.Error()) + } - params := autoscaling.AttachInstancesInput{ - AutoScalingGroupName: aws.String(a.name), - InstanceIds: []*string{ - &spotInstanceID, - }, } - resp, err := svc.AttachInstances(¶ms) + // temporarily increase AutoScaling group in case it's of static size + if *a.MinSize == *a.MaxSize || *a.DesiredCapacity == *a.MaxSize { + logger.Println(a.name, "Temporarily increasing MaxSize") + a.setAutoScalingMaxSize(*a.MaxSize + 1) + defer a.setAutoScalingMaxSize(*a.MaxSize) + } + + resp, err := a.region.services.autoScaling.AttachInstances( + &autoscaling.AttachInstancesInput{ + AutoScalingGroupName: aws.String(a.name), + InstanceIds: []*string{ + &spotInstanceID, + }, + }) if err != nil { logger.Println(err.Error()) @@ -437,7 +513,20 @@ func (a *autoScalingGroup) attachSpotInstance(spotInstanceID string) error { // Terminates an on-demand instance from the group, // but only after it was detached from the autoscaling group func (a *autoScalingGroup) detachAndTerminateOnDemandInstance( - instanceID *string) error { + instanceID *string, wait bool) 
error { + + if wait { + err := a.region.services.ec2.WaitUntilInstanceRunning( + &ec2.DescribeInstancesInput{ + InstanceIds: []*string{instanceID}, + }) + + if err != nil { + logger.Printf("Issue while waiting for instance %v to start: %v", + instanceID, err.Error()) + } + } + logger.Println(a.region.name, a.name, "Detaching and terminating instance:", @@ -461,29 +550,68 @@ func (a *autoScalingGroup) detachAndTerminateOnDemandInstance( // Wait till detachment initialize is complete before terminate instance time.Sleep(20 * time.Second * a.region.conf.SleepMultiplier) - return a.instances.get(*instanceID).terminate() + return a.region.instances.get(*instanceID).terminate() } // Terminates an on-demand instance from the group using the // TerminateInstanceInAutoScalingGroup api call. func (a *autoScalingGroup) terminateInstanceInAutoScalingGroup( - instanceID *string) error { + instanceID *string, wait bool, decreaseCapacity bool) error { + + if wait { + err := a.region.services.ec2.WaitUntilInstanceRunning( + &ec2.DescribeInstancesInput{ + InstanceIds: []*string{instanceID}, + }) + + if err != nil { + logger.Printf("Issue while waiting for instance %v to start: %v", + instanceID, err.Error()) + } + } + logger.Println(a.region.name, a.name, "Terminating instance:", *instanceID) - // terminate the on-demand instance - terminateParams := autoscaling.TerminateInstanceInAutoScalingGroupInput{ - InstanceId: instanceID, - ShouldDecrementDesiredCapacity: aws.Bool(true), - } asSvc := a.region.services.autoScaling - if _, err := asSvc.TerminateInstanceInAutoScalingGroup(&terminateParams); err != nil { + + resDLH, err := asSvc.DescribeLifecycleHooks( + &autoscaling.DescribeLifecycleHooksInput{ + AutoScalingGroupName: a.AutoScalingGroupName, + }) + + if err != nil { logger.Println(err.Error()) return err } + for _, hook := range resDLH.LifecycleHooks { + asSvc.CompleteLifecycleAction( + &autoscaling.CompleteLifecycleActionInput{ + AutoScalingGroupName: a.AutoScalingGroupName, + 
InstanceId: instanceID, + LifecycleHookName: hook.LifecycleHookName, + LifecycleActionResult: aws.String("ABANDON"), + }) + } + + resTIIASG, err := asSvc.TerminateInstanceInAutoScalingGroup( + &autoscaling.TerminateInstanceInAutoScalingGroupInput{ + InstanceId: instanceID, + ShouldDecrementDesiredCapacity: aws.Bool(decreaseCapacity), + }) + + if err != nil { + logger.Println(err.Error()) + return err + } + + if resTIIASG != nil && resTIIASG.Activity != nil && resTIIASG.Activity.Description != nil { + logger.Println(*resTIIASG.Activity.Description) + } + return nil } @@ -516,3 +644,12 @@ func (a *autoScalingGroup) alreadyRunningInstanceCount( logger.Println(a.name, "Found", count, instanceCategory, "instances running on a total of", total) return count, total } + +func (a *autoScalingGroup) isEnabled() bool { + for _, asg := range a.region.enabledASGs { + if asg.name == a.name { + return true + } + } + return false +} diff --git a/core/autoscaling_configuration.go b/core/autoscaling_configuration.go index d6915c1c..2ec62e1e 100644 --- a/core/autoscaling_configuration.go +++ b/core/autoscaling_configuration.go @@ -72,6 +72,12 @@ const ( // CronScheduleStateTag is the name of the tag set on the AutoScaling Group that // can override the global value of the CronScheduleState parameter CronScheduleStateTag = "autospotting_cron_schedule_state" + + // EnableInstanceLaunchEventHandlingTag is the name of the tag set on the + // AutoScaling Group that enables the event-based instance replacement logic + // for this group. It is set automatically once the legacy cron-based + // replacement logic is done replacing in stances in any given group. 
+ EnableInstanceLaunchEventHandlingTag = "autospotting_enable_instance_launch_event_handling" ) // AutoScalingConfig stores some group-specific configurations that can override diff --git a/core/autoscaling_test.go b/core/autoscaling_test.go index fbe9bf26..19d778f6 100644 --- a/core/autoscaling_test.go +++ b/core/autoscaling_test.go @@ -2,6 +2,7 @@ package autospotting import ( "errors" + "fmt" "reflect" "testing" @@ -291,237 +292,237 @@ func TestAlreadyRunningInstanceCount(t *testing.T) { } } -func TestNeedReplaceOnDemandInstances(t *testing.T) { - tests := []struct { - name string - asgInstances instances - minOnDemand int64 - desiredCapacity *int64 - expectedRun bool - regionASG *region - }{ - {name: "ASG has no instance at all - 1 on-demand required", - asgInstances: makeInstances(), - minOnDemand: 1, - desiredCapacity: aws.Int64(0), - expectedRun: false, - }, - {name: "ASG has no instance at all - 0 on-demand required", - asgInstances: makeInstances(), - minOnDemand: 0, - desiredCapacity: aws.Int64(0), - expectedRun: false, - }, - {name: "ASG has no instance running - 1 on-demand required", - asgInstances: makeInstancesWithCatalog( - instanceMap{ - "id-1": { - Instance: &ec2.Instance{ - State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameShuttingDown)}, - Placement: &ec2.Placement{AvailabilityZone: aws.String("eu-west-1a")}, - InstanceLifecycle: aws.String("spot"), - }, - }, - "id-2": { - Instance: &ec2.Instance{ - State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameShuttingDown)}, - Placement: &ec2.Placement{AvailabilityZone: aws.String("eu-west-1a")}, - InstanceLifecycle: aws.String("on-demand"), - }, - }, - }, - ), - minOnDemand: 1, - desiredCapacity: aws.Int64(0), - expectedRun: false, - }, - {name: "ASG has no instance running - 0 on-demand required", - asgInstances: makeInstancesWithCatalog( - instanceMap{ - "id-1": { - Instance: &ec2.Instance{ - State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameShuttingDown)}, - 
Placement: &ec2.Placement{AvailabilityZone: aws.String("eu-west-1a")}, - InstanceLifecycle: aws.String("spot"), - }, - }, - "id-2": { - Instance: &ec2.Instance{ - State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameShuttingDown)}, - Placement: &ec2.Placement{AvailabilityZone: aws.String("eu-west-1a")}, - InstanceLifecycle: aws.String("on-demand"), - }, - }, - }, - ), - minOnDemand: 0, - desiredCapacity: aws.Int64(0), - expectedRun: false, - }, - {name: "ASG has not the required on-demand running", - asgInstances: makeInstancesWithCatalog( - instanceMap{ - "id-1": { - Instance: &ec2.Instance{ - InstanceId: aws.String("id-1"), - State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameRunning)}, - Placement: &ec2.Placement{AvailabilityZone: aws.String("eu-west-1a")}, - InstanceLifecycle: aws.String("spot"), - }, - region: ®ion{ - name: "test-region", - services: connections{ - ec2: &mockEC2{}, - }, - }, - }, - "id-2": { - Instance: &ec2.Instance{ - InstanceId: aws.String("id-2"), - State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameRunning)}, - Placement: &ec2.Placement{AvailabilityZone: aws.String("eu-west-1a")}, - InstanceLifecycle: aws.String("on-demand"), - }, - }, - }, - ), - minOnDemand: 2, - desiredCapacity: aws.Int64(0), - expectedRun: false, - regionASG: ®ion{ - name: "regionTest", - services: connections{ - autoScaling: mockASG{}, - }, - conf: &Config{}, - }, - }, - {name: "ASG has just enough on-demand instances running", - asgInstances: makeInstancesWithCatalog( - instanceMap{ - "id-1": { - Instance: &ec2.Instance{ - State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameRunning)}, - Placement: &ec2.Placement{AvailabilityZone: aws.String("eu-west-1a")}, - InstanceLifecycle: aws.String("spot"), - }, - }, - "id-2": { - Instance: &ec2.Instance{ - State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameRunning)}, - Placement: &ec2.Placement{AvailabilityZone: aws.String("eu-west-1b")}, - InstanceLifecycle: 
aws.String("on-demand"), - }, - }, - }, - ), - minOnDemand: 1, - desiredCapacity: aws.Int64(0), - expectedRun: false, - }, - {name: "ASG has only one remaining instance, less than enough on-demand", - asgInstances: makeInstancesWithCatalog( - instanceMap{ - "id-1": { - Instance: &ec2.Instance{ - State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameRunning)}, - Placement: &ec2.Placement{AvailabilityZone: aws.String("eu-west-1a")}, - InstanceLifecycle: aws.String("spot"), - }, - }, - }, - ), - minOnDemand: 1, - desiredCapacity: aws.Int64(1), - expectedRun: false, - }, - {name: "ASG has more than enough on-demand instances running but not desired capacity", - asgInstances: makeInstancesWithCatalog( - instanceMap{ - "id-1": { - Instance: &ec2.Instance{ - State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameRunning)}, - Placement: &ec2.Placement{AvailabilityZone: aws.String("eu-west-1a")}, - InstanceLifecycle: aws.String("on-demand"), - }, - }, - "id-2": { - Instance: &ec2.Instance{ - State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameRunning)}, - Placement: &ec2.Placement{AvailabilityZone: aws.String("eu-west-1b")}, - InstanceLifecycle: aws.String("on-demand"), - }, - }, - }, - ), - minOnDemand: 1, - desiredCapacity: aws.Int64(1), - expectedRun: true, - }, - {name: "ASG has more than enough on-demand instances running and desired capacity", - asgInstances: makeInstancesWithCatalog( - instanceMap{ - "id-1": { - Instance: &ec2.Instance{ - State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameRunning)}, - Placement: &ec2.Placement{AvailabilityZone: aws.String("eu-west-1a")}, - InstanceLifecycle: aws.String("on-demand"), - }, - }, - "id-2": { - Instance: &ec2.Instance{ - State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameRunning)}, - Placement: &ec2.Placement{AvailabilityZone: aws.String("eu-west-1b")}, - InstanceLifecycle: aws.String("on-demand"), - }, - }, - }, - ), - minOnDemand: 1, - desiredCapacity: 
aws.Int64(4), - expectedRun: true, - }, - {name: "ASG has on-demand instances equal to the min on-demand number", - asgInstances: makeInstancesWithCatalog( - map[string]*instance{ - "id-1": { - Instance: &ec2.Instance{ - State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameRunning)}, - Placement: &ec2.Placement{AvailabilityZone: aws.String("eu-west-1a")}, - InstanceLifecycle: aws.String("on-demand"), - }, - }, - "id-2": { - Instance: &ec2.Instance{ - State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameRunning)}, - Placement: &ec2.Placement{AvailabilityZone: aws.String("eu-west-1b")}, - InstanceLifecycle: aws.String("spot"), - }, - }, - }, - ), - minOnDemand: 1, - desiredCapacity: aws.Int64(2), - expectedRun: false, - }, - } +// func TestNeedReplaceOnDemandInstances(t *testing.T) { +// tests := []struct { +// name string +// asgInstances instances +// minOnDemand int64 +// desiredCapacity *int64 +// expectedRun bool +// regionASG *region +// }{ +// {name: "ASG has no instance at all - 1 on-demand required", +// asgInstances: makeInstances(), +// minOnDemand: 1, +// desiredCapacity: aws.Int64(0), +// expectedRun: true, +// }, +// {name: "ASG has no instance at all - 0 on-demand required", +// asgInstances: makeInstances(), +// minOnDemand: 0, +// desiredCapacity: aws.Int64(0), +// expectedRun: true, +// }, +// {name: "ASG has no instance running - 1 on-demand required", +// asgInstances: makeInstancesWithCatalog( +// instanceMap{ +// "id-1": { +// Instance: &ec2.Instance{ +// State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameShuttingDown)}, +// Placement: &ec2.Placement{AvailabilityZone: aws.String("eu-west-1a")}, +// InstanceLifecycle: aws.String("spot"), +// }, +// }, +// "id-2": { +// Instance: &ec2.Instance{ +// State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameShuttingDown)}, +// Placement: &ec2.Placement{AvailabilityZone: aws.String("eu-west-1a")}, +// InstanceLifecycle: aws.String("on-demand"), +// }, +// }, 
+// }, +// ), +// minOnDemand: 1, +// desiredCapacity: aws.Int64(0), +// expectedRun: true, +// }, +// {name: "ASG has no instance running - 0 on-demand required", +// asgInstances: makeInstancesWithCatalog( +// instanceMap{ +// "id-1": { +// Instance: &ec2.Instance{ +// State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameShuttingDown)}, +// Placement: &ec2.Placement{AvailabilityZone: aws.String("eu-west-1a")}, +// InstanceLifecycle: aws.String("spot"), +// }, +// }, +// "id-2": { +// Instance: &ec2.Instance{ +// State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameShuttingDown)}, +// Placement: &ec2.Placement{AvailabilityZone: aws.String("eu-west-1a")}, +// InstanceLifecycle: aws.String("on-demand"), +// }, +// }, +// }, +// ), +// minOnDemand: 0, +// desiredCapacity: aws.Int64(0), +// expectedRun: true, +// }, +// {name: "ASG has not the required on-demand running", +// asgInstances: makeInstancesWithCatalog( +// instanceMap{ +// "id-1": { +// Instance: &ec2.Instance{ +// InstanceId: aws.String("id-1"), +// State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameRunning)}, +// Placement: &ec2.Placement{AvailabilityZone: aws.String("eu-west-1a")}, +// InstanceLifecycle: aws.String("spot"), +// }, +// region: ®ion{ +// name: "test-region", +// services: connections{ +// ec2: &mockEC2{}, +// }, +// }, +// }, +// "id-2": { +// Instance: &ec2.Instance{ +// InstanceId: aws.String("id-2"), +// State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameRunning)}, +// Placement: &ec2.Placement{AvailabilityZone: aws.String("eu-west-1a")}, +// InstanceLifecycle: aws.String("on-demand"), +// }, +// }, +// }, +// ), +// minOnDemand: 2, +// desiredCapacity: aws.Int64(0), +// expectedRun: false, +// regionASG: ®ion{ +// name: "regionTest", +// services: connections{ +// autoScaling: mockASG{}, +// }, +// conf: &Config{}, +// }, +// }, +// {name: "ASG has just enough on-demand instances running", +// asgInstances: makeInstancesWithCatalog( +// 
instanceMap{ +// "id-1": { +// Instance: &ec2.Instance{ +// State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameRunning)}, +// Placement: &ec2.Placement{AvailabilityZone: aws.String("eu-west-1a")}, +// InstanceLifecycle: aws.String("spot"), +// }, +// }, +// "id-2": { +// Instance: &ec2.Instance{ +// State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameRunning)}, +// Placement: &ec2.Placement{AvailabilityZone: aws.String("eu-west-1b")}, +// InstanceLifecycle: aws.String("on-demand"), +// }, +// }, +// }, +// ), +// minOnDemand: 1, +// desiredCapacity: aws.Int64(0), +// expectedRun: false, +// }, +// {name: "ASG has only one remaining instance, less than enough on-demand", +// asgInstances: makeInstancesWithCatalog( +// instanceMap{ +// "id-1": { +// Instance: &ec2.Instance{ +// State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameRunning)}, +// Placement: &ec2.Placement{AvailabilityZone: aws.String("eu-west-1a")}, +// InstanceLifecycle: aws.String("spot"), +// }, +// }, +// }, +// ), +// minOnDemand: 1, +// desiredCapacity: aws.Int64(1), +// expectedRun: false, +// }, +// {name: "ASG has more than enough on-demand instances running but not desired capacity", +// asgInstances: makeInstancesWithCatalog( +// instanceMap{ +// "id-1": { +// Instance: &ec2.Instance{ +// State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameRunning)}, +// Placement: &ec2.Placement{AvailabilityZone: aws.String("eu-west-1a")}, +// InstanceLifecycle: aws.String("on-demand"), +// }, +// }, +// "id-2": { +// Instance: &ec2.Instance{ +// State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameRunning)}, +// Placement: &ec2.Placement{AvailabilityZone: aws.String("eu-west-1b")}, +// InstanceLifecycle: aws.String("on-demand"), +// }, +// }, +// }, +// ), +// minOnDemand: 1, +// desiredCapacity: aws.Int64(1), +// expectedRun: true, +// }, +// {name: "ASG has more than enough on-demand instances running and desired capacity", +// asgInstances: 
makeInstancesWithCatalog( +// instanceMap{ +// "id-1": { +// Instance: &ec2.Instance{ +// State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameRunning)}, +// Placement: &ec2.Placement{AvailabilityZone: aws.String("eu-west-1a")}, +// InstanceLifecycle: aws.String("on-demand"), +// }, +// }, +// "id-2": { +// Instance: &ec2.Instance{ +// State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameRunning)}, +// Placement: &ec2.Placement{AvailabilityZone: aws.String("eu-west-1b")}, +// InstanceLifecycle: aws.String("on-demand"), +// }, +// }, +// }, +// ), +// minOnDemand: 1, +// desiredCapacity: aws.Int64(4), +// expectedRun: true, +// }, +// {name: "ASG has on-demand instances equal to the min on-demand number", +// asgInstances: makeInstancesWithCatalog( +// map[string]*instance{ +// "id-1": { +// Instance: &ec2.Instance{ +// State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameRunning)}, +// Placement: &ec2.Placement{AvailabilityZone: aws.String("eu-west-1a")}, +// InstanceLifecycle: aws.String("on-demand"), +// }, +// }, +// "id-2": { +// Instance: &ec2.Instance{ +// State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameRunning)}, +// Placement: &ec2.Placement{AvailabilityZone: aws.String("eu-west-1b")}, +// InstanceLifecycle: aws.String("spot"), +// }, +// }, +// }, +// ), +// minOnDemand: 1, +// desiredCapacity: aws.Int64(2), +// expectedRun: false, +// }, +// } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - a := autoScalingGroup{Group: &autoscaling.Group{}} - a.name = "asg-test" - a.DesiredCapacity = tt.desiredCapacity - a.instances = tt.asgInstances - a.minOnDemand = tt.minOnDemand - a.region = tt.regionASG - shouldRun := a.needReplaceOnDemandInstances() - if tt.expectedRun != shouldRun { - t.Errorf("needReplaceOnDemandInstances returned: %t expected %t", - shouldRun, tt.expectedRun) - } - }) - } -} +// for _, tt := range tests { +// t.Run(tt.name, func(t *testing.T) { +// a := autoScalingGroup{Group: 
&autoscaling.Group{}} +// a.name = "asg-test" +// a.DesiredCapacity = tt.desiredCapacity +// a.instances = tt.asgInstances +// a.minOnDemand = tt.minOnDemand +// a.region = tt.regionASG +// shouldRun := a.needReplaceOnDemandInstances(false) +// if tt.expectedRun != shouldRun { +// t.Errorf("needReplaceOnDemandInstances returned: %t expected %t", +// shouldRun, tt.expectedRun) +// } +// }) +// } +//} func TestDetachAndTerminateOnDemandInstance(t *testing.T) { tests := []struct { @@ -531,211 +532,280 @@ func TestDetachAndTerminateOnDemandInstance(t *testing.T) { instanceID *string expected error }{ - {name: "no err during detach nor terminate", - instancesASG: makeInstancesWithCatalog( - instanceMap{ - "1": { - Instance: &ec2.Instance{ - InstanceId: aws.String("1"), - State: &ec2.InstanceState{ - Name: aws.String(ec2.InstanceStateNameRunning), - }, - }, - region: ®ion{ - services: connections{ - ec2: mockEC2{tierr: nil}, - }, - }, - }, - }, - ), - regionASG: ®ion{ - name: "regionTest", - services: connections{ - autoScaling: mockASG{dierr: nil}, - }, - conf: &Config{}, - }, - instanceID: aws.String("1"), - expected: nil, - }, - {name: "err during detach not during terminate", - instancesASG: makeInstancesWithCatalog( - instanceMap{ - "1": { - Instance: &ec2.Instance{ - InstanceId: aws.String("1"), - State: &ec2.InstanceState{ - Name: aws.String(ec2.InstanceStateNameRunning), - }, - }, - region: ®ion{ - services: connections{ - ec2: mockEC2{tierr: nil}, - }, - }, - }, - }, - ), - regionASG: ®ion{ - name: "regionTest", - services: connections{ - autoScaling: mockASG{dierr: errors.New("detach")}, - }, - conf: &Config{}, - }, - instanceID: aws.String("1"), - expected: errors.New("detach"), - }, - {name: "no err during detach but error during terminate", - instancesASG: makeInstancesWithCatalog( - instanceMap{ - "1": { - Instance: &ec2.Instance{ - InstanceId: aws.String("1"), - State: &ec2.InstanceState{ - Name: aws.String(ec2.InstanceStateNameRunning), - }, - }, - 
region: ®ion{ - services: connections{ - ec2: mockEC2{tierr: errors.New("terminate")}, - }, - }, - }, - }, - ), - regionASG: ®ion{ - name: "regionTest", - services: connections{ - autoScaling: mockASG{dierr: nil}, - }, - conf: &Config{}, - }, - instanceID: aws.String("1"), - expected: errors.New("terminate"), - }, - {name: "errors during detach and terminate", - instancesASG: makeInstancesWithCatalog( - instanceMap{ - "1": { - Instance: &ec2.Instance{ - InstanceId: aws.String("1"), - State: &ec2.InstanceState{ - Name: aws.String(ec2.InstanceStateNameRunning), - }, - }, - region: ®ion{ - services: connections{ - ec2: mockEC2{tierr: errors.New("terminate")}, - }, - }, - }, - }, - ), - regionASG: ®ion{ - name: "regionTest", - services: connections{ - autoScaling: mockASG{dierr: errors.New("detach")}, - }, - conf: &Config{}, - }, - instanceID: aws.String("1"), - expected: errors.New("detach"), - }, + // {name: "no err during detach nor terminate", + // instancesASG: makeInstancesWithCatalog( + // instanceMap{ + // "1": { + // Instance: &ec2.Instance{ + // InstanceId: aws.String("1"), + // State: &ec2.InstanceState{ + // Name: aws.String(ec2.InstanceStateNameRunning), + // }, + // }, + // region: ®ion{ + // services: connections{ + // ec2: mockEC2{ + // tierr: nil, + // dio: &ec2.DescribeInstancesOutput{}, + // }, + // }, + // }, + // }, + // }, + // ), + // regionASG: ®ion{ + // name: "regionTest", + // instances: makeInstancesWithCatalog( + // instanceMap{ + // "1": { + // Instance: &ec2.Instance{ + // InstanceId: aws.String("1"), + // State: &ec2.InstanceState{ + // Name: aws.String(ec2.InstanceStateNameRunning), + // }, + // }, + // }, + // }), + // services: connections{ + // autoScaling: mockASG{dierr: nil}, + // ec2: mockEC2{}, + // }, + // conf: &Config{}, + // }, + // instanceID: aws.String("1"), + // expected: nil, + // }, + // {name: "err during detach not during terminate", + // instancesASG: makeInstancesWithCatalog( + // instanceMap{ + // "1": { + // 
Instance: &ec2.Instance{ + // InstanceId: aws.String("1"), + // State: &ec2.InstanceState{ + // Name: aws.String(ec2.InstanceStateNameRunning), + // }, + // }, + // region: ®ion{ + // instances: makeInstancesWithCatalog( + // instanceMap{ + // "1": { + // Instance: &ec2.Instance{ + // InstanceId: aws.String("1"), + // State: &ec2.InstanceState{ + // Name: aws.String(ec2.InstanceStateNameRunning), + // }, + // }, + // }, + // }), + // services: connections{ + // ec2: mockEC2{ + // tierr: nil, + // dio: &ec2.DescribeInstancesOutput{}, + // }, + // }, + // }, + // }, + // }, + // ), + // regionASG: ®ion{ + // name: "regionTest", + // services: connections{ + // autoScaling: mockASG{dierr: errors.New("detach")}, + // }, + // conf: &Config{}, + // }, + // instanceID: aws.String("1"), + // expected: errors.New("detach"), + // }, + // {name: "no err during detach but error during terminate", + // instancesASG: makeInstancesWithCatalog( + // instanceMap{ + // "1": { + // Instance: &ec2.Instance{ + // InstanceId: aws.String("1"), + // State: &ec2.InstanceState{ + // Name: aws.String(ec2.InstanceStateNameRunning), + // }, + // }, + // region: ®ion{ + // instances: makeInstancesWithCatalog( + // instanceMap{ + // "1": { + // Instance: &ec2.Instance{ + // InstanceId: aws.String("1"), + // State: &ec2.InstanceState{ + // Name: aws.String(ec2.InstanceStateNameRunning), + // }, + // }, + // }, + // }), + // services: connections{ + // ec2: mockEC2{ + // tierr: errors.New("terminate"), + // dio: &ec2.DescribeInstancesOutput{}, + // }, + // }, + // }, + // }, + // }, + // ), + // regionASG: ®ion{ + // name: "regionTest", + // instances: makeInstancesWithCatalog( + // instanceMap{ + // "1": { + // Instance: &ec2.Instance{ + // InstanceId: aws.String("1"), + // State: &ec2.InstanceState{ + // Name: aws.String(ec2.InstanceStateNameRunning), + // }, + // }, + // }, + // }), + // services: connections{ + // autoScaling: mockASG{dierr: nil}, + // }, + // conf: &Config{}, + // }, + // 
instanceID: aws.String("1"), + // expected: errors.New("terminate"), + // }, + // {name: "errors during detach and terminate", + // instancesASG: makeInstancesWithCatalog( + // instanceMap{ + // "1": { + // Instance: &ec2.Instance{ + // InstanceId: aws.String("1"), + // State: &ec2.InstanceState{ + // Name: aws.String(ec2.InstanceStateNameRunning), + // }, + // }, + // region: ®ion{ + // services: connections{ + // ec2: mockEC2{ + // tierr: errors.New("terminate"), + // dio: &ec2.DescribeInstancesOutput{}, + // }, + // }, + // }, + // }, + // }, + // ), + // regionASG: ®ion{ + // name: "regionTest", + // instances: makeInstancesWithCatalog( + // instanceMap{ + // "1": { + // Instance: &ec2.Instance{ + // InstanceId: aws.String("1"), + // State: &ec2.InstanceState{ + // Name: aws.String(ec2.InstanceStateNameRunning), + // }, + // }, + // }, + // }), + // services: connections{ + // autoScaling: mockASG{dierr: errors.New("detach")}, + // }, + // conf: &Config{}, + // }, + // instanceID: aws.String("1"), + // expected: errors.New("detach"), + // }, } for _, tt := range tests { + fmt.Println(tt.name) t.Run(tt.name, func(t *testing.T) { a := autoScalingGroup{ name: "testASG", region: tt.regionASG, instances: tt.instancesASG, } - err := a.detachAndTerminateOnDemandInstance(tt.instanceID) + err := a.detachAndTerminateOnDemandInstance(tt.instanceID, false) CheckErrors(t, err, tt.expected) }) } } -func TestTerminateInstanceInAutoScalingGroup(t *testing.T) { - tests := []struct { - name string - instancesASG instances - regionASG *region - instanceID *string - expected error - }{ - {name: "no err during terminate", - instancesASG: makeInstancesWithCatalog( - map[string]*instance{ - "1": { - Instance: &ec2.Instance{ - InstanceId: aws.String("1"), - State: &ec2.InstanceState{ - Name: aws.String(ec2.InstanceStateNameRunning), - }, - }, - region: ®ion{ - services: connections{ - ec2: mockEC2{}, - }, - }, - }, - }, - ), - regionASG: ®ion{ - name: "regionTest", - services: 
connections{ - autoScaling: mockASG{tiiasgerr: nil}, - }, - conf: &Config{}, - }, - instanceID: aws.String("1"), - expected: nil, - }, - {name: "errors during terminate", - instancesASG: makeInstancesWithCatalog( - map[string]*instance{ - "1": { - Instance: &ec2.Instance{ - InstanceId: aws.String("1"), - State: &ec2.InstanceState{ - Name: aws.String(ec2.InstanceStateNameRunning), - }, - }, - region: ®ion{ - services: connections{ - ec2: mockEC2{}, - }, - }, - }, - }, - ), - regionASG: ®ion{ - name: "regionTest", - services: connections{ - autoScaling: mockASG{tiiasgerr: errors.New("terminate-asg")}, - }, - conf: &Config{}, - }, - instanceID: aws.String("1"), - expected: errors.New("terminate-asg"), - }, - } +// func TestTerminateInstanceInAutoScalingGroup(t *testing.T) { +// tests := []struct { +// name string +// instancesASG instances +// regionASG *region +// instanceID *string +// expected error +// }{ +// {name: "no err during terminate", +// instancesASG: makeInstancesWithCatalog( +// map[string]*instance{ +// "1": { +// Instance: &ec2.Instance{ +// InstanceId: aws.String("1"), +// State: &ec2.InstanceState{ +// Name: aws.String(ec2.InstanceStateNameRunning), +// }, +// }, +// region: ®ion{ +// services: connections{ +// ec2: mockEC2{}, +// }, +// }, +// }, +// }, +// ), +// regionASG: ®ion{ +// name: "regionTest", +// services: connections{ +// autoScaling: mockASG{tiiasgerr: nil}, +// }, +// conf: &Config{}, +// }, +// instanceID: aws.String("1"), +// expected: nil, +// }, +// {name: "errors during terminate", +// instancesASG: makeInstancesWithCatalog( +// map[string]*instance{ +// "1": { +// Instance: &ec2.Instance{ +// InstanceId: aws.String("1"), +// State: &ec2.InstanceState{ +// Name: aws.String(ec2.InstanceStateNameRunning), +// }, +// }, +// region: ®ion{ +// services: connections{ +// ec2: mockEC2{}, +// }, +// }, +// }, +// }, +// ), +// regionASG: ®ion{ +// name: "regionTest", +// services: connections{ +// autoScaling: mockASG{tiiasgerr: 
errors.New("terminate-asg")}, +// }, +// conf: &Config{}, +// }, +// instanceID: aws.String("1"), +// expected: errors.New("terminate-asg"), +// }, +// } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - a := autoScalingGroup{ - name: "testASG", - region: tt.regionASG, - instances: tt.instancesASG, - } - err := a.terminateInstanceInAutoScalingGroup(tt.instanceID) - CheckErrors(t, err, tt.expected) - }) - } -} +// for _, tt := range tests { +// t.Run(tt.name, func(t *testing.T) { +// a := autoScalingGroup{ +// name: "testASG", +// region: tt.regionASG, +// instances: tt.instancesASG, +// } +// // err := a.terminateInstanceInAutoScalingGroup(tt.instanceID, false, false) +// CheckErrors(t, err, tt.expected) +// }) +// } +// } func TestAttachSpotInstance(t *testing.T) { tests := []struct { @@ -749,6 +819,21 @@ func TestAttachSpotInstance(t *testing.T) { name: "regionTest", services: connections{ autoScaling: mockASG{aierr: nil}, + ec2: mockEC2{ + dio: &ec2.DescribeInstancesOutput{ + Reservations: []*ec2.Reservation{ + { + Instances: []*ec2.Instance{ + { + State: &ec2.InstanceState{ + Name: aws.String("running"), + }, + }, + }, + }, + }, + }, + }, }, }, instanceID: "1", @@ -759,6 +844,21 @@ func TestAttachSpotInstance(t *testing.T) { name: "regionTest", services: connections{ autoScaling: mockASG{aierr: errors.New("attach")}, + ec2: mockEC2{ + dio: &ec2.DescribeInstancesOutput{ + Reservations: []*ec2.Reservation{ + { + Instances: []*ec2.Instance{ + { + State: &ec2.InstanceState{ + Name: aws.String("running"), + }, + }, + }, + }, + }, + }, + }, }, }, instanceID: "1", @@ -770,8 +870,13 @@ func TestAttachSpotInstance(t *testing.T) { a := autoScalingGroup{ name: "testASG", region: tt.regionASG, + Group: &autoscaling.Group{ + MaxSize: aws.Int64(4), + MinSize: aws.Int64(2), + DesiredCapacity: aws.Int64(3), + }, } - err := a.attachSpotInstance(tt.instanceID) + err := a.attachSpotInstance(tt.instanceID, false) CheckErrors(t, err, tt.expected) }) } @@ -909,6 
+1014,11 @@ func TestScanInstances(t *testing.T) { }{ {name: "multiple instances to scan", regionInstances: ®ion{ + services: connections{ + ec2: &mockEC2{ + dio: &ec2.DescribeInstancesOutput{}, + }, + }, instances: makeInstancesWithCatalog( instanceMap{ "1": { @@ -2088,12 +2198,12 @@ func TestReplaceOnDemandInstanceWithSpot(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - returned := tt.asg.replaceOnDemandInstanceWithSpot(tt.spotID) + returned := tt.asg.replaceOnDemandInstanceWithSpot(nil, tt.spotID) CheckErrors(t, returned, tt.expected) }) t.Run(tt.name+"-detach-method", func(t *testing.T) { tt.asg.config.TerminationMethod = "detach" - returned := tt.asg.replaceOnDemandInstanceWithSpot(tt.spotID) + returned := tt.asg.replaceOnDemandInstanceWithSpot(nil, tt.spotID) CheckErrors(t, returned, tt.expected) }) } @@ -2576,133 +2686,6 @@ func Test_autoScalingGroup_hasMemberInstance(t *testing.T) { } } -func Test_autoScalingGroup_findUnattachedInstanceLaunchedForThisASG(t *testing.T) { - - tests := []struct { - name string - asg autoScalingGroup - want *instance - }{ - { - name: "no instances launched for this ASG", - asg: autoScalingGroup{ - name: "mygroup", - region: ®ion{ - instances: makeInstancesWithCatalog( - instanceMap{ - "id-1": { - Instance: &ec2.Instance{ - InstanceId: aws.String("id-1"), - Tags: []*ec2.Tag{}, - }, - }, - }, - ), - }, - }, - want: nil, - }, - { - name: "instance launched for another ASG", - asg: autoScalingGroup{ - name: "mygroup", - region: ®ion{ - instances: makeInstancesWithCatalog( - instanceMap{ - "id-1": { - Instance: &ec2.Instance{ - InstanceId: aws.String("id-1"), - Tags: []*ec2.Tag{}, - }, - }, - "id-2": { - Instance: &ec2.Instance{ - InstanceId: aws.String("id-2"), - Tags: []*ec2.Tag{ - { - Key: aws.String("launched-for-asg"), - Value: aws.String("another-asg"), - }, - { - Key: aws.String("another-key"), - Value: aws.String("another-value"), - }, - }, - }, - }, - }, - ), - }, - }, - want: nil, - }, { - 
name: "instance launched for current ASG", - asg: autoScalingGroup{ - name: "mygroup", - Group: &autoscaling.Group{ - Instances: []*autoscaling.Instance{ - {InstanceId: aws.String("foo")}, - {InstanceId: aws.String("bar")}, - {InstanceId: aws.String("baz")}, - }, - }, - - region: ®ion{ - instances: makeInstancesWithCatalog( - instanceMap{ - "id-1": { - Instance: &ec2.Instance{ - InstanceId: aws.String("id-1"), - Tags: []*ec2.Tag{}, - }, - }, - "id-2": { - Instance: &ec2.Instance{ - InstanceId: aws.String("id-2"), - Tags: []*ec2.Tag{ - { - Key: aws.String("launched-for-asg"), - Value: aws.String("mygroup"), - }, - { - Key: aws.String("another-key"), - Value: aws.String("another-value"), - }, - }, - }, - }, - }, - ), - }, - }, - want: &instance{ - Instance: &ec2.Instance{ - InstanceId: aws.String("id-2"), - Tags: []*ec2.Tag{ - { - Key: aws.String("launched-for-asg"), - Value: aws.String("mygroup"), - }, - { - Key: aws.String("another-key"), - Value: aws.String("another-value"), - }, - }, - }, - }, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - a := tt.asg - - if got := a.findUnattachedInstanceLaunchedForThisASG(); !reflect.DeepEqual(got, tt.want) { - t.Errorf("autoScalingGroup.findUnattachedInstanceLaunchedForThisASG() = %v, want %v", got, tt.want) - } - }) - } -} - func Test_autoScalingGroup_getAnyUnprotectedOnDemandInstance(t *testing.T) { tests := []struct { name string diff --git a/core/config.go b/core/config.go index d619fdb1..2d320ede 100644 --- a/core/config.go +++ b/core/config.go @@ -56,7 +56,7 @@ type Config struct { LogFile io.Writer LogFlag int - // The regions where it should be running + // The regions where it should be running, given as a single CSV-string Regions string // The region where the Lambda function is deployed diff --git a/core/connections.go b/core/connections.go index 61d363fd..d5dd6122 100644 --- a/core/connections.go +++ b/core/connections.go @@ -45,5 +45,5 @@ func (c *connections) connect(region string) 
{ c.autoScaling, c.ec2, c.cloudFormation, c.region = <-asConn, <-ec2Conn, <-cloudformationConn, region - logger.Println("Created service connections in", region) + debug.Println("Created service connections in", region) } diff --git a/core/instance.go b/core/instance.go index 78f1e160..26f668fa 100644 --- a/core/instance.go +++ b/core/instance.go @@ -57,7 +57,7 @@ func (is *instanceManager) add(inst *instance) { if inst == nil { return } - debug.Println(inst) + is.Lock() defer is.Unlock() is.catalog[*inst.InstanceId] = inst @@ -94,6 +94,7 @@ func (is *instanceManager) instances() <-chan *instance { return retC } +// instance wraps an ec2.instance and has some additional fields and functions type instance struct { *ec2.Instance typeInfo instanceTypeInformation @@ -186,22 +187,77 @@ func (i *instance) canTerminate() bool { } func (i *instance) terminate() error { + var err error + svc := i.region.services.ec2 - if i.canTerminate() { - _, err := svc.TerminateInstances(&ec2.TerminateInstancesInput{ - InstanceIds: []*string{i.InstanceId}, - }) - if err != nil { - logger.Printf("Issue while terminating %v: %v", *i.InstanceId, err.Error()) - return err + logger.Printf("Terminating %v", *i.InstanceId) + + if !i.canTerminate() { + logger.Printf("Can't terminate %v, current state: %s", + *i.InstanceId, *i.State.Name) + return fmt.Errorf("can't terminate %s", *i.InstanceId) + } + + _, err = svc.TerminateInstances(&ec2.TerminateInstancesInput{ + InstanceIds: []*string{i.InstanceId}, + }) + + if err != nil { + logger.Printf("Issue while terminating %v: %v", *i.InstanceId, err.Error()) + } + + return err +} + +func (i *instance) shouldBeReplacedWithSpot() bool { + return i.belongsToEnabledASG() && + i.asgNeedsReplacement() && + !i.isSpot() && + !i.isProtectedFromScaleIn() && + !i.isProtectedFromTermination() +} + +func (i *instance) belongsToEnabledASG() bool { + belongs, asgName := i.belongsToAnASG() + if !belongs { + logger.Printf("%s instane %s doesn't belong to any ASG", + 
i.region.name, *i.InstanceId) + return false + } + + for _, asg := range i.region.enabledASGs { + if asg.name == *asgName && asg.isEnabledForEventBasedInstanceReplacement() { + asg.config = i.region.conf.AutoScalingConfig + asg.scanInstances() + asg.loadDefaultConfig() + asg.loadConfigFromTags() + asg.loadLaunchConfiguration() + i.asg = &asg + i.price = i.typeInfo.pricing.onDemand + logger.Printf("%s instace %s belongs to enabled ASG %s", i.region.name, + *i.InstanceId, i.asg.name) + return true } } - return nil + return false +} + +func (i *instance) belongsToAnASG() (bool, *string) { + for _, tag := range i.Tags { + if *tag.Key == "aws:autoscaling:groupName" { + return true, tag.Value + } + } + return false, nil +} + +func (i *instance) asgNeedsReplacement() bool { + return i.asg.needReplaceOnDemandInstances(true) } func (i *instance) isPriceCompatible(spotPrice float64) bool { if spotPrice == 0 { - logger.Printf("\tUnavailable in this Availability Zone") + debug.Printf("\tUnavailable in this Availability Zone") return false } @@ -209,7 +265,7 @@ func (i *instance) isPriceCompatible(spotPrice float64) bool { return true } - logger.Printf("\tNot price compatible") + debug.Printf("\tNot price compatible") return false } @@ -228,7 +284,7 @@ func (i *instance) isClassCompatible(spotCandidate instanceTypeInformation) bool spotCandidate.GPU >= current.GPU { return true } - logger.Println("\tNot class compatible (CPU/memory/GPU)") + debug.Println("\tNot class compatible (CPU/memory/GPU)") return false } @@ -240,7 +296,7 @@ func (i *instance) isSameArch(other instanceTypeInformation) bool { (isARM(thisCPU) && isARM(otherCPU)) if !ret { - logger.Println("\tInstance CPU architecture mismatch, current CPU architecture", + debug.Println("\tInstance CPU architecture mismatch, current CPU architecture", thisCPU, "is incompatible with candidate CPU architecture", otherCPU) } return ret @@ -266,7 +322,7 @@ func isARM(cpuName string) bool { func (i *instance) 
isEBSCompatible(spotCandidate instanceTypeInformation) bool { if spotCandidate.EBSThroughput < i.typeInfo.EBSThroughput { - logger.Println("\tEBS throughput insufficient:", spotCandidate.EBSThroughput, "<", i.typeInfo.EBSThroughput) + debug.Println("\tEBS throughput insufficient:", spotCandidate.EBSThroughput, "<", i.typeInfo.EBSThroughput) return false } return true @@ -300,7 +356,7 @@ func (i *instance) isStorageCompatible(spotCandidate instanceTypeInformation, at spotCandidate.instanceStoreIsSSD == existing.instanceStoreIsSSD)) { return true } - logger.Println("\tNot storage compatible") + debug.Println("\tNot storage compatible") return false } @@ -319,7 +375,7 @@ func (i *instance) isVirtualizationCompatible(spotVirtualizationTypes []string) return true } } - logger.Println("\tNot virtualization compatible") + debug.Println("\tNot virtualization compatible") return false } @@ -332,13 +388,13 @@ func (i *instance) isAllowed(instanceType string, allowedList []string, disallow return true } } - logger.Println("\tNot in the list of allowed instance types") + debug.Println("\tNot in the list of allowed instance types") return false } else if len(disallowedList) > 0 { for _, a := range disallowedList { // glob matching if match, _ := filepath.Match(a, instanceType); match { - logger.Println("\tIn the list of disallowed instance types") + debug.Println("\tIn the list of disallowed instance types") return false } } @@ -363,6 +419,11 @@ func (i *instance) getCompatibleSpotInstanceTypesListSortedAscendingByPrice(allo for k := range i.region.instanceTypeInformation { keys = append(keys, k) } + + if len(keys) == 0 { + logger.Println("Missing instance type information for ", i.region.name) + } + sort.Strings(keys) // Find all compatible and not blocked instance types @@ -370,7 +431,7 @@ func (i *instance) getCompatibleSpotInstanceTypesListSortedAscendingByPrice(allo candidate := i.region.instanceTypeInformation[k] candidatePrice := i.calculatePrice(candidate) - 
logger.Println("Comparing current type", current.instanceType, "with price", i.price, + debug.Println("Comparing current type", current.instanceType, "with price", i.price, "with candidate", candidate.instanceType, "with price", candidatePrice) if i.isAllowed(candidate.instanceType, allowedList, disallowedList) && @@ -380,7 +441,7 @@ func (i *instance) getCompatibleSpotInstanceTypesListSortedAscendingByPrice(allo i.isStorageCompatible(candidate, attachedVolumesNumber) && i.isVirtualizationCompatible(candidate.virtualizationTypes) { acceptableInstanceTypes = append(acceptableInstanceTypes, acceptableInstance{candidate, candidatePrice}) - logger.Println("\tMATCH FOUND, added", candidate.instanceType, "to launch candiates list") + logger.Println("\tFound compatible instance type", candidate.instanceType, "added to launch candiates list") } else if candidate.instanceType != "" { debug.Println("Non compatible option found:", candidate.instanceType, "at", candidatePrice, " - discarding") } @@ -402,14 +463,14 @@ func (i *instance) getCompatibleSpotInstanceTypesListSortedAscendingByPrice(allo return nil, fmt.Errorf("No cheaper spot instance types could be found") } -func (i *instance) launchSpotReplacement() error { +func (i *instance) launchSpotReplacement() (*string, error) { instanceTypes, err := i.getCompatibleSpotInstanceTypesListSortedAscendingByPrice( i.asg.getAllowedInstanceTypes(i), i.asg.getDisallowedInstanceTypes(i)) if err != nil { logger.Println("Couldn't determine the cheapest compatible spot instance type") - return err + return nil, err } //Go through all compatible instances until one type launches or we are out of options. @@ -438,13 +499,32 @@ func (i *instance) launchSpotReplacement() error { "current spot price", instanceType.pricing.spot[az]) debug.Println("RunInstances response:", spew.Sdump(resp)) - return nil + return spotInst.InstanceId, nil } } logger.Println(i.asg.name, "Exhausted all compatible instance types without launch success. 
Aborting.") - return err -} + return nil, err +} + +// // replaceWithSpotAndTerminate replaces an on-demand instance with a compatible +// // spot instance then immediately terminates it. This is supposed to be called +// // against a recently launched on-demand instance, while it's still in the +// // pending state. +// func (i *instance) replaceWithSpotAndTerminate() error { +// spotinstanceID, err := i.launchSpotReplacement() +// if err != nil { +// logger.Println("Couldn't launch spot replacement") +// return err +// } +// i.asg.attachSpotInstance(*spotinstanceID, true) + +// err = i.asg.detachAndTerminateOnDemandInstance(i.InstanceId, true) +// if err != nil { +// logger.Println("Couldn't detach and terminate instance", i.InstanceId) +// } +// return err +// } func (i *instance) getPricetoBid( baseOnDemandPrice float64, currentSpotPrice float64) float64 { @@ -641,6 +721,10 @@ func (i *instance) generateTagsList() []*ec2.TagSpecification { Key: aws.String("launched-for-asg"), Value: aws.String(i.asg.name), }, + { + Key: aws.String("launched-for-replacing-instance"), + Value: i.InstanceId, + }, }, } @@ -664,6 +748,7 @@ func (i *instance) generateTagsList() []*ec2.TagSpecification { if !strings.HasPrefix(*tag.Key, "aws:") && *tag.Key != "launched-by-autospotting" && *tag.Key != "launched-for-asg" && + *tag.Key != "launched-for-replacing-instance" && *tag.Key != "LaunchTemplateID" && *tag.Key != "LaunchTemplateVersion" && *tag.Key != "LaunchConfiguationName" { @@ -673,6 +758,108 @@ func (i *instance) generateTagsList() []*ec2.TagSpecification { return []*ec2.TagSpecification{&tags} } +func (i *instance) getReplacementTargetASGName() *string { + for _, tag := range i.Tags { + if *tag.Key == "launched-for-asg" { + return tag.Value + } + } + return nil +} + +func (i *instance) getReplacementTargetInstanceID() *string { + for _, tag := range i.Tags { + if *tag.Key == "launched-for-replacing-instance" { + return tag.Value + } + } + return nil +} + +func (i *instance) 
isUnattachedSpotInstanceLaunchedForAnEnabledASG() bool { + asgName := i.getReplacementTargetASGName() + if asgName == nil { + logger.Printf("%s is missing the tag value for 'launched-for-asg'", *i.InstanceId) + return false + } + asg := i.region.findEnabledASGByName(*asgName) + + if asg != nil && + asg.isEnabledForEventBasedInstanceReplacement() && + !asg.hasMemberInstance(i) && + i.isSpot() { + logger.Println("Found unattached spot instance", *i.InstanceId) + return true + } + return false +} + +func (i *instance) swapWithGroupMember() (*instance, error) { + odInstanceID := i.getReplacementTargetInstanceID() + if odInstanceID == nil { + logger.Println("Couldn't find target on-demand instance of", *i.InstanceId) + return nil, fmt.Errorf("couldn't find target instance for %s", *i.InstanceId) + } + + if err := i.region.scanInstance(odInstanceID); err != nil { + logger.Printf("Couldn't describe the target on-demand instance %s", *odInstanceID) + return nil, fmt.Errorf("target instance %s couldn't be described", *odInstanceID) + } + + odInstance := i.region.instances.get(*odInstanceID) + if odInstance == nil { + logger.Printf("Target on-demand instance %s couldn't be found", *odInstanceID) + return nil, fmt.Errorf("target instance %s is missing", *odInstanceID) + } + + asgName := i.getReplacementTargetASGName() + + if asgName == nil { + logger.Printf("Spot instance %s is missing ASG name", *i.InstanceId) + return nil, fmt.Errorf("spot instance %s is missing asg tag", *i.InstanceId) + } + + if !odInstance.shouldBeReplacedWithSpot() { + logger.Printf("Target on-demand instance %s shouldn't be replaced", *odInstanceID) + i.terminate() + return nil, fmt.Errorf("target instance %s should not be replaced with spot", + *odInstanceID) + } + + asg := i.region.findEnabledASGByName(*asgName) + if asgName == nil { + logger.Printf("Missing ASG data for region %s", i.region.name) + return nil, fmt.Errorf("region %s is missing asg data", i.region.name) + } + + max := *asg.MaxSize + + 
if *asg.DesiredCapacity == max { + if err := asg.setAutoScalingMaxSize(max + 1); err != nil { + logger.Printf("%s Couldn't temporarily expand ASG %s", + i.region.name, *asg.AutoScalingGroupName) + return nil, fmt.Errorf("couldn't increase ASG %s", *asg.AutoScalingGroupName) + } + defer asg.setAutoScalingMaxSize(max) + } + + if err := asg.attachSpotInstance(*i.InstanceId, true); err != nil { + logger.Printf("Spot instance %s couldn't be attached to the group %s, terminating it...", + *i.InstanceId, asg.name) + i.terminate() + return nil, fmt.Errorf("couldn't attach spot instance %s ", *i.InstanceId) + } + + if err := asg.terminateInstanceInAutoScalingGroup(odInstanceID, true, true); err != nil { + logger.Printf("On-demand instance %s couldn't be terminated, re-trying...", + *odInstanceID) + return nil, fmt.Errorf("couldn't terminate on-demand instance %s", + *odInstanceID) + } + + return odInstance, nil +} + // returns an instance ID as *string, set to nil if we need to wait for the next // run in case there are no spot instances func (i *instance) isReadyToAttach(asg *autoScalingGroup) bool { @@ -706,8 +893,6 @@ func (i *instance) isReadyToAttach(asg *autoScalingGroup) bool { } return false } - -// Why the heck isn't this in the Go standard library? 
func min(x, y int) int { if x < y { return x diff --git a/core/instance_launch_events.go b/core/instance_launch_events.go new file mode 100644 index 00000000..80dbde4f --- /dev/null +++ b/core/instance_launch_events.go @@ -0,0 +1,24 @@ +package autospotting + +import ( + "encoding/json" + "errors" + + "github.com/aws/aws-lambda-go/events" +) + +// returns the InstanceID, State or an error +func parseEventData(event events.CloudWatchEvent) (*string, *string, error) { + var detailData instanceData + if err := json.Unmarshal(event.Detail, &detailData); err != nil { + logger.Println(err.Error()) + return nil, nil, err + } + + if detailData.InstanceID != nil && detailData.State != nil { + return detailData.InstanceID, detailData.State, nil + } + + logger.Println("This code shouldn't be reachable") + return nil, nil, errors.New("this code shoudn't be reached") +} diff --git a/core/instance_test.go b/core/instance_test.go index cb6d10f2..7289a3f6 100644 --- a/core/instance_test.go +++ b/core/instance_test.go @@ -6,7 +6,6 @@ import ( "reflect" "sort" "testing" - "time" "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/service/autoscaling" @@ -1244,11 +1243,12 @@ func TestGenerateTagList(t *testing.T) { ASGName string ASGLCName string instanceTags []*ec2.Tag + instanceID string expectedTagSpecification []*ec2.TagSpecification }{ {name: "no tags on original instance", - ASGLCName: "testLC0", - ASGName: "myASG", + ASGLCName: "testLC0", + ASGName: "myASG", instanceID: "foo", instanceTags: []*ec2.Tag{}, expectedTagSpecification: []*ec2.TagSpecification{ { @@ -1265,14 +1265,18 @@ func TestGenerateTagList(t *testing.T) { { Key: aws.String("launched-for-asg"), Value: aws.String("myASG"), + }, { + Key: aws.String("launched-for-replacing-instance"), + Value: aws.String("foo"), }, }, }, }, }, {name: "Multiple tags on original instance", - ASGLCName: "testLC0", - ASGName: "myASG", + ASGLCName: "testLC0", + ASGName: "myASG", + instanceID: "bar", instanceTags: []*ec2.Tag{ { 
Key: aws.String("foo"), @@ -1295,6 +1299,10 @@ func TestGenerateTagList(t *testing.T) { Key: aws.String("launched-by-autospotting"), Value: aws.String("true"), }, + { + Key: aws.String("launched-for-replacing-instance"), + Value: aws.String("bar"), + }, { Key: aws.String("launched-for-asg"), Value: aws.String("myASG"), @@ -1318,7 +1326,8 @@ func TestGenerateTagList(t *testing.T) { i := instance{ Instance: &ec2.Instance{ - Tags: tt.instanceTags, + Tags: tt.instanceTags, + InstanceId: aws.String(tt.instanceID), }, asg: &autoScalingGroup{ name: tt.ASGName, @@ -1557,7 +1566,7 @@ func Test_instance_createRunInstancesInput(t *testing.T) { IamInstanceProfile: &ec2.IamInstanceProfile{ Arn: aws.String("profile-arn"), - }, + }, InstanceId: aws.String("i-foo"), InstanceType: aws.String("t2.medium"), @@ -1633,6 +1642,10 @@ func Test_instance_createRunInstancesInput(t *testing.T) { Key: aws.String("launched-for-asg"), Value: aws.String("mygroup"), }, + { + Key: aws.String("launched-for-replacing-instance"), + Value: aws.String("i-foo"), + }, }, }, }, @@ -1676,7 +1689,7 @@ func Test_instance_createRunInstancesInput(t *testing.T) { IamInstanceProfile: &ec2.IamInstanceProfile{ Arn: aws.String("profile-arn"), }, - + InstanceId: aws.String("i-foo"), InstanceType: aws.String("t2.medium"), KeyName: aws.String("mykey"), @@ -1752,6 +1765,10 @@ func Test_instance_createRunInstancesInput(t *testing.T) { Key: aws.String("launched-for-asg"), Value: aws.String("mygroup"), }, + { + Key: aws.String("launched-for-replacing-instance"), + Value: aws.String("i-foo"), + }, }, }, }, @@ -1783,7 +1800,7 @@ func Test_instance_createRunInstancesInput(t *testing.T) { IamInstanceProfile: &ec2.IamInstanceProfile{ Arn: aws.String("profile-arn"), }, - + InstanceId: aws.String("i-foo"), InstanceType: aws.String("t2.medium"), Placement: &ec2.Placement{ @@ -1855,6 +1872,9 @@ func Test_instance_createRunInstancesInput(t *testing.T) { { Key: aws.String("launched-for-asg"), Value: aws.String("mygroup"), + }, { + 
Key: aws.String("launched-for-replacing-instance"), + Value: aws.String("i-foo"), }, }, }, @@ -1896,6 +1916,7 @@ func Test_instance_createRunInstancesInput(t *testing.T) { Arn: aws.String("profile-arn"), }, + InstanceId: aws.String("i-foo"), InstanceType: aws.String("t2.medium"), KeyName: aws.String("older-key"), @@ -1983,6 +2004,10 @@ func Test_instance_createRunInstancesInput(t *testing.T) { Key: aws.String("launched-for-asg"), Value: aws.String("mygroup"), }, + { + Key: aws.String("launched-for-replacing-instance"), + Value: aws.String("i-foo"), + }, }, }, }, @@ -2004,86 +2029,7 @@ func Test_instance_createRunInstancesInput(t *testing.T) { }) if !reflect.DeepEqual(got, tt.want) { - t.Errorf("instance.createRunInstancesInput() = %v, want %v", got, tt.want) - } - }) - } -} - -func Test_instance_isReadyToAttach(t *testing.T) { - //now := time.Now() - tenMinutesAgo := time.Now().Add(-10 * time.Minute) - - tests := []struct { - name string - instance instance - asg *autoScalingGroup - want bool - }{ - - { - name: "pending instance", - instance: instance{ - Instance: &ec2.Instance{ - InstanceId: aws.String("i-123"), - LaunchTime: &tenMinutesAgo, - State: &ec2.InstanceState{ - Name: aws.String(ec2.InstanceStateNamePending), - }, - }, - }, - asg: &autoScalingGroup{ - name: "my-asg", - Group: &autoscaling.Group{ - HealthCheckGracePeriod: aws.Int64(3600), - }, - }, - want: false, - }, - { - name: "not-ready running instance", - instance: instance{ - Instance: &ec2.Instance{ - InstanceId: aws.String("i-123"), - LaunchTime: &tenMinutesAgo, - State: &ec2.InstanceState{ - Name: aws.String(ec2.InstanceStateNameRunning), - }, - }, - }, - asg: &autoScalingGroup{ - name: "my-asg", - Group: &autoscaling.Group{ - HealthCheckGracePeriod: aws.Int64(3600), - }, - }, - want: false, - }, - { - name: "ready running instance", - instance: instance{ - Instance: &ec2.Instance{ - InstanceId: aws.String("i-123"), - LaunchTime: &tenMinutesAgo, - State: &ec2.InstanceState{ - Name: 
aws.String(ec2.InstanceStateNameRunning), - }, - }, - }, - asg: &autoScalingGroup{ - name: "my-asg", - Group: &autoscaling.Group{ - HealthCheckGracePeriod: aws.Int64(300), - }, - }, - want: true, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - - if got := tt.instance.isReadyToAttach(tt.asg); got != tt.want { - t.Errorf("instance.isReadyToAttach() = %v, want %v", got, tt.want) + t.Errorf("Instance.createRunInstancesInput() = %v, want %v", got, tt.want) } }) } @@ -2093,13 +2039,13 @@ func Test_instance_isSameArch(t *testing.T) { tests := []struct { name string - instance instance + Instance instance spotCandidate instanceTypeInformation want bool }{ { name: "Same architecture: both Intel", - instance: instance{ + Instance: instance{ typeInfo: instanceTypeInformation{ PhysicalProcessor: "Intel", }, @@ -2112,7 +2058,7 @@ func Test_instance_isSameArch(t *testing.T) { { name: "Same architecture: both AMD", - instance: instance{ + Instance: instance{ typeInfo: instanceTypeInformation{ PhysicalProcessor: "AMD", }, @@ -2125,7 +2071,7 @@ func Test_instance_isSameArch(t *testing.T) { { name: "Same architecture: Intel and AMD", - instance: instance{ + Instance: instance{ typeInfo: instanceTypeInformation{ PhysicalProcessor: "Intel", }, @@ -2138,7 +2084,7 @@ func Test_instance_isSameArch(t *testing.T) { { name: "Same architecture: AMD and Intel", - instance: instance{ + Instance: instance{ typeInfo: instanceTypeInformation{ PhysicalProcessor: "AMD", }, @@ -2151,7 +2097,7 @@ func Test_instance_isSameArch(t *testing.T) { { name: "Same architecture: Intel and Variable", - instance: instance{ + Instance: instance{ typeInfo: instanceTypeInformation{ PhysicalProcessor: "Intel", }, @@ -2164,7 +2110,7 @@ func Test_instance_isSameArch(t *testing.T) { { name: "Same architecture: Variable and Intel", - instance: instance{ + Instance: instance{ typeInfo: instanceTypeInformation{ PhysicalProcessor: "Variable", }, @@ -2177,7 +2123,7 @@ func 
Test_instance_isSameArch(t *testing.T) { { name: "Same architecture, both ARM-based", - instance: instance{ + Instance: instance{ typeInfo: instanceTypeInformation{ PhysicalProcessor: "AWS", }, @@ -2190,7 +2136,7 @@ func Test_instance_isSameArch(t *testing.T) { { name: "Different architecture, Intel and ARM", - instance: instance{ + Instance: instance{ typeInfo: instanceTypeInformation{ PhysicalProcessor: "Intel", }, @@ -2203,7 +2149,7 @@ func Test_instance_isSameArch(t *testing.T) { { name: "Different architecture, AMD and ARM", - instance: instance{ + Instance: instance{ typeInfo: instanceTypeInformation{ PhysicalProcessor: "Intel", }, @@ -2216,7 +2162,7 @@ func Test_instance_isSameArch(t *testing.T) { { name: "Different architecture, ARM and Intel", - instance: instance{ + Instance: instance{ typeInfo: instanceTypeInformation{ PhysicalProcessor: "AWS", }, @@ -2229,7 +2175,7 @@ func Test_instance_isSameArch(t *testing.T) { { name: "Different architecture, ARM and AMD", - instance: instance{ + Instance: instance{ typeInfo: instanceTypeInformation{ PhysicalProcessor: "AWS", }, @@ -2243,9 +2189,147 @@ func Test_instance_isSameArch(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if got := tt.instance.isSameArch(tt.spotCandidate); got != tt.want { - t.Errorf("instance.isSameArch() = %v, want %v", got, tt.want) + if got := tt.Instance.isSameArch(tt.spotCandidate); got != tt.want { + t.Errorf("Instance.isSameArch() = %v, want %v", got, tt.want) + } + }) + } +} + +func Test_instance_isUnattachedSpotInstanceLaunchedForAnEnabledASG(t *testing.T) { + + tests := []struct { + name string + i *instance + wantASG *autoScalingGroup + wantUnattached bool + }{ + // { + // name: "on-demand instance", + // i: &instance{ + // Instance: &ec2.Instance{ + // InstanceLifecycle: nil, + // }, + // }, + // }, + + // { + // name: "no instances launched for this ASG", + // asg: autoScalingGroup{ + // name: "mygroup", + // region: ®ion{ + // instances: 
makeInstancesWithCatalog( + // instanceMap{ + // "id-1": { + // Instance: &ec2.Instance{ + // InstanceId: aws.String("id-1"), + // Tags: []*ec2.Tag{}, + // }, + // }, + // }, + // ), + // }, + // }, + // want: nil, + // }, + // { + // name: "instance launched for another ASG", + // asg: autoScalingGroup{ + // name: "mygroup", + // region: ®ion{ + // instances: makeInstancesWithCatalog( + // instanceMap{ + // "id-1": { + // Instance: &ec2.Instance{ + // InstanceId: aws.String("id-1"), + // Tags: []*ec2.Tag{}, + // }, + // }, + // "id-2": { + // Instance: &ec2.Instance{ + // InstanceId: aws.String("id-2"), + // Tags: []*ec2.Tag{ + // { + // Key: aws.String("launched-for-asg"), + // Value: aws.String("another-asg"), + // }, + // { + // Key: aws.String("another-key"), + // Value: aws.String("another-value"), + // }, + // }, + // }, + // }, + // }, + // ), + // }, + // }, + // want: nil, + // }, { + // name: "instance launched for current ASG", + // asg: autoScalingGroup{ + // name: "mygroup", + // Group: &autoscaling.Group{ + // Instances: []*autoscaling.Instance{ + // {InstanceId: aws.String("foo")}, + // {InstanceId: aws.String("bar")}, + // {InstanceId: aws.String("baz")}, + // }, + // }, + + // region: ®ion{ + // instances: makeInstancesWithCatalog( + // instanceMap{ + // "id-1": { + // Instance: &ec2.Instance{ + // InstanceId: aws.String("id-1"), + // Tags: []*ec2.Tag{}, + // }, + // }, + // "id-2": { + // Instance: &ec2.Instance{ + // InstanceId: aws.String("id-2"), + // Tags: []*ec2.Tag{ + // { + // Key: aws.String("launched-for-asg"), + // Value: aws.String("mygroup"), + // }, + // { + // Key: aws.String("another-key"), + // Value: aws.String("another-value"), + // }, + // }, + // }, + // }, + // }, + // ), + // }, + // }, + // want: &instance{ + // Instance: &ec2.Instance{ + // InstanceId: aws.String("id-2"), + // Tags: []*ec2.Tag{ + // { + // Key: aws.String("launched-for-asg"), + // Value: aws.String("mygroup"), + // }, + // { + // Key: 
aws.String("another-key"), + // Value: aws.String("another-value"), + // }, + // }, + // }, + // }, + // }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + gotUnattached := tt.i.isUnattachedSpotInstanceLaunchedForAnEnabledASG() + if !reflect.DeepEqual(gotUnattached, tt.wantUnattached) { + t.Errorf("instance.isUnattachedSpotInstanceLaunchedForAnEnabledASG() = %v, want %v", gotUnattached, tt.wantUnattached) } }) + } } diff --git a/core/main.go b/core/main.go index 38d5a4ff..63b8e1f5 100644 --- a/core/main.go +++ b/core/main.go @@ -1,50 +1,72 @@ package autospotting import ( + "encoding/json" + "errors" + "fmt" "io/ioutil" "log" "os" "strings" "sync" + "github.com/aws/aws-lambda-go/events" "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/session" "github.com/aws/aws-sdk-go/service/ec2" "github.com/aws/aws-sdk-go/service/ec2/ec2iface" + ec2instancesinfo "github.com/cristim/ec2-instances-info" ) var logger, debug *log.Logger -var hourlySavings float64 -var savingsMutex = &sync.RWMutex{} - -// Run starts processing all AWS regions looking for AutoScaling groups -// enabled and taking action by replacing more pricy on-demand instances with -// compatible and cheaper spot instances. 
-func Run(cfg *Config) { +// AutoSpotting hosts global configuration and has as methods all the public +// entrypoints of this library +type AutoSpotting struct { + config *Config + hourlySavings float64 + savingsMutex *sync.RWMutex + mainEC2Conn ec2iface.EC2API +} - setupLogging(cfg) +var as *AutoSpotting - debug.Println(*cfg) +// Init initializes some data structures reusable across multiple event runs +func (a *AutoSpotting) Init(cfg *Config) { + data, err := ec2instancesinfo.Data() + if err != nil { + log.Fatal(err.Error()) + } + cfg.InstanceData = data + a.config = cfg + a.savingsMutex = &sync.RWMutex{} + a.config.setupLogging() // use this only to list all the other regions - ec2Conn := connectEC2(cfg.MainRegion) + a.mainEC2Conn = connectEC2(a.config.MainRegion) + as = a +} - addDefaultFilteringMode(cfg) - addDefaultFilter(cfg) +// ProcessCronEvent starts processing all AWS regions looking for AutoScaling groups +// enabled and taking action by replacing more pricy on-demand instances with +// compatible and cheaper spot instances. 
+func (a *AutoSpotting) ProcessCronEvent() { - allRegions, err := getRegions(ec2Conn) + a.config.addDefaultFilteringMode() + a.config.addDefaultFilter() + + allRegions, err := a.getRegions() if err != nil { logger.Println(err.Error()) return } - processRegions(allRegions, cfg) + a.processRegions(allRegions) } -func addDefaultFilteringMode(cfg *Config) { +func (cfg *Config) addDefaultFilteringMode() { if cfg.TagFilteringMode != "opt-out" { debug.Printf("Configured filtering mode: '%s', considering it as 'opt-in'(default)\n", cfg.TagFilteringMode) @@ -54,7 +76,7 @@ func addDefaultFilteringMode(cfg *Config) { } } -func addDefaultFilter(cfg *Config) { +func (cfg *Config) addDefaultFilter() { if len(strings.TrimSpace(cfg.FilterByTags)) == 0 { switch cfg.TagFilteringMode { case "opt-out": @@ -65,11 +87,12 @@ func addDefaultFilter(cfg *Config) { } } -func disableLogging() { - setupLogging(&Config{LogFile: ioutil.Discard}) +func (cfg *Config) disableLogging() { + cfg.LogFile = ioutil.Discard + cfg.setupLogging() } -func setupLogging(cfg *Config) { +func (cfg *Config) setupLogging() { logger = log.New(cfg.LogFile, "", cfg.LogFlag) if os.Getenv("AUTOSPOTTING_DEBUG") == "true" { @@ -83,14 +106,15 @@ func setupLogging(cfg *Config) { // processAllRegions iterates all regions in parallel, and replaces instances // for each of the ASGs tagged with tags as specified by slice represented by cfg.FilterByTags // by default this is all asg with the tag 'spot-enabled=true'. 
-func processRegions(regions []string, cfg *Config) { +func (a *AutoSpotting) processRegions(regions []string) { var wg sync.WaitGroup for _, r := range regions { wg.Add(1) - r := region{name: r, conf: cfg} + + r := region{name: r, conf: a.config} go func() { @@ -99,7 +123,7 @@ func processRegions(regions []string, cfg *Config) { r.processRegion() } else { debug.Println("Not enabled to run in", r.name) - debug.Println("List of enabled regions:", cfg.Regions) + debug.Println("List of enabled regions:", r.conf.Regions) } wg.Done() @@ -120,12 +144,12 @@ func connectEC2(region string) *ec2.EC2 { } // getRegions generates a list of AWS regions. -func getRegions(ec2conn ec2iface.EC2API) ([]string, error) { +func (a *AutoSpotting) getRegions() ([]string, error) { var output []string logger.Println("Scanning for available AWS regions") - resp, err := ec2conn.DescribeRegions(&ec2.DescribeRegionsInput{}) + resp, err := a.mainEC2Conn.DescribeRegions(&ec2.DescribeRegionsInput{}) if err != nil { logger.Println(err.Error()) @@ -143,3 +167,133 @@ func getRegions(ec2conn ec2iface.EC2API) ([]string, error) { } return output, nil } + +//EventHandler implements the event handling logic and is the main entrypoint of +// AutoSpotting +func (a *AutoSpotting) EventHandler(event *json.RawMessage) { + var snsEvent events.SNSEvent + var cloudwatchEvent events.CloudWatchEvent + + if event == nil { + logger.Println("Missing event data, running as if triggered from a cron event...") + // Event is Autospotting Cron Scheduling + a.ProcessCronEvent() + return + } + + log.Println("Received event: \n", string(*event)) + parseEvent := *event + + // Try to parse event as an Sns Message + if err := json.Unmarshal(parseEvent, &snsEvent); err != nil { + log.Println(err.Error()) + return + } + + // If the event comes from Sns - extract the Cloudwatch event embedded in it + if snsEvent.Records != nil { + snsRecord := snsEvent.Records[0] + parseEvent = []byte(snsRecord.SNS.Message) + } + + // Try to parse 
the event as Cloudwatch Event Rule + if err := json.Unmarshal(parseEvent, &cloudwatchEvent); err != nil { + log.Println(err.Error()) + return + } + + // If the event is for an Instance Spot Interruption + if cloudwatchEvent.DetailType == "EC2 Spot Instance Interruption Warning" { + log.Println("Triggered by", cloudwatchEvent.DetailType) + if instanceID, err := getInstanceIDDueForTermination(cloudwatchEvent); err != nil { + log.Println("Could't get instance ID of terminating spot instance", err.Error()) + return + } else if instanceID != nil { + spotTermination := newSpotTermination(cloudwatchEvent.Region) + spotTermination.executeAction(instanceID, a.config.TerminationNotificationAction) + } + + // If event is Instance state change + } else if cloudwatchEvent.DetailType == "EC2 Instance State-change Notification" { + log.Println("Triggered by", cloudwatchEvent.DetailType) + instanceID, state, err := parseEventData(cloudwatchEvent) + if err != nil { + log.Println("Could't get instance ID of newly launched instance", err.Error()) + return + } else if instanceID != nil { + a.handleNewInstanceLaunch(cloudwatchEvent.Region, *instanceID, *state) + } + + } else { + // Cron Scheduling + a.ProcessCronEvent() + } + +} + +func (a *AutoSpotting) handleNewInstanceLaunch(regionName string, instanceID string, state string) error { + r := region{name: regionName, conf: a.config, services: connections{}} + + if !r.enabled() { + return fmt.Errorf("region %s is not enabled", regionName) + } + + r.services.connect(regionName) + r.setupAsgFilters() + r.scanForEnabledAutoScalingGroups() + + logger.Println("Scanning full instance information in", r.name) + r.determineInstanceTypeInformation(r.conf) + + if err := r.scanInstance(aws.String(instanceID)); err != nil { + logger.Printf("%s Couldn't scan instance %s: %s", regionName, + instanceID, err.Error()) + return err + } + + i := r.instances.get(instanceID) + if i == nil { + logger.Printf("%s Instance %s is missing, skipping...", + 
regionName, instanceID) + return errors.New("instance missing") + } + logger.Printf("%s Found instance %s in state %s", + i.region.name, *i.InstanceId, *i.State.Name) + + if state == "pending" && i.belongsToEnabledASG() && i.shouldBeReplacedWithSpot() { + logger.Printf("%s instance %s is in pending state, belongs to an enabled ASG "+ + "and should be replaced with spot, attempting to launch spot replacement", i.region.name, *i.InstanceId) + if _, err := i.launchSpotReplacement(); err != nil { + logger.Printf("%s Couldn't launch spot replacement for %s", + i.region.name, *i.InstanceId) + return err + } + } else { + logger.Printf("%s skipping instance %s: either not in pending state (%s), doesn't "+ + "belong to an enabled ASG or should not be replaced with spot, ", + i.region.name, *i.InstanceId, *i.State.Name) + } + + if state == "running" { + logger.Printf("%s Found instance %s in running state, checking if it's a spot instance "+ + "that should be attached to any ASG", i.region.name, *i.InstanceId) + unattached := i.isUnattachedSpotInstanceLaunchedForAnEnabledASG() + if !unattached { + logger.Printf("%s Found instance %s is already attached to an ASG, skipping it", + i.region.name, *i.InstanceId) + return nil + } + + logger.Printf("%s Found instance %s is not yet attached to its ASG, "+ + "attempting to swap it against a running on-demand instance", + i.region.name, *i.InstanceId) + + if _, err := i.swapWithGroupMember(); err != nil { + logger.Printf("%s, couldn't perform spot replacement of %s ", + i.region.name, *i.InstanceId) + return err + } + } + + return nil +} diff --git a/core/main_test.go b/core/main_test.go index 95b38a02..94de16c3 100644 --- a/core/main_test.go +++ b/core/main_test.go @@ -2,6 +2,9 @@ package autospotting import ( "fmt" + "io/ioutil" + "log" + "os" "reflect" "testing" @@ -9,6 +12,18 @@ import ( "github.com/aws/aws-sdk-go/service/ec2" ) +func TestMain(m *testing.M) { + a := &AutoSpotting{} + + a.Init(&Config{ + MainRegion: "us-east-1", + 
}) + + logger = log.New(ioutil.Discard, "", 0) + debug = log.New(ioutil.Discard, "", 0) + + os.Exit(m.Run()) +} func Test_getRegions(t *testing.T) { tests := []struct { @@ -33,6 +48,7 @@ func Test_getRegions(t *testing.T) { { name: "return an error", ec2conn: mockEC2{ + dro: &ec2.DescribeRegionsOutput{ Regions: []*ec2.Region{ {RegionName: aws.String("foo")}, @@ -47,8 +63,9 @@ func Test_getRegions(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { + as.mainEC2Conn = tt.ec2conn - got, err := getRegions(tt.ec2conn) + got, err := as.getRegions() CheckErrors(t, err, tt.wantErr) if !reflect.DeepEqual(got, tt.want) { @@ -94,7 +111,7 @@ func Test_spotEnabledIsAddedByDefault(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - addDefaultFilter(&tt.config) + tt.config.addDefaultFilter() if !reflect.DeepEqual(tt.config.FilterByTags, tt.want) { t.Errorf("addDefaultFilter() = %v, want %v", tt.config.FilterByTags, tt.want) @@ -139,7 +156,7 @@ func Test_addDefaultFilterMode(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - addDefaultFilteringMode(&tt.cfg) + tt.cfg.addDefaultFilteringMode() if !reflect.DeepEqual(tt.cfg.TagFilteringMode, tt.want) { t.Errorf("addDefaultFilteringMode() = %v, want %v", tt.cfg.TagFilteringMode, tt.want) diff --git a/core/mock_test.go b/core/mock_test.go index 2297589d..f5e7af0b 100644 --- a/core/mock_test.go +++ b/core/mock_test.go @@ -55,6 +55,9 @@ type mockEC2 struct { // DescribeLaunchTemplateVersionsOutput dltvo *ec2.DescribeLaunchTemplateVersionsOutput dltverr error + + // WaitUntilInstanceRunning error + // wuirerr error } func (m mockEC2) DescribeSpotPriceHistory(in *ec2.DescribeSpotPriceHistoryInput) (*ec2.DescribeSpotPriceHistoryOutput, error) { diff --git a/core/region.go b/core/region.go index 16ce4437..dca6a072 100644 --- a/core/region.go +++ b/core/region.go @@ -11,7 +11,6 @@ import ( "github.com/aws/aws-sdk-go/service/autoscaling" 
"github.com/aws/aws-sdk-go/service/cloudformation" "github.com/aws/aws-sdk-go/service/ec2" - "github.com/davecgh/go-spew/spew" ) // Tag represents an Asg Tag: Key, Value @@ -89,8 +88,6 @@ func (r *region) processRegion() { logger.Println("Scanning full instance information in", r.name) r.determineInstanceTypeInformation(r.conf) - debug.Println(spew.Sdump(r.instanceTypeInformation)) - logger.Println("Scanning instances in", r.name) err := r.scanInstances() if err != nil { @@ -139,7 +136,6 @@ func splitTagAndValue(value string) *Tag { func (r *region) processDescribeInstancesPage(page *ec2.DescribeInstancesOutput, lastPage bool) bool { logger.Println("Processing page of DescribeInstancesPages for", r.name) - debug.Println(page) if len(page.Reservations) > 0 && page.Reservations[0].Instances != nil { @@ -182,6 +178,30 @@ func (r *region) scanInstances() error { return nil } +func (r *region) scanInstance(instanceID *string) error { + svc := r.services.ec2 + input := &ec2.DescribeInstancesInput{ + Filters: []*ec2.Filter{ + { + Name: aws.String("instance-id"), + Values: []*string{instanceID}, + }, + }, + } + + r.instances = makeInstances() + + err := svc.DescribeInstancesPages( + input, + r.processDescribeInstancesPage) + + if err != nil { + return err + } + + return nil +} + func (r *region) addInstance(inst *ec2.Instance) { r.instances.add(&instance{ Instance: inst, @@ -200,8 +220,6 @@ func (r *region) determineInstanceTypeInformation(cfg *Config) { var price prices - debug.Println(it) - // populate on-demand information price.onDemand = it.Pricing[r.name].Linux.OnDemand * cfg.OnDemandPriceMultiplier price.spot = make(spotPriceMap) @@ -231,7 +249,6 @@ func (r *region) determineInstanceTypeInformation(cfg *Config) { info.instanceStoreDeviceCount = it.Storage.Devices info.instanceStoreIsSSD = it.Storage.SSD } - debug.Println(info) r.instanceTypeInformation[it.InstanceType] = info } } @@ -243,7 +260,6 @@ func (r *region) determineInstanceTypeInformation(cfg *Config) { 
logger.Println(err.Error()) } - debug.Println(spew.Sdump(r.instanceTypeInformation)) } func (r *region) requestSpotPrices() error { @@ -364,7 +380,7 @@ func (r *region) findMatchingASGsInPageOfResults(groups []*autoscaling.Group, asgName := *group.AutoScalingGroupName if group.MixedInstancesPolicy != nil { - logger.Printf("Skipping group %s because it's using a mixed instances policy", + debug.Printf("Skipping group %s because it's using a mixed instances policy", asgName) continue } @@ -374,7 +390,7 @@ func (r *region) findMatchingASGsInPageOfResults(groups []*autoscaling.Group, // expression. The goal is to add the matching ASGs when running in opt-in // mode and the other way round. if optInFilterMode != groupMatchesExpectedTags { - logger.Printf("Skipping group %s because its tags, the currently "+ + debug.Printf("Skipping group %s because its tags, the currently "+ "configured filtering mode (%s) and tag filters do not align\n", asgName, r.conf.TagFilteringMode) continue @@ -410,7 +426,7 @@ func (r *region) scanForEnabledAutoScalingGroups() { &autoscaling.DescribeAutoScalingGroupsInput{}, func(page *autoscaling.DescribeAutoScalingGroupsOutput, lastPage bool) bool { pageNum++ - logger.Println("Processing page", pageNum, "of DescribeAutoScalingGroupsPages for", r.name) + debug.Println("Processing page", pageNum, "of DescribeAutoScalingGroupsPages for", r.name) matchingAsgs := r.findMatchingASGsInPageOfResults(page.AutoScalingGroups, r.tagsToFilterASGsBy) r.enabledASGs = append(r.enabledASGs, matchingAsgs...) 
return true @@ -437,9 +453,18 @@ func (r *region) processEnabledAutoScalingGroups() { r.wg.Add(1) go func(a autoScalingGroup) { - a.process() + a.processCronEvent() r.wg.Done() }(asg) } r.wg.Wait() } + +func (r *region) findEnabledASGByName(name string) *autoScalingGroup { + for _, asg := range r.enabledASGs { + if asg.name == name { + return &asg + } + } + return nil +} diff --git a/core/spot_price_test.go b/core/spot_price_test.go index 04490844..e970bc6f 100644 --- a/core/spot_price_test.go +++ b/core/spot_price_test.go @@ -2,7 +2,6 @@ package autospotting import ( "errors" - "os" "testing" "time" @@ -10,11 +9,6 @@ import ( "github.com/aws/aws-sdk-go/service/ec2" ) -func TestMain(m *testing.M) { - disableLogging() - os.Exit(m.Run()) -} - func Test_fetch(t *testing.T) { tests := []struct { name string diff --git a/core/spot_termination.go b/core/spot_termination.go index 6ff41db5..4a820464 100644 --- a/core/spot_termination.go +++ b/core/spot_termination.go @@ -28,12 +28,12 @@ type SpotTermination struct { //InstanceData represents JSON structure of the Detail property of CloudWatch event when a spot instance is terminated //Reference = https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-interruptions.html#spot-instance-termination-notices type instanceData struct { - InstanceID string `json:"instance-id"` - InstanceAction string `json:"instance-action"` + InstanceID *string `json:"instance-id"` + InstanceAction *string `json:"instance-action"` + State *string `json:"state"` } -//NewSpotTermination is a constructor for creating an instance of spotTermination to call DetachInstance -func NewSpotTermination(region string) SpotTermination { +func newSpotTermination(region string) SpotTermination { logger.Println("Connection to region ", region) @@ -47,9 +47,9 @@ func NewSpotTermination(region string) SpotTermination { } } -//GetInstanceIDDueForTermination checks if the given CloudWatch event data is triggered from a spot termination 
+//getInstanceIDDueForTermination checks if the given CloudWatch event data is triggered from a spot termination //If it is a termination event for a spot instance, it returns the instance id present in the event data -func GetInstanceIDDueForTermination(event events.CloudWatchEvent) (*string, error) { +func getInstanceIDDueForTermination(event events.CloudWatchEvent) (*string, error) { var detailData instanceData if err := json.Unmarshal(event.Detail, &detailData); err != nil { @@ -57,8 +57,8 @@ func GetInstanceIDDueForTermination(event events.CloudWatchEvent) (*string, erro return nil, err } - if detailData.InstanceAction != "" { - return &detailData.InstanceID, nil + if detailData.InstanceAction != nil && *detailData.InstanceAction != "" { + return detailData.InstanceID, nil } return nil, nil @@ -129,7 +129,7 @@ func (s *SpotTermination) getAsgName(instanceID *string) (string, error) { // ExecuteAction execute the proper termination action (terminate|detach) based on the value of // terminationNotificationAction and the presence of a LifecycleHook on ASG. -func (s *SpotTermination) ExecuteAction(instanceID *string, terminationNotificationAction string) error { +func (s *SpotTermination) executeAction(instanceID *string, terminationNotificationAction string) error { if s.asSvc == nil { return errors.New("AutoScaling service not defined. 
Please use NewSpotTermination()") } diff --git a/core/spot_termination_test.go b/core/spot_termination_test.go index 05c6edac..9218998b 100644 --- a/core/spot_termination_test.go +++ b/core/spot_termination_test.go @@ -6,6 +6,7 @@ import ( "testing" "github.com/aws/aws-lambda-go/events" + "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/service/autoscaling" "github.com/aws/aws-sdk-go/service/ec2" ) @@ -13,7 +14,7 @@ import ( func TestNewSpotTermination(t *testing.T) { region := "foo" - spotTermination := NewSpotTermination(region) + spotTermination := newSpotTermination(region) if spotTermination.asSvc == nil || spotTermination.ec2Svc == nil { t.Errorf("Unable to connect to region %s", region) @@ -24,8 +25,8 @@ func TestGetInstanceIDDueForTermination(t *testing.T) { expectedInstanceID := "i-123456" dummyInstanceData := instanceData{ - InstanceID: expectedInstanceID, - InstanceAction: "terminate", + InstanceID: aws.String(expectedInstanceID), + InstanceAction: aws.String("terminate"), } tests := []struct { @@ -63,7 +64,7 @@ func TestGetInstanceIDDueForTermination(t *testing.T) { for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { - instanceID, _ := GetInstanceIDDueForTermination(tc.cloudWatchEvent) + instanceID, _ := getInstanceIDDueForTermination(tc.cloudWatchEvent) if tc.expected == nil && instanceID != nil { t.Errorf("Expected nil instanceID, actual: %s", *instanceID) @@ -336,7 +337,7 @@ func TestExecuteAction(t *testing.T) { for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { - err := tc.spotTermination.ExecuteAction(&instanceID, tc.terminationNotificationAction) + err := tc.spotTermination.executeAction(&instanceID, tc.terminationNotificationAction) if err != nil && err.Error() != tc.expectedError.Error() { t.Errorf("Error in ExecuteAction: expected %s actual %s", tc.expectedError.Error(), err.Error()) From cceb68647aa0604679e1a8331daad63c1ea0ed9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cristian=20M=C4=83gheru=C8=99an-Stanciu?= 
Date: Wed, 31 Jul 2019 12:01:41 +0200 Subject: [PATCH 02/17] Further improvements Only handle the "running" events of new instances because the instance information wasn't filled while the instance was in pending state. Also run spot attachment in parallel with detachment of the on-demand instance in order to avoid race conditions with the group's process that enforces the desired capacity like when doing multiple instances in parallel for new groups or multiple instance launches. --- .../AutoSpotting/regional_template.yaml | 11 ++-- core/autoscaling.go | 4 +- core/instance.go | 45 +++++++--------- core/main.go | 52 +++++++++++-------- 4 files changed, 55 insertions(+), 57 deletions(-) diff --git a/cloudformation/stacks/AutoSpotting/regional_template.yaml b/cloudformation/stacks/AutoSpotting/regional_template.yaml index d1ca4539..3e52246e 100644 --- a/cloudformation/stacks/AutoSpotting/regional_template.yaml +++ b/cloudformation/stacks/AutoSpotting/regional_template.yaml @@ -3,7 +3,7 @@ Description: > "Implements support for triggering the main AutoSpotting Lambda function on regional events such as instance launches or imminent spot terminations that - can only be detected from wthin a given region" + can only be detected from within a given region" Parameters: AutoSpottingLambdaARN: Description: "The ARN of the main AutoSpotting Lambda function" @@ -96,7 +96,7 @@ Fn::GetAtt: - "EventHandler" - "Arn" - InstancePendingLambdaPermission: + InstanceRunningLambdaPermission: Type: "AWS::Lambda::Permission" Properties: Action: "lambda:InvokeFunction" @@ -105,9 +105,9 @@ Principal: "events.amazonaws.com" SourceArn: Fn::GetAtt: - - "InstancePendingEventRule" + - "InstanceRunningEventRule" - "Arn" - InstancePendingEventRule: + InstanceRunningEventRule: Type: "AWS::Events::Rule" Properties: Description: > @@ -119,12 +119,11 @@ - "aws.ec2" detail: state: - - "pending" - "running" State: "ENABLED" Targets: - - Id: "InstancePendingEventGenerator" + Id: 
"InstanceRunningEventGenerator" Arn: Fn::GetAtt: - "EventHandler" diff --git a/core/autoscaling.go b/core/autoscaling.go index 57b2ebec..5008eb7f 100644 --- a/core/autoscaling.go +++ b/core/autoscaling.go @@ -228,8 +228,8 @@ func (a *autoScalingGroup) enableForInstanceLaunchEventHandling() { func (a *autoScalingGroup) isEnabledForEventBasedInstanceReplacement() bool { if time.Now().Sub(*a.CreatedTime) < time.Hour { logger.Println("ASG %s is newer than an hour, enabling it for event-based "+ - "instance replacement", a.name) - a.enableForInstanceLaunchEventHandling() + "instance replacement", a.name) + a.enableForInstanceLaunchEventHandling() return true } diff --git a/core/instance.go b/core/instance.go index 26f668fa..0c621ce8 100644 --- a/core/instance.go +++ b/core/instance.go @@ -507,25 +507,6 @@ func (i *instance) launchSpotReplacement() (*string, error) { return nil, err } -// // replaceWithSpotAndTerminate replaces an on-demand instance with a compatible -// // spot instance then immediately terminates it. This is supposed to be called -// // against a recently launched on-demand instance, while it's still in the -// // pending state. 
-// func (i *instance) replaceWithSpotAndTerminate() error { -// spotinstanceID, err := i.launchSpotReplacement() -// if err != nil { -// logger.Println("Couldn't launch spot replacement") -// return err -// } -// i.asg.attachSpotInstance(*spotinstanceID, true) - -// err = i.asg.detachAndTerminateOnDemandInstance(i.InstanceId, true) -// if err != nil { -// logger.Println("Couldn't detach and terminate instance", i.InstanceId) -// } -// return err -// } - func (i *instance) getPricetoBid( baseOnDemandPrice float64, currentSpotPrice float64) float64 { @@ -843,12 +824,24 @@ func (i *instance) swapWithGroupMember() (*instance, error) { defer asg.setAutoScalingMaxSize(max) } - if err := asg.attachSpotInstance(*i.InstanceId, true); err != nil { - logger.Printf("Spot instance %s couldn't be attached to the group %s, terminating it...", - *i.InstanceId, asg.name) - i.terminate() - return nil, fmt.Errorf("couldn't attach spot instance %s ", *i.InstanceId) - } + // We use this to capture the error message from the closure below that is + // executed asynchronously. + c := make(chan error) + + // Attach the new spot instance in parallel with the termination of the + // running on-demand instance in order to avoid increasing the capacity enough + // for the ASG to notice it and start terminating instances. The ASGs were + // sometimes seen to just terminates newly launched spot instances so the + // group was unintentionally kept with on-demand capacity. 
+ go func() { + if err := asg.attachSpotInstance(*i.InstanceId, true); err != nil { + logger.Printf("Spot instance %s couldn't be attached to the group %s, terminating it...", + *i.InstanceId, asg.name) + i.terminate() + c <- fmt.Errorf("couldn't attach spot instance %s ", *i.InstanceId) + } + c <- nil + }() if err := asg.terminateInstanceInAutoScalingGroup(odInstanceID, true, true); err != nil { logger.Printf("On-demand instance %s couldn't be terminated, re-trying...", @@ -857,7 +850,7 @@ func (i *instance) swapWithGroupMember() (*instance, error) { *odInstanceID) } - return odInstance, nil + return odInstance, <-c } // returns an instance ID as *string, set to nil if we need to wait for the next diff --git a/core/main.go b/core/main.go index 63b8e1f5..ccced096 100644 --- a/core/main.go +++ b/core/main.go @@ -260,39 +260,45 @@ func (a *AutoSpotting) handleNewInstanceLaunch(regionName string, instanceID str logger.Printf("%s Found instance %s in state %s", i.region.name, *i.InstanceId, *i.State.Name) - if state == "pending" && i.belongsToEnabledASG() && i.shouldBeReplacedWithSpot() { - logger.Printf("%s instance %s is in pending state, belongs to an enabled ASG "+ - "and should be replaced with spot, attempting to launch spot replacement", i.region.name, *i.InstanceId) + if state != "running" { + logger.Printf("%s Instance %s is not in the running state", + i.region.name, *i.InstanceId) + return errors.New("instance not in running state") + } + + if i.belongsToEnabledASG() && i.shouldBeReplacedWithSpot() { + logger.Printf("%s instance %s belongs to an enabled ASG and should be "+ + "replaced with spot, attempting to launch spot replacement", + i.region.name, *i.InstanceId) if _, err := i.launchSpotReplacement(); err != nil { logger.Printf("%s Couldn't launch spot replacement for %s", i.region.name, *i.InstanceId) return err } } else { - logger.Printf("%s skipping instance %s: either not in pending state (%s), doesn't "+ - "belong to an enabled ASG or should not be 
replaced with spot, ", - i.region.name, *i.InstanceId, *i.State.Name) + logger.Printf("%s skipping instance %s: either doesn't belong to an "+ + "enabled ASG or should not be replaced with spot, ", + i.region.name, *i.InstanceId) + debug.Printf("%#v", i) } - if state == "running" { - logger.Printf("%s Found instance %s in running state, checking if it's a spot instance "+ - "that should be attached to any ASG", i.region.name, *i.InstanceId) - unattached := i.isUnattachedSpotInstanceLaunchedForAnEnabledASG() - if !unattached { - logger.Printf("%s Found instance %s is already attached to an ASG, skipping it", - i.region.name, *i.InstanceId) - return nil - } - - logger.Printf("%s Found instance %s is not yet attached to its ASG, "+ - "attempting to swap it against a running on-demand instance", + logger.Printf("%s Checking if %s is a spot instance that should be "+ + "attached to any ASG", i.region.name, *i.InstanceId) + unattached := i.isUnattachedSpotInstanceLaunchedForAnEnabledASG() + if !unattached { + logger.Printf("%s Instance %s is already attached to an ASG, skipping it", i.region.name, *i.InstanceId) + return nil + } - if _, err := i.swapWithGroupMember(); err != nil { - logger.Printf("%s, couldn't perform spot replacement of %s ", - i.region.name, *i.InstanceId) - return err - } + logger.Printf("%s Found instance %s is not yet attached to its ASG, "+ + "attempting to swap it against a running on-demand instance", + i.region.name, *i.InstanceId) + + if _, err := i.swapWithGroupMember(); err != nil { + logger.Printf("%s, couldn't perform spot replacement of %s ", + i.region.name, *i.InstanceId) + return err } return nil From a0288f78badae013847a1de5484514965f219f0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cristian=20M=C4=83gheru=C8=99an-Stanciu?= Date: Mon, 12 Aug 2019 14:12:02 +0200 Subject: [PATCH 03/17] More improvements - Suspend termination processes on ASG while swapping instances - Handle launch lifecycle hooks of unattached spot instances by 
intercepting the API call failures from CloudTrail --- .../AutoSpotting/regional_template.yaml | 47 ++++++- .../stacks/AutoSpotting/template.yaml | 2 + core/autoscaling.go | 55 ++++++++ core/cloudtrail.go | 19 +++ core/instance.go | 47 ++----- core/main.go | 129 +++++++++++++++++- 6 files changed, 258 insertions(+), 41 deletions(-) create mode 100644 core/cloudtrail.go diff --git a/cloudformation/stacks/AutoSpotting/regional_template.yaml b/cloudformation/stacks/AutoSpotting/regional_template.yaml index 3e52246e..11fe8c25 100644 --- a/cloudformation/stacks/AutoSpotting/regional_template.yaml +++ b/cloudformation/stacks/AutoSpotting/regional_template.yaml @@ -30,6 +30,7 @@ Ref: "LambdaRegionalExecutionRoleARN" Code: ZipFile: | + from base64 import b64decode from boto3 import client from json import dumps from os import environ @@ -42,6 +43,7 @@ return arn.split(':')[3] def handler(event, context): + print("Running Lambda function", lambda_arn) snsEvent = { 'Records': [ { @@ -62,7 +64,7 @@ LogType='Tail', Payload=dumps(snsEvent), ) - print(response) + print("Invoked funcion log tail:\n", b64decode(response["LogResult"]).decode('utf-8')) except: print_exc() print("Unexpected error:", exc_info()[0]) @@ -118,8 +120,8 @@ source: - "aws.ec2" detail: - state: - - "running" + state: + - "running" State: "ENABLED" Targets: - @@ -128,3 +130,42 @@ Fn::GetAtt: - "EventHandler" - "Arn" + LifecycleHookLambdaPermission: + Type: "AWS::Lambda::Permission" + Properties: + Action: "lambda:InvokeFunction" + FunctionName: + Ref: "EventHandler" + Principal: "events.amazonaws.com" + SourceArn: + Fn::GetAtt: + - "LifecycleHookEventRule" + - "Arn" + LifecycleHookEventRule: + Type: "AWS::Events::Rule" + Properties: + Description: > + "This rule is triggered after we failed to complete a lifecycle hook" + EventPattern: + detail-type: + - "AWS API Call via CloudTrail" + source: + - "aws.cloudtrail" + detail: + eventSource: + - "autoscaling.amazonaws.com" + eventName: + - 
"CompleteLifecycleAction" + errorCode: + - "ValidationException" + requestParameters: + lifecycleActionResult: + - "CONTINUE" + State: "ENABLED" + Targets: + - + Id: "LifecycleHookEventGenerator" + Arn: + Fn::GetAtt: + - "EventHandler" + - "Arn" diff --git a/cloudformation/stacks/AutoSpotting/template.yaml b/cloudformation/stacks/AutoSpotting/template.yaml index 83384688..faf52de1 100644 --- a/cloudformation/stacks/AutoSpotting/template.yaml +++ b/cloudformation/stacks/AutoSpotting/template.yaml @@ -456,6 +456,8 @@ - "autoscaling:DescribeLifecycleHooks" - "autoscaling:DescribeTags" - "autoscaling:DetachInstances" + - "autoscaling:ResumeProcesses" + - "autoscaling:SuspendProcesses" - "autoscaling:TerminateInstanceInAutoScalingGroup" - "autoscaling:UpdateAutoScalingGroup" - "cloudformation:Describe*" diff --git a/core/autoscaling.go b/core/autoscaling.go index 5008eb7f..d83a3513 100644 --- a/core/autoscaling.go +++ b/core/autoscaling.go @@ -615,6 +615,29 @@ func (a *autoScalingGroup) terminateInstanceInAutoScalingGroup( return nil } +func (a *autoScalingGroup) hasLaunchLifecycleHooks() (bool, error) { + + resDLH, err := a.region.services.autoScaling.DescribeLifecycleHooks( + &autoscaling.DescribeLifecycleHooksInput{ + AutoScalingGroupName: a.AutoScalingGroupName, + }) + + if err != nil { + logger.Println(err.Error()) + return false, err + } + + for _, hook := range resDLH.LifecycleHooks { + if "autoscaling:EC2_INSTANCE_LAUNCHING" == *hook.LifecycleTransition { + debug.Printf("Group %s has launch lifecycle hook(s): %s", + *a.AutoScalingGroupName, *hook.LifecycleHookName) + return true, nil + } + } + + return false, nil +} + // Counts the number of already running instances on-demand or spot, in any or a specific AZ. 
func (a *autoScalingGroup) alreadyRunningInstanceCount( spot bool, availabilityZone string) (int64, int64) { @@ -645,6 +668,38 @@ func (a *autoScalingGroup) alreadyRunningInstanceCount( return count, total } +func (a *autoScalingGroup) suspendTerminations() { + logger.Printf("Suspending termination processes on ASG %s", a.name) + + for _, process := range a.SuspendedProcesses { + if *process.ProcessName == "Terminate" { + logger.Printf("ASG %s already has the termination process suspended", a.name) + return + } + } + + _, err := a.region.services.autoScaling.SuspendProcesses( + &autoscaling.ScalingProcessQuery{ + AutoScalingGroupName: a.AutoScalingGroupName, + ScalingProcesses: []*string{aws.String("Terminate")}, + }) + if err != nil { + logger.Printf("couldn't suspend termination processes on ASG %s ", a.name) + } +} + +func (a *autoScalingGroup) resumeTerminations() { + logger.Printf("Resuming termination processes on ASG %s", a.name) + + _, err := a.region.services.autoScaling.ResumeProcesses( + &autoscaling.ScalingProcessQuery{ + AutoScalingGroupName: a.AutoScalingGroupName, + ScalingProcesses: []*string{aws.String("Terminate")}, + }) + if err != nil { + logger.Printf("couldn't resume termination processes on ASG %s ", a.name) + } +} func (a *autoScalingGroup) isEnabled() bool { for _, asg := range a.region.enabledASGs { if asg.name == a.name { diff --git a/core/cloudtrail.go b/core/cloudtrail.go new file mode 100644 index 00000000..6766a4e6 --- /dev/null +++ b/core/cloudtrail.go @@ -0,0 +1,19 @@ +package autospotting + +// CloudTrailEvent s used to unmarshal a CloudTrail Event from the Detail field +// of a CloudWatch event +type CloudTrailEvent struct { + EventName string `json:"eventName"` + AwsRegion string `json:"awsRegion"` + ErrorCode string `json:"errorCode"` + ErrorMessage string `json:"errorMessage"` + RequestParameters RequestParameters `json:"requestParameters"` +} + +// RequestParameters is used to unmarshal the parameters of a CloudTrail event 
+type RequestParameters struct { + LifecycleHookName string `json:"lifecycleHookName"` + InstanceID string `json:"instanceId"` + LifecycleActionResult string `json:"lifecycleActionResult"` + AutoScalingGroupName string `json:"autoScalingGroupName"` +} diff --git a/core/instance.go b/core/instance.go index 0c621ce8..59349ab8 100644 --- a/core/instance.go +++ b/core/instance.go @@ -775,7 +775,7 @@ func (i *instance) isUnattachedSpotInstanceLaunchedForAnEnabledASG() bool { return false } -func (i *instance) swapWithGroupMember() (*instance, error) { +func (i *instance) swapWithGroupMember(asg *autoScalingGroup) (*instance, error) { odInstanceID := i.getReplacementTargetInstanceID() if odInstanceID == nil { logger.Println("Couldn't find target on-demand instance of", *i.InstanceId) @@ -793,13 +793,6 @@ func (i *instance) swapWithGroupMember() (*instance, error) { return nil, fmt.Errorf("target instance %s is missing", *odInstanceID) } - asgName := i.getReplacementTargetASGName() - - if asgName == nil { - logger.Printf("Spot instance %s is missing ASG name", *i.InstanceId) - return nil, fmt.Errorf("spot instance %s is missing asg tag", *i.InstanceId) - } - if !odInstance.shouldBeReplacedWithSpot() { logger.Printf("Target on-demand instance %s shouldn't be replaced", *odInstanceID) i.terminate() @@ -807,11 +800,7 @@ func (i *instance) swapWithGroupMember() (*instance, error) { *odInstanceID) } - asg := i.region.findEnabledASGByName(*asgName) - if asgName == nil { - logger.Printf("Missing ASG data for region %s", i.region.name) - return nil, fmt.Errorf("region %s is missing asg data", i.region.name) - } + asg.suspendTerminations() max := *asg.MaxSize @@ -824,25 +813,17 @@ func (i *instance) swapWithGroupMember() (*instance, error) { defer asg.setAutoScalingMaxSize(max) } - // We use this to capture the error message from the closure below that is - // executed asynchronously. 
- c := make(chan error) - - // Attach the new spot instance in parallel with the termination of the - // running on-demand instance in order to avoid increasing the capacity enough - // for the ASG to notice it and start terminating instances. The ASGs were - // sometimes seen to just terminates newly launched spot instances so the - // group was unintentionally kept with on-demand capacity. - go func() { - if err := asg.attachSpotInstance(*i.InstanceId, true); err != nil { - logger.Printf("Spot instance %s couldn't be attached to the group %s, terminating it...", - *i.InstanceId, asg.name) - i.terminate() - c <- fmt.Errorf("couldn't attach spot instance %s ", *i.InstanceId) - } - c <- nil - }() + logger.Printf("Attaching spot instance %s to the group %s", + *i.InstanceId, asg.name) + if err := asg.attachSpotInstance(*i.InstanceId, true); err != nil { + logger.Printf("Spot instance %s couldn't be attached to the group %s, terminating it...", + *i.InstanceId, asg.name) + i.terminate() + return nil, fmt.Errorf("couldn't attach spot instance %s ", *i.InstanceId) + } + logger.Printf("Terminating on-demand instance %s from the group %s", + *odInstanceID, asg.name) if err := asg.terminateInstanceInAutoScalingGroup(odInstanceID, true, true); err != nil { logger.Printf("On-demand instance %s couldn't be terminated, re-trying...", *odInstanceID) @@ -850,7 +831,9 @@ func (i *instance) swapWithGroupMember() (*instance, error) { *odInstanceID) } - return odInstance, <-c + asg.resumeTerminations() + + return odInstance, nil } // returns an instance ID as *string, set to nil if we need to wait for the next diff --git a/core/main.go b/core/main.go index ccced096..3155587a 100644 --- a/core/main.go +++ b/core/main.go @@ -201,10 +201,10 @@ func (a *AutoSpotting) EventHandler(event *json.RawMessage) { log.Println(err.Error()) return } - + eventType := cloudwatchEvent.DetailType // If the event is for an Instance Spot Interruption - if cloudwatchEvent.DetailType == "EC2 Spot Instance 
Interruption Warning" { - log.Println("Triggered by", cloudwatchEvent.DetailType) + if eventType == "EC2 Spot Instance Interruption Warning" { + log.Println("Triggered by", eventType) if instanceID, err := getInstanceIDDueForTermination(cloudwatchEvent); err != nil { log.Println("Could't get instance ID of terminating spot instance", err.Error()) return @@ -214,8 +214,8 @@ func (a *AutoSpotting) EventHandler(event *json.RawMessage) { } // If event is Instance state change - } else if cloudwatchEvent.DetailType == "EC2 Instance State-change Notification" { - log.Println("Triggered by", cloudwatchEvent.DetailType) + } else if eventType == "EC2 Instance State-change Notification" { + log.Println("Triggered by", eventType) instanceID, state, err := parseEventData(cloudwatchEvent) if err != nil { log.Println("Could't get instance ID of newly launched instance", err.Error()) @@ -224,6 +224,9 @@ func (a *AutoSpotting) EventHandler(event *json.RawMessage) { a.handleNewInstanceLaunch(cloudwatchEvent.Region, *instanceID, *state) } + } else if eventType == "AWS API Call via CloudTrail" { + log.Println("Triggered by", eventType) + a.handleLifecycleHookEvent(cloudwatchEvent) } else { // Cron Scheduling a.ProcessCronEvent() @@ -231,6 +234,97 @@ func (a *AutoSpotting) EventHandler(event *json.RawMessage) { } +func isValidLifecycleHookEvent(ctEvent CloudTrailEvent) bool { + return ctEvent.EventName == "CompleteLifecycleAction" && + ctEvent.ErrorCode == "ValidationException" && + ctEvent.RequestParameters.LifecycleActionResult == "CONTINUE" && + strings.HasPrefix(ctEvent.ErrorMessage, "No active Lifecycle Action found with instance ID") +} + +func (a *AutoSpotting) handleLifecycleHookEvent(event events.CloudWatchEvent) error { + var ctEvent CloudTrailEvent + + // Try to parse the event.Detail as Cloudwatch Event Rule + if err := json.Unmarshal(event.Detail, &ctEvent); err != nil { + log.Println(err.Error()) + return err + } + logger.Printf("CloudTrail Event data: %#v", ctEvent) + + 
regionName := ctEvent.AwsRegion + instanceID := ctEvent.RequestParameters.InstanceID + eventASGName := ctEvent.RequestParameters.AutoScalingGroupName + + if !isValidLifecycleHookEvent(ctEvent) { + return fmt.Errorf("unexpected event: %#v", ctEvent) + } + + r := region{name: regionName, conf: a.config, services: connections{}} + + if !r.enabled() { + return fmt.Errorf("region %s is not enabled", r.name) + } + r.services.connect(regionName) + r.setupAsgFilters() + r.scanForEnabledAutoScalingGroups() + + if err := r.scanInstance(aws.String(instanceID)); err != nil { + logger.Printf("%s Couldn't scan instance %s: %s", regionName, + instanceID, err.Error()) + return err + } + + i := r.instances.get(instanceID) + if i == nil { + logger.Printf("%s Instance %s is missing, skipping...", + regionName, instanceID) + return errors.New("instance missing") + } + logger.Printf("%s Found instance %s in state %s", + i.region.name, *i.InstanceId, *i.State.Name) + + if *i.State.Name != "running" { + logger.Printf("%s Instance %s is not in the running state", + i.region.name, *i.InstanceId) + return errors.New("instance not in running state") + } + + unattached := i.isUnattachedSpotInstanceLaunchedForAnEnabledASG() + if !unattached { + logger.Printf("%s Instance %s is already attached to an ASG, skipping it", + i.region.name, *i.InstanceId) + return nil + } + + asgName := i.getReplacementTargetASGName() + + if asgName == nil || *asgName != eventASGName { + logger.Printf("event ASG name doesn't match the ASG name set on the tags " + + "of the unattached spot instance") + return fmt.Errorf("ASG name mismatch: event ASG name %s doesn't match the "+ + "ASG name set on the unattached spot instance %s", eventASGName, *asgName) + } + + asg := i.region.findEnabledASGByName(*asgName) + + if asg == nil { + logger.Printf("Missing ASG data for region %s", i.region.name) + return fmt.Errorf("region %s is missing asg data", i.region.name) + } + + logger.Printf("%s Found instance %s is not yet 
attached to its ASG, "+ + "attempting to swap it against a running on-demand instance", + i.region.name, *i.InstanceId) + + if _, err := i.swapWithGroupMember(asg); err != nil { + logger.Printf("%s, couldn't perform spot replacement of %s ", + i.region.name, *i.InstanceId) + return err + } + + return nil +} + func (a *AutoSpotting) handleNewInstanceLaunch(regionName string, instanceID string, state string) error { r := region{name: regionName, conf: a.config, services: connections{}} @@ -291,11 +385,34 @@ func (a *AutoSpotting) handleNewInstanceLaunch(regionName string, instanceID str return nil } + asgName := i.getReplacementTargetASGName() + asg := i.region.findEnabledASGByName(*asgName) + + if asg == nil { + logger.Printf("Missing ASG data for region %s", i.region.name) + return fmt.Errorf("region %s is missing asg data", i.region.name) + } + logger.Printf("%s Found instance %s is not yet attached to its ASG, "+ "attempting to swap it against a running on-demand instance", i.region.name, *i.InstanceId) - if _, err := i.swapWithGroupMember(); err != nil { + hasHooks, err := asg.hasLaunchLifecycleHooks() + + if err != nil { + logger.Printf("%s ASG %s - couldn't describe Lifecycle Hooks", + i.region.name, *asgName) + return err + } + + if hasHooks { + logger.Printf("%s ASG %s has instance launch lifecycle hooks, skipping "+ + "instance %s until it attempts to continue the lifecycle hook itself", + i.region.name, *asgName, *i.InstanceId) + return nil + } + + if _, err := i.swapWithGroupMember(asg); err != nil { logger.Printf("%s, couldn't perform spot replacement of %s ", i.region.name, *i.InstanceId) return err From 0783910127a5d10ffa914953309433d515660508 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cristian=20M=C4=83gheru=C8=99an-Stanciu?= Date: Mon, 12 Aug 2019 18:08:03 +0200 Subject: [PATCH 04/17] Fix CloudWatch event rule --- cloudformation/stacks/AutoSpotting/regional_template.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git 
a/cloudformation/stacks/AutoSpotting/regional_template.yaml b/cloudformation/stacks/AutoSpotting/regional_template.yaml index 11fe8c25..f96d5ca9 100644 --- a/cloudformation/stacks/AutoSpotting/regional_template.yaml +++ b/cloudformation/stacks/AutoSpotting/regional_template.yaml @@ -150,10 +150,8 @@ detail-type: - "AWS API Call via CloudTrail" source: - - "aws.cloudtrail" + - "aws.autoscaling" detail: - eventSource: - - "autoscaling.amazonaws.com" eventName: - "CompleteLifecycleAction" errorCode: From 74938a42b750291e7904c37989776ca820dd5e79 Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 20 Aug 2019 12:16:41 +1000 Subject: [PATCH 05/17] Allow flavor to be customised (#359) --- docker-compose.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.yaml b/docker-compose.yaml index 31186391..289bd0fa 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -5,7 +5,7 @@ services: context: . dockerfile: Dockerfile-build environment: - - FLAVOR=nightly + - FLAVOR=${FLAVOR:-nightly} - CGO_ENABLED=0 - COVERALLS_TOKEN - TRAVIS_BUILD_NUMBER From f11b2dceed3ca25aff16e6d329d6e75ad0746fef Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 20 Aug 2019 15:36:02 +1000 Subject: [PATCH 06/17] Fix typo (#361) --- TECHNICAL_DETAILS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TECHNICAL_DETAILS.md b/TECHNICAL_DETAILS.md index 3315657e..465e2ce5 100644 --- a/TECHNICAL_DETAILS.md +++ b/TECHNICAL_DETAILS.md @@ -368,7 +368,7 @@ connection draining on the load balancer. #### Pros #### -- doesn'require any configuration changes +- doesn't require any configuration changes - instances behind ELBs are detached automatically (or start to be drained) as soon as the imminent spot termination event is received. 
- if you already have lifecycle hooks they will be executed, but in this case we From 28944e7b12c78a89bd5ea76c90e1d5816724fc32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cristian=20M=C4=83gheru=C8=99an-Stanciu?= Date: Fri, 13 Sep 2019 15:24:36 +0200 Subject: [PATCH 07/17] Increase automated testing coverage - Refactor the cron execution mode to support determining the action and running the action as independent steps, which makes the code easier to test. - switch to Go 1.13 - Suspend the AutoScaling termination process for 5 minutes - Reset the hourlySavings value between tests - Improve mocking - Print log statements during tests when the AUTOSPOTTING_DEBUG variable is set to "true" when running tests in verbose mode - change isProtectedFromTermination to return true in case of API failures. --- autospotting.go | 2 +- core/action.go | 82 +++ core/autoscaling.go | 181 +++--- core/autoscaling_test.go | 1128 ++++++++++++++++++++++++++++++++++++-- core/instance.go | 35 +- core/main_test.go | 13 +- core/mock_test.go | 14 +- core/region.go | 3 +- go.mod | 2 +- 9 files changed, 1335 insertions(+), 125 deletions(-) create mode 100644 core/action.go diff --git a/autospotting.go b/autospotting.go index 96b3aac9..7256a5ff 100644 --- a/autospotting.go +++ b/autospotting.go @@ -31,7 +31,7 @@ func main() { if err != nil { log.Fatal(err) } - Handler(nil, parseEvent) + Handler(context.TODO(), parseEvent) } else { eventHandler(nil) } diff --git a/core/action.go b/core/action.go new file mode 100644 index 00000000..4988dfb5 --- /dev/null +++ b/core/action.go @@ -0,0 +1,82 @@ +package autospotting + +type target struct { + asg *autoScalingGroup + totalInstances int64 + onDemandInstance *instance + spotInstance *instance +} + +type runer interface { + run() +} + +// No-op run +type skipRun struct { + reason string +} + +func (s skipRun) run() {} + +// enables the ASG for the new event-based logic +type enableEventHandling struct { + target target +} + +func (eeh enableEventHandling) 
run() { + eeh.target.asg.enableForInstanceLaunchEventHandling() +} + +// terminates a random spot instance after enabling the event-based logic +type terminateSpotInstance struct { + target target +} + +func (tsi terminateSpotInstance) run() { + asg := tsi.target.asg + + asg.enableForInstanceLaunchEventHandling() + asg.terminateRandomSpotInstanceIfHavingEnough( + tsi.target.totalInstances, true) +} + +// launches a spot instance replacement +type launchSpotReplacement struct { + target target +} + +func (lsr launchSpotReplacement) run() { + spotInstanceID, err := lsr.target.onDemandInstance.launchSpotReplacement() + if err != nil { + logger.Printf("Could not launch cheapest spot instance: %s", err) + return + } + logger.Printf("Successfully launched spot instance %s, exiting...", *spotInstanceID) + return +} + +type terminateUnneededSpotInstance struct { + target target +} + +func (tusi terminateUnneededSpotInstance) run() { + asg := tusi.target.asg + total := tusi.target.totalInstances + spotInstance := tusi.target.spotInstance + spotInstanceID := *spotInstance.InstanceId + + asg.terminateRandomSpotInstanceIfHavingEnough(total, true) + logger.Println("Spot instance", spotInstanceID, "is not need anymore by ASG", + asg.name, "terminating the spot instance.") + spotInstance.terminate() +} + +type swapSpotInstance struct { + target target +} + +func (ssi swapSpotInstance) run() { + asg := ssi.target.asg + spotInstanceID := *ssi.target.spotInstance.InstanceId + asg.replaceOnDemandInstanceWithSpot(nil, spotInstanceID) +} diff --git a/core/autoscaling.go b/core/autoscaling.go index d83a3513..b06f52da 100644 --- a/core/autoscaling.go +++ b/core/autoscaling.go @@ -4,6 +4,7 @@ import ( "errors" "fmt" "strings" + "sync" "time" "github.com/aws/aws-sdk-go/aws" @@ -22,16 +23,16 @@ type autoScalingGroup struct { config AutoScalingConfig } -func (a *autoScalingGroup) loadLaunchConfiguration() error { +func (a *autoScalingGroup) loadLaunchConfiguration() (*launchConfiguration, 
error) { //already done if a.launchConfiguration != nil { - return nil + return a.launchConfiguration, nil } lcName := a.LaunchConfigurationName if lcName == nil { - return errors.New("missing launch configuration") + return nil, errors.New("missing launch configuration") } svc := a.region.services.autoScaling @@ -43,54 +44,70 @@ func (a *autoScalingGroup) loadLaunchConfiguration() error { if err != nil { logger.Println(err.Error()) - return err + return nil, err } a.launchConfiguration = &launchConfiguration{ LaunchConfiguration: resp.LaunchConfigurations[0], } - return nil + return a.launchConfiguration, nil } -func (a *autoScalingGroup) needReplaceOnDemandInstances(wait bool) bool { - onDemandRunning, totalRunning := a.alreadyRunningInstanceCount(false, "") +func (a *autoScalingGroup) needReplaceOnDemandInstances() (bool, int64) { + onDemandRunning, totalRunning := a.alreadyRunningInstanceCount(false, nil) + debug.Printf("onDemandRunning=%v totalRunning=%v a.minOnDemand=%v", + onDemandRunning, totalRunning, a.minOnDemand) + if totalRunning == 0 { logger.Printf("The group %s is currently empty or in the process of launching new instances", a.name) - return true + return true, totalRunning } if onDemandRunning > a.minOnDemand { logger.Println("Currently more than enough OnDemand instances running") - return true + return true, totalRunning } + if onDemandRunning == a.minOnDemand { logger.Println("Currently OnDemand running equals to the required number, skipping run") - return false + return false, totalRunning } logger.Println("Currently fewer OnDemand instances than required !") - if a.allInstancesRunning() && a.instances.count64() >= *a.DesiredCapacity { - logger.Println("All instances are running and desired capacity is satisfied") - if randomSpot := a.getAnySpotInstance(); randomSpot != nil { - if totalRunning == 1 { - logger.Println("Warning: blocking replacement of very last instance - consider raising ASG to >= 2") - } else { - logger.Println("Terminating 
a random spot instance", - *randomSpot.Instance.InstanceId) - switch a.config.TerminationMethod { - case DetachTerminationMethod: - randomSpot.terminate() - default: - a.terminateInstanceInAutoScalingGroup(randomSpot.Instance.InstanceId, wait, false) - } - } - } + return false, totalRunning +} + +func (a *autoScalingGroup) terminateRandomSpotInstanceIfHavingEnough(totalRunning int64, wait bool) error { + + if totalRunning == 1 { + logger.Println("Warning: blocking replacement of very last instance - consider raising ASG to >= 2") + return nil + } + + if a.allInstancesRunning() && a.instances.count64() < *a.DesiredCapacity { + logger.Println("Not enough capacity in the group") + return nil + } + + randomSpot := a.getAnySpotInstance() + if randomSpot == nil { + logger.Println("Couldn't pick a random spot instance") + return nil + } + + logger.Println("Terminating randomly-selected spot instance", + *randomSpot.Instance.InstanceId) + + switch a.config.TerminationMethod { + case DetachTerminationMethod: + return randomSpot.terminate() + default: + return a.terminateInstanceInAutoScalingGroup(randomSpot.Instance.InstanceId, wait, false) } - return false } func (a *autoScalingGroup) allInstancesRunning() bool { - _, totalRunning := a.alreadyRunningInstanceCount(false, "") + _, totalRunning := a.alreadyRunningInstanceCount(false, nil) return totalRunning == a.instances.count64() } @@ -110,6 +127,7 @@ func (a *autoScalingGroup) licensedToRun() (bool, error) { as.hourlySavings += savings monthlySavings := as.hourlySavings * 24 * 30 + if (monthlySavings > 1000) && strings.Contains(a.region.conf.Version, "nightly") && a.region.conf.LicenseType == "evaluation" { @@ -120,7 +138,7 @@ func (a *autoScalingGroup) licensedToRun() (bool, error) { return true, nil } -func (a *autoScalingGroup) processCronEvent() { +func (a *autoScalingGroup) cronEventAction() runer { a.scanInstances() a.loadDefaultConfig() @@ -136,12 +154,12 @@ func (a *autoScalingGroup) processCronEvent() { if 
!shouldRun { logger.Println(a.region.name, a.name, "Skipping run, outside the enabled cron run schedule") - return + return skipRun{reason: "outside-cron-schedule"} } - if ok, err := a.licensedToRun(); !ok { + if licensed, err := a.licensedToRun(); !licensed { logger.Println(a.region.name, a.name, "Skipping group, license limit reached:", err.Error()) - return + return skipRun{reason: "over-license"} } if spotInstance == nil { @@ -152,51 +170,49 @@ func (a *autoScalingGroup) processCronEvent() { if onDemandInstance == nil { logger.Println(a.region.name, a.name, "No running unprotected on-demand instances were found, nothing to do here...") - a.enableForInstanceLaunchEventHandling() - return + + return enableEventHandling{target{asg: a}} } - if !a.needReplaceOnDemandInstances(true) { + if need, total := a.needReplaceOnDemandInstances(); !need { logger.Printf("Not allowed to replace any more of the running OD instances in %s", a.name) - a.enableForInstanceLaunchEventHandling() - return + return terminateSpotInstance{target{asg: a, totalInstances: total}} } a.loadLaunchConfiguration() - spotInstanceID, err := onDemandInstance.launchSpotReplacement() - if err != nil { - logger.Printf("Could not launch cheapest spot instance: %s", err) - return - } - logger.Printf("Successfully launched spot instance %s, exiting...", *spotInstanceID) - return + return launchSpotReplacement{target{ + onDemandInstance: onDemandInstance}} } spotInstanceID := *spotInstance.InstanceId logger.Println("Found unattached spot instance", spotInstanceID) - if !a.needReplaceOnDemandInstances(true) || !shouldRun { - logger.Println("Spot instance", spotInstanceID, "is not need anymore by ASG", - a.name, "terminating the spot instance.") - spotInstance.terminate() - return + if need, total := a.needReplaceOnDemandInstances(); !need || !shouldRun { + + return terminateUnneededSpotInstance{ + target{ + asg: a, + spotInstance: spotInstance, + totalInstances: total, + }} } if 
!spotInstance.isReadyToAttach(a) { logger.Printf("Spot instance %s not yet ready, waiting for next run while processing %s", spotInstanceID, a.name) - return + return skipRun{"spot instance replacement exists but not ready"} } logger.Println(a.region.name, "Found spot instance:", spotInstanceID, "Attaching it to", a.name) - a.replaceOnDemandInstanceWithSpot(nil, spotInstanceID) - + return swapSpotInstance{target{ + asg: a, + spotInstance: spotInstance}} } -func (a *autoScalingGroup) enableForInstanceLaunchEventHandling() { +func (a *autoScalingGroup) enableForInstanceLaunchEventHandling() bool { logger.Printf("Enabling group %s for the event-based instance replacement logic", a.name) @@ -204,11 +220,13 @@ func (a *autoScalingGroup) enableForInstanceLaunchEventHandling() { if *tag.Key == EnableInstanceLaunchEventHandlingTag { logger.Printf("Tag %s is already set on the group %s, current value is %s", EnableInstanceLaunchEventHandlingTag, a.name, *tag.Value) - return + return true } } svc := a.region.services.autoScaling + fmt.Printf("%#v", svc) + _, err := svc.CreateOrUpdateTags(&autoscaling.CreateOrUpdateTagsInput{ Tags: []*autoscaling.Tag{ { @@ -222,15 +240,16 @@ func (a *autoScalingGroup) enableForInstanceLaunchEventHandling() { }) if err != nil { logger.Println("Failed to enable ASG for event-based instance replacement:", err.Error()) + return false } + return true } func (a *autoScalingGroup) isEnabledForEventBasedInstanceReplacement() bool { if time.Now().Sub(*a.CreatedTime) < time.Hour { logger.Println("ASG %s is newer than an hour, enabling it for event-based "+ "instance replacement", a.name) - a.enableForInstanceLaunchEventHandling() - return true + return a.enableForInstanceLaunchEventHandling() } for _, tag := range a.Tags { @@ -351,7 +370,12 @@ func (a *autoScalingGroup) getInstance( continue } - if considerInstanceProtection && (i.isProtectedFromScaleIn() || i.isProtectedFromTermination()) { + protT, err := i.isProtectedFromTermination() + if err != 
nil { + debug.Println(a.name, "failed to determine termination protection for", *i.InstanceId) + } + + if considerInstanceProtection && (i.isProtectedFromScaleIn() || protT) { debug.Println(a.name, "skipping protected instance", *i.InstanceId) continue } @@ -640,7 +664,7 @@ func (a *autoScalingGroup) hasLaunchLifecycleHooks() (bool, error) { // Counts the number of already running instances on-demand or spot, in any or a specific AZ. func (a *autoScalingGroup) alreadyRunningInstanceCount( - spot bool, availabilityZone string) (int64, int64) { + spot bool, availabilityZone *string) (int64, int64) { var total, count int64 instanceCategory := "spot" @@ -650,25 +674,22 @@ func (a *autoScalingGroup) alreadyRunningInstanceCount( } logger.Println(a.name, "Counting already running on demand instances ") for inst := range a.instances.instances() { + if *inst.Instance.State.Name == "running" { - // Count running Spot instances - if spot && inst.isSpot() && - (*inst.Placement.AvailabilityZone == availabilityZone || availabilityZone == "") { - count++ - // Count running OnDemand instances - } else if !spot && !inst.isSpot() && - (*inst.Placement.AvailabilityZone == availabilityZone || availabilityZone == "") { - count++ - } // Count total running instances total++ + if availabilityZone == nil || *inst.Placement.AvailabilityZone == *availabilityZone { + if (spot && inst.isSpot()) || (!spot && !inst.isSpot()) { + count++ + } + } } } logger.Println(a.name, "Found", count, instanceCategory, "instances running on a total of", total) return count, total } -func (a *autoScalingGroup) suspendTerminations() { +func (a *autoScalingGroup) suspendTerminationProcess() { logger.Printf("Suspending termination processes on ASG %s", a.name) for _, process := range a.SuspendedProcesses { @@ -688,7 +709,7 @@ func (a *autoScalingGroup) suspendTerminations() { } } -func (a *autoScalingGroup) resumeTerminations() { +func (a *autoScalingGroup) resumeTerminationProcess() { logger.Printf("Resuming 
termination processes on ASG %s", a.name) _, err := a.region.services.autoScaling.ResumeProcesses( @@ -708,3 +729,25 @@ func (a *autoScalingGroup) isEnabled() bool { } return false } + +func (a *autoScalingGroup) temporarilySuspendTerminations(wg *sync.WaitGroup) { + wg.Add(1) + defer wg.Done() + + if a.isTerminationSuspended() { + return + } + + a.suspendTerminationProcess() + time.Sleep(300 * time.Second * a.region.conf.SleepMultiplier) + a.resumeTerminationProcess() +} + +func (a *autoScalingGroup) isTerminationSuspended() bool { + for _, process := range a.SuspendedProcesses { + if *process.ProcessName == "Terminate" { + return true + } + } + return false +} diff --git a/core/autoscaling_test.go b/core/autoscaling_test.go index 19d778f6..bd0dc195 100644 --- a/core/autoscaling_test.go +++ b/core/autoscaling_test.go @@ -5,6 +5,7 @@ import ( "fmt" "reflect" "testing" + "time" "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/service/autoscaling" @@ -18,7 +19,7 @@ func TestAlreadyRunningInstanceCount(t *testing.T) { asgName string asgInstances instances spot bool - availabilityZone string + availabilityZone *string expectedCount int64 expectedTotal int64 }{ @@ -26,7 +27,7 @@ func TestAlreadyRunningInstanceCount(t *testing.T) { asgName: "test-asg", asgInstances: makeInstances(), spot: true, - availabilityZone: "", + availabilityZone: nil, expectedCount: 0, expectedTotal: 0, }, @@ -44,7 +45,7 @@ func TestAlreadyRunningInstanceCount(t *testing.T) { }, ), spot: true, - availabilityZone: "", + availabilityZone: nil, expectedCount: 0, expectedTotal: 0, }, @@ -69,7 +70,7 @@ func TestAlreadyRunningInstanceCount(t *testing.T) { }, ), spot: true, - availabilityZone: "", + availabilityZone: nil, expectedCount: 0, expectedTotal: 2, }, @@ -94,7 +95,7 @@ func TestAlreadyRunningInstanceCount(t *testing.T) { }, ), spot: false, - availabilityZone: "", + availabilityZone: nil, expectedCount: 0, expectedTotal: 2, }, @@ -119,7 +120,7 @@ func TestAlreadyRunningInstanceCount(t 
*testing.T) { }, ), spot: false, - availabilityZone: "eu-west-1c", + availabilityZone: aws.String("eu-west-1c"), expectedCount: 0, expectedTotal: 2, }, @@ -144,7 +145,7 @@ func TestAlreadyRunningInstanceCount(t *testing.T) { }, ), spot: false, - availabilityZone: "eu-west-1b", + availabilityZone: aws.String("eu-west-1b"), expectedCount: 1, expectedTotal: 2, }, @@ -169,7 +170,7 @@ func TestAlreadyRunningInstanceCount(t *testing.T) { }, ), spot: true, - availabilityZone: "eu-west-1c", + availabilityZone: aws.String("eu-west-1c"), expectedCount: 0, expectedTotal: 2, }, @@ -194,7 +195,7 @@ func TestAlreadyRunningInstanceCount(t *testing.T) { }, ), spot: true, - availabilityZone: "", + availabilityZone: nil, expectedCount: 2, expectedTotal: 2, }, @@ -219,7 +220,7 @@ func TestAlreadyRunningInstanceCount(t *testing.T) { }, ), spot: true, - availabilityZone: "", + availabilityZone: nil, expectedCount: 0, expectedTotal: 1, }, @@ -244,7 +245,7 @@ func TestAlreadyRunningInstanceCount(t *testing.T) { }, ), spot: false, - availabilityZone: "", + availabilityZone: nil, expectedCount: 1, expectedTotal: 2, }, @@ -269,7 +270,7 @@ func TestAlreadyRunningInstanceCount(t *testing.T) { }, ), spot: false, - availabilityZone: "", + availabilityZone: nil, expectedCount: 0, expectedTotal: 1, }, @@ -887,9 +888,37 @@ func TestLoadLaunchConfiguration(t *testing.T) { name string nameLC *string regionASG *region + groupLC *launchConfiguration expectedLC *launchConfiguration expectedErr error }{ + {name: "launch configuration already exists", + groupLC: &launchConfiguration{ + LaunchConfiguration: &autoscaling.LaunchConfiguration{ + LaunchConfigurationName: aws.String("foo"), + }, + }, + // nameLC: aws.String("foo"), + // regionASG: ®ion{ + // services: connections{ + // autoScaling: mockASG{ + // dlco: &autoscaling.DescribeLaunchConfigurationsOutput{ + // LaunchConfigurations: []*autoscaling.LaunchConfiguration{ + // { + // LaunchConfigurationName: aws.String("foo"), + // }, + // }, + // }, + 
// }, + // }, + // }, + expectedErr: nil, + expectedLC: &launchConfiguration{ + LaunchConfiguration: &autoscaling.LaunchConfiguration{ + LaunchConfigurationName: aws.String("foo"), + }, + }, + }, {name: "nil launch configuration name", nameLC: nil, regionASG: ®ion{ @@ -941,14 +970,13 @@ func TestLoadLaunchConfiguration(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { a := autoScalingGroup{ - - region: tt.regionASG, + launchConfiguration: tt.groupLC, + region: tt.regionASG, Group: &autoscaling.Group{ LaunchConfigurationName: tt.nameLC, }, } - err := a.loadLaunchConfiguration() - lc := a.launchConfiguration + lc, err := a.loadLaunchConfiguration() if !reflect.DeepEqual(tt.expectedErr, err) { t.Errorf("loadLaunchConfiguration received error status: %+v expected %+v", @@ -1273,7 +1301,7 @@ func TestGetOnDemandInstanceInAZ(t *testing.T) { }, { - name: "ASG has 'running' instance in AZ ad we we get error when trying to determine termination protection", + name: "ASG has 'running' instance in AZ but we we get error when trying to determine termination protection", asgInstances: makeInstancesWithCatalog( instanceMap{ "spot-stopped": { @@ -1317,24 +1345,9 @@ func TestGetOnDemandInstanceInAZ(t *testing.T) { }, }, ), - az: aws.String("1b"), - expected: &instance{ - Instance: &ec2.Instance{ - InstanceId: aws.String("ondemand-running"), - State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameRunning)}, - Placement: &ec2.Placement{AvailabilityZone: aws.String("1b")}, - InstanceLifecycle: aws.String(""), - }, - region: ®ion{ - services: connections{ - ec2: mockEC2{ - diaerr: errors.New("error when determining instance termination protection"), - }, - }, - }, - }, + az: aws.String("1b"), + expected: nil, }, - { name: "ASG has 'running' but protected from termination instance in AZ", asgInstances: makeInstancesWithCatalog( @@ -1599,6 +1612,17 @@ func TestGetAnyOnDemandInstance(t *testing.T) { Placement: &ec2.Placement{AvailabilityZone: 
aws.String("1b")}, InstanceLifecycle: aws.String(""), }, + region: ®ion{ + services: connections{ + ec2: mockEC2{ + diao: &ec2.DescribeInstanceAttributeOutput{ + DisableApiTermination: &ec2.AttributeBooleanValue{ + Value: aws.Bool(false), + }, + }, + }, + }, + }, }, }, ), @@ -1608,7 +1632,19 @@ func TestGetAnyOnDemandInstance(t *testing.T) { State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameRunning)}, Placement: &ec2.Placement{AvailabilityZone: aws.String("1b")}, InstanceLifecycle: aws.String(""), - }}, + }, + region: ®ion{ + services: connections{ + ec2: mockEC2{ + diao: &ec2.DescribeInstanceAttributeOutput{ + DisableApiTermination: &ec2.AttributeBooleanValue{ + Value: aws.Bool(false), + }, + }, + }, + }, + }, + }, }, }, {name: "ASG has multiple 'running' OnDemand instances", @@ -1637,6 +1673,17 @@ func TestGetAnyOnDemandInstance(t *testing.T) { Placement: &ec2.Placement{AvailabilityZone: aws.String("1c")}, InstanceLifecycle: aws.String(""), }, + region: ®ion{ + services: connections{ + ec2: mockEC2{ + diao: &ec2.DescribeInstanceAttributeOutput{ + DisableApiTermination: &ec2.AttributeBooleanValue{ + Value: aws.Bool(false), + }, + }, + }, + }, + }, }, "ondemand-running2": { Instance: &ec2.Instance{ @@ -1645,9 +1692,20 @@ func TestGetAnyOnDemandInstance(t *testing.T) { Placement: &ec2.Placement{AvailabilityZone: aws.String("1b")}, InstanceLifecycle: aws.String(""), }, + + region: ®ion{ + services: connections{ + ec2: mockEC2{ + diao: &ec2.DescribeInstanceAttributeOutput{ + DisableApiTermination: &ec2.AttributeBooleanValue{ + Value: aws.Bool(false), + }, + }, + }, + }, + }, }, - }, - ), + }), expected: []*instance{ { Instance: &ec2.Instance{ @@ -1656,6 +1714,17 @@ func TestGetAnyOnDemandInstance(t *testing.T) { Placement: &ec2.Placement{AvailabilityZone: aws.String("1b")}, InstanceLifecycle: aws.String(""), }, + region: ®ion{ + services: connections{ + ec2: mockEC2{ + diao: &ec2.DescribeInstanceAttributeOutput{ + DisableApiTermination: 
&ec2.AttributeBooleanValue{ + Value: aws.Bool(false), + }, + }, + }, + }, + }, }, { Instance: &ec2.Instance{ @@ -1664,6 +1733,17 @@ func TestGetAnyOnDemandInstance(t *testing.T) { Placement: &ec2.Placement{AvailabilityZone: aws.String("1c")}, InstanceLifecycle: aws.String(""), }, + region: ®ion{ + services: connections{ + ec2: mockEC2{ + diao: &ec2.DescribeInstanceAttributeOutput{ + DisableApiTermination: &ec2.AttributeBooleanValue{ + Value: aws.Bool(false), + }, + }, + }, + }, + }, }, }, }, @@ -1733,6 +1813,17 @@ func TestGetAnySpotInstance(t *testing.T) { Placement: &ec2.Placement{AvailabilityZone: aws.String("1c")}, InstanceLifecycle: aws.String(""), }, + region: ®ion{ + services: connections{ + ec2: mockEC2{ + diao: &ec2.DescribeInstanceAttributeOutput{ + DisableApiTermination: &ec2.AttributeBooleanValue{ + Value: aws.Bool(false), + }, + }, + }, + }, + }, }, }, ), @@ -1756,6 +1847,17 @@ func TestGetAnySpotInstance(t *testing.T) { Placement: &ec2.Placement{AvailabilityZone: aws.String("1b")}, InstanceLifecycle: aws.String("spot"), }, + region: ®ion{ + services: connections{ + ec2: mockEC2{ + diao: &ec2.DescribeInstanceAttributeOutput{ + DisableApiTermination: &ec2.AttributeBooleanValue{ + Value: aws.Bool(false), + }, + }, + }, + }, + }, }, "ondemand-stopped": { Instance: &ec2.Instance{ @@ -1772,6 +1874,17 @@ func TestGetAnySpotInstance(t *testing.T) { Placement: &ec2.Placement{AvailabilityZone: aws.String("1b")}, InstanceLifecycle: aws.String(""), }, + region: ®ion{ + services: connections{ + ec2: mockEC2{ + diao: &ec2.DescribeInstanceAttributeOutput{ + DisableApiTermination: &ec2.AttributeBooleanValue{ + Value: aws.Bool(false), + }, + }, + }, + }, + }, }, }, ), @@ -1781,7 +1894,19 @@ func TestGetAnySpotInstance(t *testing.T) { State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameRunning)}, Placement: &ec2.Placement{AvailabilityZone: aws.String("1b")}, InstanceLifecycle: aws.String("spot"), - }}, + }, + region: ®ion{ + services: connections{ + ec2: 
mockEC2{ + diao: &ec2.DescribeInstanceAttributeOutput{ + DisableApiTermination: &ec2.AttributeBooleanValue{ + Value: aws.Bool(false), + }, + }, + }, + }, + }, + }, }, }, {name: "ASG has multiple 'running' Spot instances", @@ -1793,6 +1918,16 @@ func TestGetAnySpotInstance(t *testing.T) { State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameRunning)}, Placement: &ec2.Placement{AvailabilityZone: aws.String("1a")}, InstanceLifecycle: aws.String("spot"), + }, region: ®ion{ + services: connections{ + ec2: mockEC2{ + diao: &ec2.DescribeInstanceAttributeOutput{ + DisableApiTermination: &ec2.AttributeBooleanValue{ + Value: aws.Bool(false), + }, + }, + }, + }, }, }, "spot-running2": { @@ -1802,6 +1937,17 @@ func TestGetAnySpotInstance(t *testing.T) { Placement: &ec2.Placement{AvailabilityZone: aws.String("1b")}, InstanceLifecycle: aws.String("spot"), }, + region: ®ion{ + services: connections{ + ec2: mockEC2{ + diao: &ec2.DescribeInstanceAttributeOutput{ + DisableApiTermination: &ec2.AttributeBooleanValue{ + Value: aws.Bool(false), + }, + }, + }, + }, + }, }, "ondemand-stopped": { Instance: &ec2.Instance{ @@ -1817,6 +1963,16 @@ func TestGetAnySpotInstance(t *testing.T) { State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameRunning)}, Placement: &ec2.Placement{AvailabilityZone: aws.String("1b")}, InstanceLifecycle: aws.String(""), + }, region: ®ion{ + services: connections{ + ec2: mockEC2{ + diao: &ec2.DescribeInstanceAttributeOutput{ + DisableApiTermination: &ec2.AttributeBooleanValue{ + Value: aws.Bool(false), + }, + }, + }, + }, }, }, }, @@ -1828,6 +1984,16 @@ func TestGetAnySpotInstance(t *testing.T) { State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameRunning)}, Placement: &ec2.Placement{AvailabilityZone: aws.String("1a")}, InstanceLifecycle: aws.String("spot"), + }, region: ®ion{ + services: connections{ + ec2: mockEC2{ + diao: &ec2.DescribeInstanceAttributeOutput{ + DisableApiTermination: &ec2.AttributeBooleanValue{ + Value: 
aws.Bool(false), + }, + }, + }, + }, }, }, { @@ -1837,6 +2003,17 @@ func TestGetAnySpotInstance(t *testing.T) { Placement: &ec2.Placement{AvailabilityZone: aws.String("1b")}, InstanceLifecycle: aws.String("spot"), }, + region: ®ion{ + services: connections{ + ec2: mockEC2{ + diao: &ec2.DescribeInstanceAttributeOutput{ + DisableApiTermination: &ec2.AttributeBooleanValue{ + Value: aws.Bool(false), + }, + }, + }, + }, + }, }, }, }, @@ -2176,7 +2353,7 @@ func TestReplaceOnDemandInstanceWithSpot(t *testing.T) { "spot-running": { Instance: &ec2.Instance{ InstanceId: aws.String("spot-running"), - State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameStopped)}, + State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameRunning)}, Placement: &ec2.Placement{AvailabilityZone: aws.String("1z")}, InstanceLifecycle: aws.String("spot"), }, @@ -2194,14 +2371,106 @@ func TestReplaceOnDemandInstanceWithSpot(t *testing.T) { }, }, }, + + {name: "found OnDemand instance in asg, without lifecycle hooks", + spotID: "spot-running", + expected: errors.New(""), + asg: &autoScalingGroup{ + name: "test-asg", + Group: &autoscaling.Group{ + MaxSize: aws.Int64(4), + MinSize: aws.Int64(1), + DesiredCapacity: aws.Int64(2), + }, + instances: makeInstancesWithCatalog(instanceMap{ + "on-demand-running": { + Instance: &ec2.Instance{ + InstanceId: aws.String("on-demand-running"), + State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameRunning)}, + Placement: &ec2.Placement{AvailabilityZone: aws.String("1z")}, + }, + region: ®ion{ + services: connections{ + ec2: &mockEC2{ + tio: nil, + tierr: nil, + }, + }, + }, + }, + }), + region: ®ion{ + name: "test-region", + conf: &Config{}, + services: connections{ + autoScaling: &mockASG{ + uasgo: nil, + uasgerr: nil, + dio: nil, + dierr: nil, + tiiasgo: nil, + tiiasgerr: nil, + dlho: &autoscaling.DescribeLifecycleHooksOutput{ + LifecycleHooks: []*autoscaling.LifecycleHook{}, + }, + }, + ec2: &mockEC2{}, + }, + instances: 
makeInstancesWithCatalog( + instanceMap{ + "on-demand-running": { + Instance: &ec2.Instance{ + InstanceId: aws.String("on-demand-running"), + State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameRunning)}, + Placement: &ec2.Placement{AvailabilityZone: aws.String("1z")}, + }, + region: ®ion{ + services: connections{ + ec2: &mockEC2{ + tio: nil, + tierr: nil, + }, + }, + }, + }, + + "spot-running": { + Instance: &ec2.Instance{ + InstanceId: aws.String("spot-running"), + State: &ec2.InstanceState{Name: aws.String(ec2.InstanceStateNameRunning)}, + Placement: &ec2.Placement{AvailabilityZone: aws.String("1z")}, + InstanceLifecycle: aws.String("spot"), + }, + region: ®ion{ + services: connections{ + ec2: &mockEC2{ + tio: nil, + tierr: nil, + diaerr: nil, + diao: &ec2.DescribeInstanceAttributeOutput{ + DisableApiTermination: &ec2.AttributeBooleanValue{ + Value: aws.Bool(false), + }, + }, + }, + }, + }, + }, + }, + ), + }, + }, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { + fmt.Println(tt.name) returned := tt.asg.replaceOnDemandInstanceWithSpot(nil, tt.spotID) CheckErrors(t, returned, tt.expected) }) t.Run(tt.name+"-detach-method", func(t *testing.T) { + fmt.Println(tt.name) tt.asg.config.TerminationMethod = "detach" returned := tt.asg.replaceOnDemandInstanceWithSpot(nil, tt.spotID) CheckErrors(t, returned, tt.expected) @@ -3134,6 +3403,7 @@ func Test_autoScalingGroup_licensedToRun(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { a := tt.asg + as.hourlySavings = 0 got, err := a.licensedToRun() if (err != nil) != tt.wantErr { @@ -3143,6 +3413,7 @@ func Test_autoScalingGroup_licensedToRun(t *testing.T) { if got != tt.want { t.Errorf("autoScalingGroup.licensedToRun() = %v, want %v", got, tt.want) } + as.hourlySavings = 0 }) } } @@ -3236,3 +3507,782 @@ func Test_autoScalingGroup_calculateHourlySavings(t *testing.T) { }) } } + +func Test_autoScalingGroup_needReplaceOnDemandInstances(t *testing.T) { + + tests := 
[]struct { + name string + instances instances + minOnDemand int64 + wantNeed bool + wantTotal int64 + }{ + { + name: "empty group", + instances: makeInstancesWithCatalog(instanceMap{}), + wantNeed: true, + wantTotal: 0, + }, + { + name: "group with more on-demand instances than enough", + minOnDemand: 0, + instances: makeInstancesWithCatalog(instanceMap{ + "i-1": &instance{ + Instance: &ec2.Instance{ + State: &ec2.InstanceState{ + Name: aws.String("running"), + }, + }, + }, + }), + wantNeed: true, + wantTotal: 1, + }, + { + name: "group with as much on-demand instances as desired", + minOnDemand: 1, + instances: makeInstancesWithCatalog(instanceMap{ + "i-1": &instance{ + Instance: &ec2.Instance{ + State: &ec2.InstanceState{ + Name: aws.String("running"), + }, + }, + }, + }), + wantNeed: false, + wantTotal: 1, + }, + { + name: "group with fewer on-demand instances than desired", + minOnDemand: 2, + instances: makeInstancesWithCatalog(instanceMap{ + "i-1": &instance{ + Instance: &ec2.Instance{ + State: &ec2.InstanceState{ + Name: aws.String("running"), + }, + }, + }, + }), + wantNeed: false, + wantTotal: 1, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + a := &autoScalingGroup{ + instances: tt.instances, + minOnDemand: tt.minOnDemand, + } + gotNeed, gotTotal := a.needReplaceOnDemandInstances() + if gotNeed != tt.wantNeed { + t.Errorf("autoScalingGroup.needReplaceOnDemandInstances() got = %v, want %v", gotNeed, tt.wantNeed) + + } + if gotTotal != tt.wantTotal { + t.Errorf("autoScalingGroup.needReplaceOnDemandInstances() got1 = %v, want %v", gotTotal, tt.wantTotal) + } + }) + } +} + +func Test_autoScalingGroup_terminateRandomSpotInstanceIfHavingEnough(t *testing.T) { + + tests := []struct { + name string + group *autoscaling.Group + asgName string + region *region + launchConfiguration *launchConfiguration + instances instances + minOnDemand int64 + config AutoScalingConfig + totalRunning int64 + wait bool + wantErr bool + }{ + { + name: 
"last running instance", + totalRunning: 1, + wantErr: false, + }, + + {name: "not enough running capacity in the group", + group: &autoscaling.Group{ + DesiredCapacity: aws.Int64(2), + }, + instances: makeInstancesWithCatalog(instanceMap{ + "i-foo": &instance{ + Instance: &ec2.Instance{ + InstanceId: aws.String("i-foo"), + State: &ec2.InstanceState{ + Name: aws.String("running"), + }, + }, + }, + }), + wantErr: false, + }, + + {name: "no spot capacity in the group", + group: &autoscaling.Group{ + DesiredCapacity: aws.Int64(1), + }, + instances: makeInstancesWithCatalog(instanceMap{ + "i-f00": &instance{ + Instance: &ec2.Instance{ + InstanceId: aws.String("i-foo"), + State: &ec2.InstanceState{ + Name: aws.String("running"), + }, + }, + }, + }), + wantErr: false, + }, + + {name: "spot capacity exists in the group, terminating using the default termination method", + group: &autoscaling.Group{ + DesiredCapacity: aws.Int64(1), + }, + instances: makeInstancesWithCatalog(instanceMap{ + "i-foo": &instance{ + Instance: &ec2.Instance{ + InstanceId: aws.String("i-foo"), + InstanceLifecycle: aws.String("spot"), + State: &ec2.InstanceState{ + Name: aws.String("running"), + }, + }, + region: ®ion{ + services: connections{ + ec2: mockEC2{ + diao: &ec2.DescribeInstanceAttributeOutput{}, + }, + }, + }, + }, + }), + region: ®ion{ + services: connections{ + ec2: mockEC2{ + tierr: errors.New("dummy error"), + }, + autoScaling: mockASG{ + dlho: &autoscaling.DescribeLifecycleHooksOutput{}, + }, + }, + }, + wantErr: false, + }, + + {name: "spot capacity exists in the group, terminating using the non-default termination method", + group: &autoscaling.Group{ + DesiredCapacity: aws.Int64(1), + }, + instances: makeInstancesWithCatalog(instanceMap{ + "i-f00": &instance{ + Instance: &ec2.Instance{ + InstanceId: aws.String("i-foo"), + InstanceLifecycle: aws.String("spot"), + State: &ec2.InstanceState{ + Name: aws.String("running"), + }, + }, + region: ®ion{ + services: connections{ + ec2: 
mockEC2{ + tierr: nil, + }, + }, + }, + }, + }), + region: ®ion{ + services: connections{ + autoScaling: mockASG{ + dlho: &autoscaling.DescribeLifecycleHooksOutput{}, + }, + }, + }, + config: AutoScalingConfig{ + TerminationMethod: DetachTerminationMethod, + }, + wantErr: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + a := &autoScalingGroup{ + Group: tt.group, + name: tt.asgName, + region: tt.region, + launchConfiguration: tt.launchConfiguration, + instances: tt.instances, + minOnDemand: tt.minOnDemand, + config: tt.config, + } + if err := a.terminateRandomSpotInstanceIfHavingEnough(tt.totalRunning, tt.wait); (err != nil) != tt.wantErr { + t.Errorf("autoScalingGroup.terminateRandomSpotInstanceIfHavingEnough() error = %v, wantErr %v", err, tt.wantErr) + } + }) + } +} + +func Test_autoScalingGroup_cronEventAction(t *testing.T) { + asgNoInstances := autoScalingGroup{ + Group: &autoscaling.Group{}, + region: ®ion{ + conf: &Config{ + AutoScalingConfig: AutoScalingConfig{ + CronSchedule: "* *", + CronScheduleState: "on", + }, + LicenseType: "evaluation", + Version: "custom", + }, + instances: makeInstancesWithCatalog( + instanceMap{}), + }, + } + + asgTerminateSpotInstance := autoScalingGroup{ + + Group: &autoscaling.Group{ + Instances: []*autoscaling.Instance{ + { + InstanceId: aws.String("i-ondemand"), + ProtectedFromScaleIn: aws.Bool(false), + }, + }, + }, + region: ®ion{ + conf: &Config{ + AutoScalingConfig: AutoScalingConfig{ + CronSchedule: "* *", + CronScheduleState: "on", + MinOnDemandNumber: 1, + }, + LicenseType: "custom", + Version: "nightly", + }, + instances: makeInstancesWithCatalog( + instanceMap{ + "i-ondemand": { + Instance: &ec2.Instance{ + InstanceId: aws.String("i-ondemand"), + + Placement: &ec2.Placement{ + AvailabilityZone: aws.String("us-east-1a"), + }, + State: &ec2.InstanceState{ + Name: aws.String(ec2.InstanceStateNameRunning), + }, + }, + price: 30.3, + typeInfo: instanceTypeInformation{ + pricing: prices{ 
+ onDemand: 30.3, + spot: spotPriceMap{ + "us-east-1a": 0.2, + }, + }, + }, + }, + }), + services: connections{ + ec2: mockEC2{ + diao: &ec2.DescribeInstanceAttributeOutput{}, + }, + }, + }, + } + + onDemandInstance := instance{ + Instance: &ec2.Instance{ + InstanceId: aws.String("i-ondemand"), + + Placement: &ec2.Placement{ + AvailabilityZone: aws.String("us-east-1a"), + }, + State: &ec2.InstanceState{ + Name: aws.String(ec2.InstanceStateNameRunning), + }, + }, + price: 30.3, + typeInfo: instanceTypeInformation{ + pricing: prices{ + onDemand: 30.3, + spot: spotPriceMap{ + "us-east-1a": 0.2, + }, + }, + }, + } + + asgLaunchSpotReplacement := autoScalingGroup{ + instances: makeInstancesWithCatalog(instanceMap{}), + Group: &autoscaling.Group{ + Instances: []*autoscaling.Instance{ + { + InstanceId: aws.String("i-ondemand"), + ProtectedFromScaleIn: aws.Bool(false), + }, + }, + }, + region: ®ion{ + conf: &Config{ + AutoScalingConfig: AutoScalingConfig{ + CronSchedule: "* *", + CronScheduleState: "on", + + MinOnDemandNumber: 0, + }, + LicenseType: "custom", + Version: "nightly", + }, + instances: makeInstancesWithCatalog( + instanceMap{ + "i-ondemand": &onDemandInstance, + }), + services: connections{ + ec2: mockEC2{ + diao: &ec2.DescribeInstanceAttributeOutput{}, + }, + }, + }, + } + + spotInstance := instance{ + Instance: &ec2.Instance{ + InstanceId: aws.String("i-spot"), + InstanceLifecycle: aws.String("spot"), + LaunchTime: aws.Time(time.Now().Add(-1 * time.Hour)), + + Placement: &ec2.Placement{ + AvailabilityZone: aws.String("us-east-1a"), + }, + State: &ec2.InstanceState{ + Name: aws.String(ec2.InstanceStateNameRunning), + }, + Tags: []*ec2.Tag{ + {Key: aws.String("launched-for-asg"), + Value: aws.String("asg-foo"), + }, + }, + }, + price: 30.3, + typeInfo: instanceTypeInformation{ + pricing: prices{ + onDemand: 30.3, + spot: spotPriceMap{ + "us-east-1a": 0.2, + }, + }, + }, + } + + asgExistingSpotReplacementButUnneeded := autoScalingGroup{ + name: "asg-foo", + 
instances: makeInstancesWithCatalog(instanceMap{}), + Group: &autoscaling.Group{ + Instances: []*autoscaling.Instance{ + { + InstanceId: aws.String("i-ondemand"), + ProtectedFromScaleIn: aws.Bool(false), + }, + }, + HealthCheckGracePeriod: aws.Int64(60), + }, + region: ®ion{ + conf: &Config{ + AutoScalingConfig: AutoScalingConfig{ + CronSchedule: "* *", + CronScheduleState: "on", + + MinOnDemandNumber: 1, + }, + LicenseType: "custom", + Version: "nightly", + }, + instances: makeInstancesWithCatalog( + instanceMap{ + "i-ondemand": &onDemandInstance, + "i-spot": &spotInstance, + }), + services: connections{ + ec2: mockEC2{ + diao: &ec2.DescribeInstanceAttributeOutput{}, + }, + }, + }, + } + + asgExistingSpotReplacementButNotReady := autoScalingGroup{ + name: "asg-foo", + instances: makeInstancesWithCatalog(instanceMap{}), + Group: &autoscaling.Group{ + Instances: []*autoscaling.Instance{ + { + InstanceId: aws.String("i-ondemand"), + ProtectedFromScaleIn: aws.Bool(false), + }, + }, + HealthCheckGracePeriod: aws.Int64(7200), + }, + region: ®ion{ + conf: &Config{ + AutoScalingConfig: AutoScalingConfig{ + CronSchedule: "* *", + CronScheduleState: "on", + MinOnDemandNumber: 0, + }, + LicenseType: "custom", + Version: "nightly", + }, + instances: makeInstancesWithCatalog( + instanceMap{ + "i-ondemand": &onDemandInstance, + "i-spot": &spotInstance, + }), + services: connections{ + ec2: mockEC2{ + diao: &ec2.DescribeInstanceAttributeOutput{}, + }, + }, + }, + } + + asgExistingSpotReplacementAndReady := autoScalingGroup{ + name: "asg-foo", + instances: makeInstancesWithCatalog(instanceMap{}), + Group: &autoscaling.Group{ + Instances: []*autoscaling.Instance{ + { + InstanceId: aws.String("i-ondemand"), + ProtectedFromScaleIn: aws.Bool(false), + }, + }, + HealthCheckGracePeriod: aws.Int64(60), + }, + region: ®ion{ + conf: &Config{ + AutoScalingConfig: AutoScalingConfig{ + CronSchedule: "* *", + CronScheduleState: "on", + MinOnDemandNumber: 0, + }, + LicenseType: "custom", + 
Version: "nightly", + }, + instances: makeInstancesWithCatalog( + instanceMap{ + "i-ondemand": &onDemandInstance, + "i-spot": &spotInstance, + }), + services: connections{ + ec2: mockEC2{ + diao: &ec2.DescribeInstanceAttributeOutput{}, + }, + }, + }, + } + tests := []struct { + name string + asg *autoScalingGroup + want runer + }{ + {name: "should not run", + + asg: &autoScalingGroup{ + Group: &autoscaling.Group{}, + region: ®ion{ + conf: &Config{ + AutoScalingConfig: AutoScalingConfig{ + CronSchedule: "* *", + CronScheduleState: "off", + }, + }, + instances: makeInstancesWithCatalog(instanceMap{}), + }, + }, + want: skipRun{reason: "outside-cron-schedule"}, + }, + + {name: "not licensed to run", + asg: &autoScalingGroup{ + Group: &autoscaling.Group{ + Instances: []*autoscaling.Instance{ + { + InstanceId: aws.String("i-spot"), + ProtectedFromScaleIn: aws.Bool(false), + }, + }, + }, + region: ®ion{ + conf: &Config{ + AutoScalingConfig: AutoScalingConfig{ + CronSchedule: "* *", + CronScheduleState: "on", + }, + LicenseType: "evaluation", + Version: "nightly", + }, + instances: makeInstancesWithCatalog( + instanceMap{ + "i-spot": { + Instance: &ec2.Instance{ + InstanceId: aws.String("i-spot"), + InstanceLifecycle: aws.String("spot"), + Placement: &ec2.Placement{ + AvailabilityZone: aws.String("us-east-1a"), + }, + State: &ec2.InstanceState{ + Name: aws.String(ec2.InstanceStateNameRunning), + }, + }, + price: 0.2, + typeInfo: instanceTypeInformation{ + pricing: prices{ + onDemand: 30.3, + spot: spotPriceMap{ + "us-east-1a": 0.2, + }, + }, + }, + }, + }), + services: connections{ + ec2: mockEC2{ + diao: &ec2.DescribeInstanceAttributeOutput{}, + }, + }, + }, + }, + want: skipRun{reason: "over-license"}, + }, + {name: "no instances, just enable the event handling", + asg: &asgNoInstances, + want: enableEventHandling{target{asg: &asgNoInstances}}, + }, + + {name: "not allowed to replace instances, terminate-spot-instance", + asg: &asgTerminateSpotInstance, + want: 
terminateSpotInstance{target{asg: &asgTerminateSpotInstance, totalInstances: 1}}, + }, + + {name: "allowed to replace instance, launch spot instance replacement", + asg: &asgLaunchSpotReplacement, + want: launchSpotReplacement{target{ + onDemandInstance: &onDemandInstance}, + }, + }, + + {name: "allowed to replace instance, spot instance replacement exists but not needed", + asg: &asgExistingSpotReplacementButUnneeded, + want: terminateUnneededSpotInstance{ + target{ + asg: &asgExistingSpotReplacementButUnneeded, + spotInstance: &spotInstance, + totalInstances: 1, + }, + }, + }, + + {name: "allowed to replace instance, spot instance replacement exists but not ready", + asg: &asgExistingSpotReplacementButNotReady, + want: skipRun{reason: "spot instance replacement exists but not ready"}, + }, + + {name: "allowed to replace instance, spot instance replacement exists and ready", + asg: &asgExistingSpotReplacementAndReady, + want: swapSpotInstance{target{ + asg: &asgExistingSpotReplacementAndReady, + spotInstance: &spotInstance, + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + as.hourlySavings = 0 + if got := tt.asg.cronEventAction(); !reflect.DeepEqual(got, tt.want) { + t.Errorf("autoScalingGroup.cronEventAction() \ngot %#v, \nwant %#v", got, tt.want) + } + as.hourlySavings = 0 + }) + } +} + +func Test_autoScalingGroup_enableForInstanceLaunchEventHandling(t *testing.T) { + type fields struct { + } + tests := []struct { + name string + Group *autoscaling.Group + asgName string + region *region + launchConfiguration *launchConfiguration + instances instances + minOnDemand int64 + config AutoScalingConfig + want bool + }{ + { + name: "tag already set", + want: true, + Group: &autoscaling.Group{ + Tags: []*autoscaling.TagDescription{ + { + Key: aws.String("foo"), + Value: aws.String("bar"), + }, + { + Key: aws.String(EnableInstanceLaunchEventHandlingTag), + Value: aws.String("true"), + }, + }, + }, + }, + + { + name: "tag not already 
set, API error", + want: false, + Group: &autoscaling.Group{ + AutoScalingGroupName: aws.String("foo"), + Tags: []*autoscaling.TagDescription{ + { + Key: aws.String("foo"), + Value: aws.String("bar"), + }, + }, + }, + region: ®ion{ + services: connections{ + autoScaling: mockASG{ + couterr: errors.New("API error"), + }, + }, + }, + }, + { + name: "tag not already set, API success", + want: true, + Group: &autoscaling.Group{ + AutoScalingGroupName: aws.String("foo"), + Tags: []*autoscaling.TagDescription{ + { + Key: aws.String("foo"), + Value: aws.String("bar"), + }, + }, + }, + region: ®ion{ + services: connections{ + autoScaling: mockASG{ + couterr: nil, + }, + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + a := &autoScalingGroup{ + Group: tt.Group, + name: tt.asgName, + region: tt.region, + launchConfiguration: tt.launchConfiguration, + instances: tt.instances, + minOnDemand: tt.minOnDemand, + config: tt.config, + } + if got := a.enableForInstanceLaunchEventHandling(); !reflect.DeepEqual(got, tt.want) { + t.Errorf("autoScalingGroup.enableForInstanceLaunchEventHandling() \ngot %#v, \nwant %#v", got, tt.want) + } + }) + } +} + +func Test_autoScalingGroup_isEnabledForEventBasedInstanceReplacement(t *testing.T) { + + tests := []struct { + name string + Group *autoscaling.Group + asgName string + region *region + launchConfiguration *launchConfiguration + instances instances + minOnDemand int64 + config AutoScalingConfig + want bool + }{ + { + name: "recently created ASG, tagging it works", + Group: &autoscaling.Group{ + CreatedTime: aws.Time(time.Now().Add(-10 * time.Minute)), + Tags: []*autoscaling.TagDescription{ + { + Key: aws.String("foo"), + Value: aws.String("bar"), + }, + }, + }, + region: ®ion{ + services: connections{ + autoScaling: mockASG{ + couterr: nil, + }, + }, + }, + want: true, + }, + + { + name: "older ASG, not already tagged", + Group: &autoscaling.Group{ + CreatedTime: aws.Time(time.Now().Add(-10 * 
time.Hour)), + Tags: []*autoscaling.TagDescription{ + { + Key: aws.String("foo"), + Value: aws.String("bar"), + }, + }, + }, + want: false, + }, { + name: "older ASG, already tagged", + Group: &autoscaling.Group{ + CreatedTime: aws.Time(time.Now().Add(-10 * time.Hour)), + Tags: []*autoscaling.TagDescription{ + { + Key: aws.String("foo"), + Value: aws.String("bar"), + }, + { + Key: aws.String(EnableInstanceLaunchEventHandlingTag), + Value: aws.String("true"), + }, + }, + }, + want: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + a := &autoScalingGroup{ + Group: tt.Group, + name: tt.asgName, + region: tt.region, + launchConfiguration: tt.launchConfiguration, + instances: tt.instances, + minOnDemand: tt.minOnDemand, + config: tt.config, + } + if got := a.isEnabledForEventBasedInstanceReplacement(); got != tt.want { + t.Errorf("autoScalingGroup.isEnabledForEventBasedInstanceReplacement() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/core/instance.go b/core/instance.go index 59349ab8..c8249a61 100644 --- a/core/instance.go +++ b/core/instance.go @@ -1,6 +1,7 @@ package autospotting import ( + "errors" "fmt" "math" "path/filepath" @@ -144,8 +145,9 @@ func (i *instance) isSpot() bool { *i.InstanceLifecycle == "spot" } -func (i *instance) isProtectedFromTermination() bool { +func (i *instance) isProtectedFromTermination() (bool, error) { + debug.Println("\tCheching termination protection for instance: ", *i.InstanceId) // determine and set the API termination protection field diaRes, err := i.region.services.ec2.DescribeInstanceAttribute( &ec2.DescribeInstanceAttributeInput{ @@ -153,15 +155,22 @@ func (i *instance) isProtectedFromTermination() bool { InstanceId: i.InstanceId, }) - if err == nil && + if err != nil { + // better safe than sorry! 
+ logger.Printf("Couldn't describe instance attritbutes, assuming instance %v is protected: %v\n", + *i.InstanceId, err.Error()) + return true, err + } + + if diaRes != nil && diaRes.DisableApiTermination != nil && diaRes.DisableApiTermination.Value != nil && *diaRes.DisableApiTermination.Value { logger.Printf("\t: %v Instance, %v is protected from termination\n", *i.Placement.AvailabilityZone, *i.InstanceId) - return true + return true, nil } - return false + return false, nil } func (i *instance) isProtectedFromScaleIn() bool { @@ -188,9 +197,10 @@ func (i *instance) canTerminate() bool { func (i *instance) terminate() error { var err error + logger.Printf("Instance: %v\n", i) - svc := i.region.services.ec2 logger.Printf("Terminating %v", *i.InstanceId) + svc := i.region.services.ec2 if !i.canTerminate() { logger.Printf("Can't terminate %v, current state: %s", @@ -210,11 +220,12 @@ func (i *instance) terminate() error { } func (i *instance) shouldBeReplacedWithSpot() bool { + protT, _ := i.isProtectedFromTermination() return i.belongsToEnabledASG() && i.asgNeedsReplacement() && !i.isSpot() && !i.isProtectedFromScaleIn() && - !i.isProtectedFromTermination() + !protT } func (i *instance) belongsToEnabledASG() bool { @@ -252,7 +263,8 @@ func (i *instance) belongsToAnASG() (bool, *string) { } func (i *instance) asgNeedsReplacement() bool { - return i.asg.needReplaceOnDemandInstances(true) + ret, _ := i.asg.needReplaceOnDemandInstances() + return ret } func (i *instance) isPriceCompatible(spotPrice float64) bool { @@ -504,7 +516,8 @@ func (i *instance) launchSpotReplacement() (*string, error) { } logger.Println(i.asg.name, "Exhausted all compatible instance types without launch success. 
Aborting.") - return nil, err + return nil, errors.New("exhausted all compatible instance types") + } func (i *instance) getPricetoBid( @@ -800,7 +813,9 @@ func (i *instance) swapWithGroupMember(asg *autoScalingGroup) (*instance, error) *odInstanceID) } - asg.suspendTerminations() + var waiter sync.WaitGroup + defer waiter.Wait() + go asg.temporarilySuspendTerminations(&waiter) max := *asg.MaxSize @@ -831,8 +846,6 @@ func (i *instance) swapWithGroupMember(asg *autoScalingGroup) (*instance, error) *odInstanceID) } - asg.resumeTerminations() - return odInstance, nil } diff --git a/core/main_test.go b/core/main_test.go index 94de16c3..be32c51f 100644 --- a/core/main_test.go +++ b/core/main_test.go @@ -2,6 +2,7 @@ package autospotting import ( "fmt" + "io" "io/ioutil" "log" "os" @@ -19,8 +20,16 @@ func TestMain(m *testing.M) { MainRegion: "us-east-1", }) - logger = log.New(ioutil.Discard, "", 0) - debug = log.New(ioutil.Discard, "", 0) + var logOutput io.Writer + + if os.Getenv("AUTOSPOTTING_DEBUG") == "true" { + logOutput = os.Stdout + } else { + logOutput = ioutil.Discard + } + + logger = log.New(logOutput, "", 0) + debug = log.New(logOutput, "", 0) os.Exit(m.Run()) } diff --git a/core/mock_test.go b/core/mock_test.go index f5e7af0b..13479be2 100644 --- a/core/mock_test.go +++ b/core/mock_test.go @@ -57,7 +57,7 @@ type mockEC2 struct { dltverr error // WaitUntilInstanceRunning error - // wuirerr error + wuirerr error } func (m mockEC2) DescribeSpotPriceHistory(in *ec2.DescribeSpotPriceHistoryInput) (*ec2.DescribeSpotPriceHistoryOutput, error) { @@ -89,6 +89,10 @@ func (m mockEC2) DescribeLaunchTemplateVersions(*ec2.DescribeLaunchTemplateVersi return m.dltvo, m.dltverr } +func (m mockEC2) WaitUntilInstanceRunning(*ec2.DescribeInstancesInput) error { + return m.wuirerr +} + // For testing we "convert" the SecurityGroupIDs/SecurityGroupNames by // prefixing the original name/id with "sg-" if not present already. 
We // also fill up the rest of the string to the length of a typical ID with @@ -161,6 +165,10 @@ type mockASG struct { // DescribeLifecycleHooks dlho *autoscaling.DescribeLifecycleHooksOutput dlherr error + + // CreateOrUpdateTags + couto *autoscaling.CreateOrUpdateTagsOutput + couterr error } func (m mockASG) DetachInstances(*autoscaling.DetachInstancesInput) (*autoscaling.DetachInstancesOutput, error) { @@ -201,6 +209,10 @@ func (m mockASG) DescribeLifecycleHooks(*autoscaling.DescribeLifecycleHooksInput return m.dlho, m.dlherr } +func (m mockASG) CreateOrUpdateTags(*autoscaling.CreateOrUpdateTagsInput) (*autoscaling.CreateOrUpdateTagsOutput, error) { + return m.couto, m.couterr +} + // All fields are composed of the abbreviation of their method // This is useful when methods are doing multiple calls to AWS API type mockCloudFormation struct { diff --git a/core/region.go b/core/region.go index dca6a072..e5a7e944 100644 --- a/core/region.go +++ b/core/region.go @@ -453,7 +453,8 @@ func (r *region) processEnabledAutoScalingGroups() { r.wg.Add(1) go func(a autoScalingGroup) { - a.processCronEvent() + action := a.cronEventAction() + action.run() r.wg.Done() }(asg) } diff --git a/go.mod b/go.mod index 9fa5bfa6..f48c30f8 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/AutoSpotting/AutoSpotting -go 1.12 +go 1.13 require ( github.com/aws/aws-lambda-go v1.11.1 From 1b4a83870e1bae3104ba3719e58711a4aaf22441 Mon Sep 17 00:00:00 2001 From: Cristian Magherusan-Stanciu Date: Sun, 1 Dec 2019 20:53:12 +0100 Subject: [PATCH 08/17] Suspend termination processes for 5 minutes --- go.mod | 3 ++- go.sum | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/go.mod b/go.mod index f48c30f8..c7a3dbf1 100644 --- a/go.mod +++ b/go.mod @@ -7,13 +7,14 @@ require ( github.com/aws/aws-sdk-go v1.20.15 github.com/cristim/ec2-instances-info v0.0.0-20190708120723-b53a9860c46d github.com/davecgh/go-spew v1.1.1 + github.com/mattn/goveralls v0.0.4 // indirect 
github.com/namsral/flag v0.0.0-20170814194028-67f268f20922 github.com/pkg/errors v0.8.1 // indirect github.com/robfig/cron v1.2.0 github.com/stretchr/objx v0.2.0 // indirect golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4 // indirect + golang.org/x/lint v0.0.0-20191125180803-fdd1cda4f05f // indirect golang.org/x/net v0.0.0-20190628185345-da137c7871d7 // indirect golang.org/x/sys v0.0.0-20190626221950-04f50cda93cb // indirect golang.org/x/text v0.3.2 // indirect - golang.org/x/tools v0.0.0-20190706070813-72ffa07ba3db // indirect ) diff --git a/go.sum b/go.sum index a71b7250..feac660a 100644 --- a/go.sum +++ b/go.sum @@ -16,6 +16,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/jmespath/go-jmespath v0.0.0-20180206201540-c2b33e8439af h1:pmfjZENx5imkbgOkpRUYLnmbU7UEFbjtDA2hxJ1ichM= github.com/jmespath/go-jmespath v0.0.0-20180206201540-c2b33e8439af/go.mod h1:Nht3zPeWKUH0NzdCt2Blrr5ys8VGpn0CEB0cQHVjt7k= +github.com/mattn/goveralls v0.0.4 h1:/mdWfiU2y8kZ48EtgByYev/XT3W4dkTuKLOJJsh/r+o= +github.com/mattn/goveralls v0.0.4/go.mod h1:8d1ZMHsd7fW6IRPKQh46F2WRpyib5/X4FOpevwGNQEw= github.com/namsral/flag v0.0.0-20170814194028-67f268f20922 h1:dRRQLGaXoPysHledlqbOa53vGxt0WjaVtdCexlWiRjA= github.com/namsral/flag v0.0.0-20170814194028-67f268f20922/go.mod h1:OXldTctbM6SWH1K899kPZcf65KxJiD7MsceFUpB5yDo= github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I= @@ -33,6 +35,8 @@ github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0 github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/lint v0.0.0-20191125180803-fdd1cda4f05f 
h1:J5lckAjkw6qYlOZNj90mLYNTEKDvWeuc1yieZ8qUzUE= +golang.org/x/lint v0.0.0-20191125180803-fdd1cda4f05f/go.mod h1:5qLYkcX4OjUUV8bRuDixDT3tpyyb+LUpUlRWLxfhWrs= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190424112056-4829fb13d2c6 h1:FP8hkuE6yUEaJnK7O2eTuejKWwW+Rhfj80dQ2JcKxCU= golang.org/x/net v0.0.0-20190424112056-4829fb13d2c6/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= @@ -47,5 +51,9 @@ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190706070813-72ffa07ba3db h1:9hRk1xeL9LTT3yX/941DqeBz87XgHAQuj+TbimYJuiw= golang.org/x/tools v0.0.0-20190706070813-72ffa07ba3db/go.mod h1:jcCCGcm9btYwXyDqrUWc6MKQKKGJCWEQ3AfLSRIbEuI= +golang.org/x/tools v0.0.0-20191125144606-a911d9008d1f h1:kDxGY2VmgABOe55qheT/TFqUMtcTHnomIPS1iv3G4Ms= +golang.org/x/tools v0.0.0-20191125144606-a911d9008d1f/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gopkg.in/urfave/cli.v1 v1.20.0/go.mod h1:vuBzUtMdQeixQj8LVd+/98pzhxNGQoyuPBlsXHOQNO0= From 5e035eebec6864eb86ebb99cb5f588de38129afd Mon Sep 17 00:00:00 2001 From: mello7tre Date: Thu, 23 Jan 2020 18:44:17 +0100 Subject: [PATCH 09/17] Feat/event based instance replacement (#371) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Allow flavor to be customised (#359) * fix typo (#360) * Fix typo (#361) * UserData wrappers for CloudFormation helper scripts when using Beanstalk (#366) * Support custom role for cfn-init in Beanstalk UserData * Wrappers & refactoring * Docs * Docs fixes * 
More docs fixes * Docs fixes * Yet more docs fixes * Lambda & Kubernetes config * AutoSpottingElasticBeanstalk managed policy + Rename beanstalk_cfn_init_role to beanstalk_cfn_wrappers * Update param description * Kubernetes config fix * Rename Beanstalk variable + Move test data * Add missing permission for AutoSpottingBeanstalk role * Bring Go to 1.13 (#367) * merge fixes * Begin - port regional CloudFormation stack changes to the main template so it can also run from a StackSet * Merge fix * Fix to template * Multiple fixes to handle lambda concurrency As instances can be launched in concurrency/overlapping we have the problems related to multiple lambdas acting on the same ASG * Removed commented code * Begin working on: https://github.com/AutoSpotting/AutoSpotting/pull/354#issuecomment-558796054 * Progress #354 Created Queue in CF template Begin sending message * Progress #354 * Progress #354 * Progress #354 * Progress #354 * Progress #354 * * Use inline python lambda to increase ASG Use it only if AttachInstances method fails for wrong ASG max size. * * Use inline python lambda to increase ASG progress * * Use inline python lambda to increase ASG progress * * Improvements on event based instance replacement - Ported to StackSet deploy mode. - Fix to "ScalingActivityInProgress" if launched ondemand instance is terminated before going "inservice": Created method "waitForInstanceStatus", that waits, until a max retry (5), for an instance belonging to an ASG to be in the desired status (InService). (Sleep time is 5*retry) - Fix to multiple problems caused by lambda concurrency changing ASG MaxSize: Created another Lambda (LambdaManageASG) in the same region of the main one; code python3.7, inline in template. Lambda concurrency is set to one. Its function is to change ASG MaxSize by the amount specified. 
Used a "try-catch approach": if AttachInstances returns error code "ValidationError" and the string "update the AutoScalingGroup sizes" is present in the error message, it means that we need to increase the ASG MaxSize. So we execute method changeAutoScalingMaxSize that invokes LambdaManageASG. If the invoke returns error "ErrCodeTooManyRequestsException", it means that multiple Main lambdas are executing LambdaManageASG, so we sleep for a random sub-second interval and retry. Method attachSpotInstance now returns an int that represents the amount of change to the ASG MaxSize, so that in swapWithGroupMember we can defer the call to changeAutoScalingMaxSize to decrease the ASG MaxSize by the previously increased amount. We use "waitForInstanceStatus" in attachSpotInstance before returning to be sure that the spot instance has been attached and is InService before beginning to terminate the ondemand one. * * Improvements on event based instance replacement - Add logPrefix to better identify lambda actions in case of concurrent executions. TE = Spot terminate event ST = Instance start event SC = Schedule event TE and ST are followed by instanceID, SC is followed by activation time. * * Improvements on event based instance replacement - fix/improvement on Suspending/Resuming Termination process: suspend/resume is now handled by LambdaManageASG too. When I successfully suspend termination, I add a tag "autospotting_suspend_process_by" with a value equal to the instanceId of the event that triggered the main lambda. When I try to resume termination I first check if the value of the tag above equals the current one. If not, it means that another main lambda, different from the one who suspended it, is trying to resume the process; in that case I do not resume it. Considering that LambdaManageASG has a concurrent execution limit set to 1 we can have the following cases: 1) *) A main lambda suspends the process. *) No other lambda suspends it. *) Main lambda resumes the process. 
*) Another main lambda suspends the process *) ...and so on 2) *) A main lambda suspends the process. *) Before the first one resumes the process another one suspends it (so replacing the tag value) *) First lambda does not resume the process (tag value differs) *) Second lambda resumes the process In the worst scenario of a Lambda dying before resuming the process, it will be resumed after another one suspends it. * * Improvements on event based instance replacement code cosmetic fix * * Improvements on event based instance replacement - for the rand seed, instead of using time.Now().UnixNano() we build a seed based on the instanceId that triggered the event. The seed is built this way: for every char of the instanceId (starting from the third char) we get its rune "representation" and sum it to the previous one. We use it as a temporary seed and get a random number between 0 and 9. The final seed is the concatenation of the generated random numbers. This way we have a seed number (int64) that depends on the instanceId and has the same length. * * Improvements on event based instance replacement - no more need to "revert attach/detach order when running on minimum capacity". Defer changeAutoScalingMaxSize and use the same logic as swapWithGroupMember. * * Improvements on event based instance replacement - Use suspendResumeProcess for the scheduled replaceOnDemandInstanceWithSpot too. We use it as a trick to avoid rare cases of concurrency between scheduled and event lambdas. As the lambda that handles "suspendResumeProcess" has a concurrency limit of one, scheduled and event lambdas, if concurrent, will be "time shifted" by a random value. This way they will not execute attachSpotInstance at the same time. In case of a scheduled lambda we add the "S" char to the instanceId used for the randSeed to avoid that it resumes a process suspended by an event lambda. 
- Fix in swapWithGroupMember: defer asg.suspendResumeProcess for resume so that it will be executed even if function swapWithGroupMember return error. * * gofmt cosmetic changes * Fix to rand.Intn parameter to include even 9 Co-authored-by: Chris Farrenden Co-authored-by: Rubi <14269809+codenoid@users.noreply.github.com> Co-authored-by: Jawad Co-authored-by: Gábor Lipták --- Dockerfile | 2 +- Dockerfile-build | 2 +- START.md | 58 ++- autospotting.go | 1 + .../stacks/AutoSpotting/template.yaml | 362 +++++++++++++++++- core/autoscaling.go | 242 ++++++++++-- core/autoscaling_configuration.go | 20 + core/autoscaling_configuration_test.go | 64 ++++ core/beanstalk.go | 80 ++++ core/beanstalk_test.go | 96 +++++ core/config.go | 8 + core/connections.go | 7 +- core/instance.go | 34 +- core/instance_test.go | 140 +++++++ core/main.go | 25 +- core/region.go | 3 +- kubernetes/autospotting-cron.yaml.example | 2 + test_data/beanstalk_userdata_example.txt | 54 +++ .../beanstalk_userdata_wrapped_example.txt | 80 ++++ 19 files changed, 1205 insertions(+), 75 deletions(-) create mode 100644 core/beanstalk.go create mode 100644 core/beanstalk_test.go create mode 100644 test_data/beanstalk_userdata_example.txt create mode 100644 test_data/beanstalk_userdata_wrapped_example.txt diff --git a/Dockerfile b/Dockerfile index 3060c2b3..eba57d8f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM golang:1.12-alpine as golang +FROM golang:1.13-alpine as golang RUN apk add -U --no-cache ca-certificates git make COPY . /src WORKDIR /src diff --git a/Dockerfile-build b/Dockerfile-build index 0626b449..09f66fad 100644 --- a/Dockerfile-build +++ b/Dockerfile-build @@ -1,4 +1,4 @@ -FROM golang:1.12-alpine +FROM golang:1.13-alpine RUN apk add -U --no-cache ca-certificates git make zip COPY . /src diff --git a/START.md b/START.md index 6c07f4eb..f372e893 100644 --- a/START.md +++ b/START.md @@ -189,10 +189,53 @@ be risky, please handle with care. 
### For Elastic Beanstalk ### -* In order to add tags to existing Elastic Beanstalk environment, you will need - to rebuild or update the environment with the `spot-enabled` tag. For more - details you can follow this - [guide](http://www.boringgeek.com/add-or-update-tags-on-existing-elastic-beanstalk-environments) +Elastic Beanstalk uses CloudFormation to create an Auto-Scaling Group. The ASG +is then in charge of automatically scaling your application up and down. As a +result, AutoSpotting works natively with Elastic Beanstalk. + +Follow these steps to configure AutoSpotting with Elastic Beanstalk. + +#### 1 - Add the `spot-enabled` tag #### + +Similar to standalone auto-scaling groups, you need to tag your Elastic Beanstalk +environment with the `spot-enabled` tag to let AutoSpotting manage the instances +in the group. + +To add tags to an existing Elastic Beanstalk environment, you will need to rebuild +or update the environment with the `spot-enabled` tag. For more details you can +follow this [guide](http://www.boringgeek.com/add-or-update-tags-on-existing-elastic-beanstalk-environments). + +#### 2 - Enable `patch_beanstalk_userdata` in AutoSpotting (optional) #### + +Elastic Beanstalk leverages CloudFormation for creating resources and initializing +instances. When a new instance is launched, Elastic Beanstalk configures it through +the auto-scaling configuration (`UserData` and tags). + +AutoSpotting launches spot instances outside of the auto-scaling group and attaches +them to the group after a grace period. As a result, the Elastic Beanstalk +initialization process can randomly fail or be delayed by 10+ minutes. +When it is delayed, the spot instances take a long time (10+ minutes) before being +initialized, appearing as healthy in Elastic Beanstalk and being added +to the load balancer. 
+ +As a solution, you can configure AutoSpotting to alter the Elastic Beanstalk +user-data so that the Elastic Beanstalk initialization process can run even +if the spot instance is not a part of the auto-scaling group. + +To enable that option, set the `patch_beanstalk_userdata` variable to `true` +in your configuration. + +You will also need to update the permissions of the role used by your instances +to authorize requests to the CloudFormation API. Add the `AutoSpottingElasticBeanstalk` +policy to the role `aws-elasticbeanstalk-ec2-role` or the custom instance profile/role +used by your Beanstalk instances. + +The permissions contained in `AutoSpottingElasticBeanstalk` are required if you set +`patch_beanstalk_userdata` variable to `true`. If they are not added, your spot +instances will not be able to run correctly. + +You can get more information on the need for this configuration variable and +the permissions in the [bug report](https://github.com/AutoSpotting/AutoSpotting/issues/344). ## Configuration of AutoSpotting ## @@ -258,6 +301,13 @@ Usage of ./AutoSpotting: -tag_filters=[{spot-enabled true}]: Set of tags to filter the ASGs on. 
Default is -tag_filters 'spot-enabled=true' Example: ./AutoSpotting -tag_filters 'spot-enabled=true,Environment=dev,Team=vision' + + -patch_beanstalk_userdata="true": + Controls whether AutoSpotting patches Elastic Beanstalk UserData + scripts to use the instance role when calling CloudFormation + helpers instead of the standard CloudFormation authentication + method + Example: ./AutoSpotting --patch_beanstalk_userdata true ``` diff --git a/autospotting.go b/autospotting.go index 7256a5ff..754acc9b 100644 --- a/autospotting.go +++ b/autospotting.go @@ -67,6 +67,7 @@ func init() { SleepMultiplier: 1, Version: Version, LicenseType: os.Getenv("LICENSE"), + LambdaManageASG: os.Getenv("LAMBDA_MANAGE_ASG"), } parseCommandLineFlags() diff --git a/cloudformation/stacks/AutoSpotting/template.yaml b/cloudformation/stacks/AutoSpotting/template.yaml index faf52de1..ad8cfebb 100644 --- a/cloudformation/stacks/AutoSpotting/template.yaml +++ b/cloudformation/stacks/AutoSpotting/template.yaml @@ -254,6 +254,19 @@ binaries are restricted to up to $1000 in monthly savings, the others are not restricted" Type: "String" + PatchBeanstalkUserdata: + Default: "false" + AllowedValues: + - "false" + - "true" + Description: > + "Controls whether AutoSpotting patches Elastic Beanstalk UserData + scripts to use the instance role when calling CloudFormation helpers + instead of the standard CloudFormation authentication method. 
+ After creating this CloudFormation stack, you must add the + AutoSpottingElasticBeanstalk managed policy to your Beanstalk + instance profile/role if you turn this option to On" + Type: "String" Conditions: StackSetsFalse: !Equals - !Ref DeployUsingStackSets @@ -289,7 +302,21 @@ - "LambdaRegionalExecutionRole" - "Arn" Resources: - AutoSpotTeminationEventRule: + SpotTerminationLambdaPermission: + Condition: "StackSetsTrue" + Type: "AWS::Lambda::Permission" + Properties: + Action: "lambda:InvokeFunction" + FunctionName: !If + - "StackSetsIsRegional" + - Ref: "EventHandler" + - Ref: "LambdaFunction" + Principal: "events.amazonaws.com" + SourceArn: + Fn::GetAtt: + - "SpotTeminationEventRule" + - "Arn" + SpotTeminationEventRule: Condition: "StackSetsTrue" Type: "AWS::Events::Rule" Properties: @@ -302,7 +329,148 @@ State: "ENABLED" Targets: - - Id: "AutoSpottingTerminationEventGenerator" + Id: "SpotTerminationEventGenerator" + InputTransformer: !If + - "StackSetsIsRegional" + - + InputPathsMap: { + "id": "$.id", + "detail-type": "$.detail-type", + "source": "$.source", + "account": "$.account", + "time": "$.time", + "region": "$.region", + "resources": "$.resources", + "detail": "$.detail" + } + InputTemplate: !Join ["", + [ + "{", + "\"id\": ,", + "\"detail-type\": ,", + "\"source\": ,", + "\"account\": ,", + "\"time\":