Commit c0ea41a

Add pod draining lifecycle hook and lambda, update component to use lib-iam (#7)

* Update to use lib-iam

* Add pod draining on node termination via lifecycle hook and lambda

* Make a copy of the tags array when adding the PropagateAtLaunch value; otherwise the original tags are mutated and objects built from them earlier break (see the sketch after this list)

* Adhere to AWS convention

* Fix issue with config values

* Output role ARNs

* Add log permissions for lambda

* Add correct handler and add missing env var

* Correct syntax
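
(Not part of the commit.) A minimal Python sketch of the shallow-copy pitfall behind the tags fix above; the component itself is Ruby, where the actual fix is the tags.clone.map(&:clone) line in eks-cluster.cfndsl.rb, and the tag values here are hypothetical:

    # Buggy: list(base_tags) copies the list but shares the same dict objects,
    # so tagging the ASG copy also mutates the original tag entries.
    base_tags = [{'Key': 'Environment', 'Value': 'dev'}]
    asg_tags = list(base_tags)
    for tag in asg_tags:
        tag['PropagateAtLaunch'] = False
    assert 'PropagateAtLaunch' in base_tags[0]  # original was mutated

    # Fixed: copy each tag dict before modifying, mirroring tags.clone.map(&:clone).
    base_tags = [{'Key': 'Environment', 'Value': 'dev'}]
    asg_tags = [dict(tag) for tag in base_tags]
    for tag in asg_tags:
        tag['PropagateAtLaunch'] = False
    assert 'PropagateAtLaunch' not in base_tags[0]  # original untouched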
Samseppiol authored Sep 2, 2020
1 parent b3d088b commit c0ea41a
Showing 9 changed files with 393 additions and 11 deletions.
3 changes: 3 additions & 0 deletions eks-cluster.cfhighlander.rb
@@ -1,5 +1,6 @@
CfhighlanderTemplate do
  Name 'eks-cluster'
  DependsOn 'lib-iam'
  Description "eks-cluster - #{component_version}"

  Parameters do
@@ -20,4 +21,6 @@
    ComponentParam 'MaxSize', '2'
  end

  LambdaFunctions 'draining_lambda'

end
46 changes: 35 additions & 11 deletions eks-cluster.cfndsl.rb
@@ -8,14 +8,35 @@
  extra_tags.each { |key,value| tags << { Key: FnSub(key), Value: FnSub(value) } }

  IAM_Role(:EksClusterRole) {
    AssumeRolePolicyDocument service_role_assume_policy('eks')
    AssumeRolePolicyDocument service_assume_role_policy('eks')
    Path '/'
    ManagedPolicyArns([
      'arn:aws:iam::aws:policy/AmazonEKSServicePolicy',
      'arn:aws:iam::aws:policy/AmazonEKSClusterPolicy'
    ])
  }

  AutoScaling_LifecycleHook(:DrainingLifecycleHook) {
    AutoScalingGroupName Ref('EksNodeAutoScalingGroup')
    HeartbeatTimeout 450
    LifecycleTransition 'autoscaling:EC2_INSTANCE_TERMINATING'
  }

  Lambda_Permission(:DrainingLambdaPermission) {
    Action 'lambda:InvokeFunction'
    FunctionName FnGetAtt('Drainer', 'Arn')
    Principal 'events.amazonaws.com'
    SourceArn FnGetAtt('LifecycleEvent', 'Arn')
  }

  draining_lambda = external_parameters[:draining_lambda]
  Events_Rule(:LifecycleEvent) {
    Description FnSub("Rule for ${EnvironmentName} eks draining lifecycle hook")
    State 'ENABLED'
    EventPattern draining_lambda['event']['pattern']
    Targets draining_lambda['event']['targets']
  }

  EC2_SecurityGroup(:EksClusterSecurityGroup) {
    VpcId Ref('VPCId')
    GroupDescription "EKS Cluster communication with worker nodes"
@@ -117,17 +138,12 @@
    Version eks_version unless eks_version.nil?
  }

  policies = []
  iam = external_parameters[:iam]
  iam['policies'].each do |name,policy|
    policies << iam_policy_allow(name,policy['action'],policy['resource'] || '*')
  end if iam.has_key?('policies')

  IAM_Role(:EksNodeRole) {
    AssumeRolePolicyDocument service_role_assume_policy(iam['services'])
    AssumeRolePolicyDocument service_assume_role_policy(iam['services'])
    Path '/'
    ManagedPolicyArns(iam['managed_policies']) if iam.has_key?('managed_policies')
    Policies(policies) if policies.any?
    ManagedPolicyArns(iam['managed_policies'])
    Policies(iam_role_policies(iam['policies'])) if iam.has_key?('policies')
  }

  IAM_InstanceProfile(:EksNodeInstanceProfile) do
@@ -178,12 +194,12 @@
    LaunchTemplateData(template_data)
  }

  asg_tags = [
    { Key: FnSub("k8s.io/cluster/${EksCluster}"), Value: 'owned' },
    { Key: 'k8s.io/cluster-autoscaler/enabled', Value: Ref('EnableScaling') }
  ]
  asg_tags += tags
  asg_tags.each {|tag| tag[:PropagateAtLaunch] = false }
  asg_tags = tags.clone.map(&:clone).concat(asg_tags).uniq.each {|tag| tag[:PropagateAtLaunch] = false }
  AutoScaling_AutoScalingGroup(:EksNodeAutoScalingGroup) {
    UpdatePolicy(:AutoScalingRollingUpdate, {
      MaxBatchSize: '1',
@@ -211,4 +227,12 @@
    Value(Ref(:EksCluster))
  }

  Output(:DrainingLambdaRole) {
    Value(FnGetAtt(:LambdaRoleDraining, :Arn))
  }

  Output(:EksNodeRole) {
    Value(FnGetAtt(:EksNodeRole, :Arn))
  }

end
41 changes: 41 additions & 0 deletions eks-cluster.config.yaml
@@ -2,6 +2,47 @@

detailed_monitoring: false

draining_lambda:
  custom_policies:
    DrainerPolicies:
      action:
        - autoscaling:CompleteLifecycleAction
        - ec2:DescribeInstances
        - eks:DescribeCluster
        - sts:GetCallerIdentity
  roles:
    Draining:
      policies_inline:
        - DrainerPolicies
        - cloudwatch-logs
  functions:
    Drainer:
      code: drainer
      handler: handler.lambda_handler
      runtime: python3.7
      timeout: 300
      role: Draining
      package_cmd: 'pip install -r requirements.txt -t .'
      environment:
        CLUSTER_NAME:
          Ref: EksCluster
  event:
    pattern:
      source:
        - "aws.autoscaling"
      detail-type:
        - "EC2 Instance-terminate Lifecycle Action"
      detail:
        AutoScalingGroupName:
          - Fn::Sub: '${EksNodeAutoScalingGroup}'
    targets:
      -
        Arn:
          Fn::GetAtt:
            - "Drainer"
            - "Arn"
        Id: "EksDrainerFunction"

iam:
  services:
    - ec2
4 changes: 4 additions & 0 deletions lambdas/drainer/__init__.py
@@ -0,0 +1,4 @@
import os
import sys

sys.path.append(os.path.dirname(os.path.realpath(__file__)))
168 changes: 168 additions & 0 deletions lambdas/drainer/handler.py
@@ -0,0 +1,168 @@
import boto3
import base64
import logging
import os.path
import re
import yaml

from botocore.signers import RequestSigner
import kubernetes as k8s
from kubernetes.client.rest import ApiException

from k8s_utils import (abandon_lifecycle_action, cordon_node, node_exists, remove_all_pods)

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

KUBE_FILEPATH = '/tmp/kubeconfig'
REGION = os.environ['AWS_REGION']

eks = boto3.client('eks', region_name=REGION)
ec2 = boto3.client('ec2', region_name=REGION)
asg = boto3.client('autoscaling', region_name=REGION)
s3 = boto3.client('s3', region_name=REGION)


def create_kube_config(eks, cluster_name):
    """Creates the Kubernetes config file required when instantiating the API client."""
    cluster_info = eks.describe_cluster(name=cluster_name)['cluster']
    certificate = cluster_info['certificateAuthority']['data']
    endpoint = cluster_info['endpoint']

    kube_config = {
        'apiVersion': 'v1',
        'clusters': [
            {
                'cluster':
                    {
                        'server': endpoint,
                        'certificate-authority-data': certificate
                    },
                'name': 'k8s'
            }],
        'contexts': [
            {
                'context':
                    {
                        'cluster': 'k8s',
                        'user': 'aws'
                    },
                'name': 'aws'
            }],
        'current-context': 'aws',
        'Kind': 'config',
        'users': [
            {
                'name': 'aws',
                'user': 'lambda'
            }]
    }

    with open(KUBE_FILEPATH, 'w') as f:
        yaml.dump(kube_config, f, default_flow_style=False)


def get_bearer_token(cluster, region):
    """Creates the authentication token required by the AWS IAM Authenticator. This is
    done by creating a base64-encoded string which represents an HTTP call to the STS
    GetCallerIdentity Query Request (https://docs.aws.amazon.com/STS/latest/APIReference/API_GetCallerIdentity.html).
    The AWS IAM Authenticator decodes the base64 string and makes the request on behalf of the user.
    """
    STS_TOKEN_EXPIRES_IN = 60
    session = boto3.session.Session()

    client = session.client('sts', region_name=region)
    service_id = client.meta.service_model.service_id

    signer = RequestSigner(
        service_id,
        region,
        'sts',
        'v4',
        session.get_credentials(),
        session.events
    )

    params = {
        'method': 'GET',
        'url': 'https://sts.{}.amazonaws.com/?Action=GetCallerIdentity&Version=2011-06-15'.format(region),
        'body': {},
        'headers': {
            'x-k8s-aws-id': cluster
        },
        'context': {}
    }

    signed_url = signer.generate_presigned_url(
        params,
        region_name=region,
        expires_in=STS_TOKEN_EXPIRES_IN,
        operation_name=''
    )

    base64_url = base64.urlsafe_b64encode(signed_url.encode('utf-8')).decode('utf-8')

    # need to remove base64 encoding padding:
    # https://github.com/kubernetes-sigs/aws-iam-authenticator/issues/202
    return 'k8s-aws-v1.' + re.sub(r'=*', '', base64_url)


def _lambda_handler(env, k8s_config, k8s_client, event):
    kube_config_bucket = env['kube_config_bucket']
    cluster_name = env['cluster_name']

    if not os.path.exists(KUBE_FILEPATH):
        if kube_config_bucket:
            logger.info('No kubeconfig file found. Downloading...')
            s3.download_file(kube_config_bucket, env['kube_config_object'], KUBE_FILEPATH)
        else:
            logger.info('No kubeconfig file found. Generating...')
            create_kube_config(eks, cluster_name)

    lifecycle_hook_name = event['detail']['LifecycleHookName']
    auto_scaling_group_name = event['detail']['AutoScalingGroupName']

    instance_id = event['detail']['EC2InstanceId']
    logger.info('Instance ID: ' + instance_id)
    instance = ec2.describe_instances(InstanceIds=[instance_id])['Reservations'][0]['Instances'][0]

    node_name = instance['PrivateDnsName']
    logger.info('Node name: ' + node_name)

    # Configure
    k8s_config.load_kube_config(KUBE_FILEPATH)
    configuration = k8s_client.Configuration()
    if not kube_config_bucket:
        configuration.api_key['authorization'] = get_bearer_token(cluster_name, REGION)
        configuration.api_key_prefix['authorization'] = 'Bearer'
    # API
    api = k8s_client.ApiClient(configuration)
    v1 = k8s_client.CoreV1Api(api)

    try:
        if not node_exists(v1, node_name):
            logger.error('Node not found.')
            abandon_lifecycle_action(asg, auto_scaling_group_name, lifecycle_hook_name, instance_id)
            return

        cordon_node(v1, node_name)

        remove_all_pods(v1, node_name)

        asg.complete_lifecycle_action(LifecycleHookName=lifecycle_hook_name,
                                      AutoScalingGroupName=auto_scaling_group_name,
                                      LifecycleActionResult='CONTINUE',
                                      InstanceId=instance_id)
    except ApiException:
        logger.exception('There was an error removing the pods from the node {}'.format(node_name))
        abandon_lifecycle_action(asg, auto_scaling_group_name, lifecycle_hook_name, instance_id)


def lambda_handler(event, _):
    env = {
        'cluster_name': os.environ.get('CLUSTER_NAME'),
        'kube_config_bucket': os.environ.get('KUBE_CONFIG_BUCKET'),
        'kube_config_object': os.environ.get('KUBE_CONFIG_OBJECT')
    }
    return _lambda_handler(env, k8s.config, k8s.client, event)
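
(Not part of the commit.) A rough sketch of how the drainer could be exercised locally; the event shape follows the "EC2 Instance-terminate Lifecycle Action" pattern wired up in eks-cluster.config.yaml, and every region, cluster, hook, ASG, and instance name below is hypothetical:

    import os

    # handler.py reads these at import time, so set them before importing it.
    os.environ['AWS_REGION'] = 'us-east-1'          # hypothetical region
    os.environ['CLUSTER_NAME'] = 'my-eks-cluster'   # hypothetical cluster name

    import handler  # lambdas/drainer/handler.py

    sample_event = {
        'source': 'aws.autoscaling',
        'detail-type': 'EC2 Instance-terminate Lifecycle Action',
        'detail': {
            'LifecycleHookName': 'eks-drainer-hook',     # hypothetical hook name
            'AutoScalingGroupName': 'my-eks-node-asg',   # hypothetical ASG name
            'EC2InstanceId': 'i-0123456789abcdef0'       # hypothetical instance ID
        }
    }

    # Calls EKS, EC2, ASG and the Kubernetes API, so it only completes with valid
    # AWS credentials and a reachable cluster:
    # handler.lambda_handler(sample_event, None)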
8 changes: 8 additions & 0 deletions lambdas/drainer/k8s_rbac/cluster_role.yaml
@@ -0,0 +1,8 @@
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: lambda-cluster-access
rules:
- apiGroups: [""]
  resources: ["pods", "pods/eviction", "nodes"]
  verbs: ["create", "list", "patch"]
12 changes: 12 additions & 0 deletions lambdas/drainer/k8s_rbac/cluster_rolebinding.yaml
@@ -0,0 +1,12 @@
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: lambda-user-cluster-role-binding
subjects:
- kind: User
  name: lambda
  apiGroup: rbac.authorization.k8s.io
roleRef:
  kind: ClusterRole
  name: lambda-cluster-access
  apiGroup: rbac.authorization.k8s.io