Fabric

Fabric is a Python [1] library that provides access to the command line. At Vokal it is used for a number of automated tasks, most notably in our CI environment, where we use it to update staging instances after merging a branch into master. Historically, we have used Fabric to update the existing Docker image on an AWS instance, then restart the relevant upstart service and run any necessary migrations. While this works fine, it does introduce downtime, which can be irritating for client developers. In an effort to minimize downtime, I've provided an updated fabfile, shown below:

import os
import time
from datetime import date
from base64 import b64decode
from fabric.api import *
from fabric.operations import *
import requests
from boto import ec2


def staging():
    """Point Fabric at the staging machine.

    Sets ``env.hosts`` to a one-element list holding ``env.STAGING_IP``,
    the login user to ``ubuntu`` and ``env.branch`` to ``master``.
    """
    staging_hosts = [env.STAGING_IP]
    env.hosts = staging_hosts
    env.user = "ubuntu"
    env.branch = "master"


def get_git_hash():
    """Return the captured output of ``git rev-parse --short HEAD``.

    The command is run on the local machine through Fabric's ``local`` with
    ``capture=True``.
    """
    short_hash = local('git rev-parse --short HEAD', capture=True)
    return short_hash


def create_image(conn, instance_id):
    """Create an AMI from ``instance_id`` and return the created image's id.

    The AMI is named ``<IMAGE_TAG>-<today, ISO format>-<short git hash>-<instance_id>``
    and the request is made with ``no_reboot=False``. The value returned by
    ``conn.create_image`` is printed and handed back to the caller.
    """
    name_parts = (
        env.IMAGE_TAG,
        date.today().isoformat(),
        get_git_hash(),
        instance_id,
    )
    ami_name = '{0}-{1}-{2}-{3}'.format(*name_parts)

    new_image_id = conn.create_image(instance_id, ami_name, no_reboot=False)
    print('Created AMI {0}'.format(new_image_id))
    return new_image_id


def get_instance(conn):
    """Return the instance that ``env.STAGING_IP`` is currently attached to.

    Looks the address up with ``conn.get_all_addresses`` and then fetches the
    instance it points at with ``conn.get_all_instances``.

    Raises AssertionError unless exactly one address, one reservation and one
    instance come back.
    """
    addresses = conn.get_all_addresses([env.STAGING_IP])
    assert len(addresses) == 1
    staging_address = addresses[0]

    reservations = conn.get_all_instances([staging_address.instance_id])
    assert len(reservations) == 1

    reservation = reservations[0]
    assert len(reservation.instances) == 1
    return reservation.instances[0]


def _wait_while_pending(resource, progress_fmt, poll_seconds=5):
    """Block until ``resource.state`` is no longer ``'pending'``.

    Each round prints ``progress_fmt`` formatted with the current state,
    refreshes the resource with ``resource.update()`` and then sleeps
    ``poll_seconds``. There is no upper bound on the number of rounds.
    """
    while resource.state == 'pending':
        print(progress_fmt.format(resource.state))
        resource.update()
        time.sleep(poll_seconds)


def _wait_for_ssh(host_string, attempts=12, delay_seconds=10):
    """Return True once ``sudo('echo')`` runs on ``host_string`` without raising.

    Tries up to ``attempts`` times, sleeping ``delay_seconds`` after every
    failed try, and returns False when all of them raised.
    """
    with settings(host_string=host_string, warn_only=True):
        for _ in range(attempts):
            try:
                sudo('echo')
            except (Exception, SystemExit):
                # A bare ``except:`` here would also swallow KeyboardInterrupt
                # and make the deploy impossible to cancel with Ctrl-C.
                # SystemExit stays in the list so that a Fabric abort (which
                # exits via SystemExit) keeps being retried.
                print('instance security group: pending')
                time.sleep(delay_seconds)
            else:
                return True
    return False


def copy_instance(conn, inst):
    """Launch a new instance from an AMI of ``inst`` and return it once reachable.

    Steps:
      1. Look for an AMI whose name ends in ``-<inst.id>``; create one with
         ``create_image`` when there is none.
      2. Wait for the AMI to leave the ``pending`` state.
      3. Start an instance from it, reusing the security groups, user data,
         instance type and key name of ``inst``.
      4. Wait for the instance to leave ``pending``, then retry an SSH command
         until it succeeds (up to 12 tries, 10 seconds apart).

    Parameters:
        conn: EC2 connection used for every AWS call.
        inst: the instance to copy.

    Returns:
        The newly started instance.

    Raises:
        AssertionError: more than one matching AMI exists, the AMI does not
            become ``available``, the reservation does not hold exactly one
            instance, the instance does not reach ``running``, or SSH never
            succeeds.
    """
    # check if an AMI for this machine exists.
    matching_images = conn.get_all_images(
        filters={'name': '*-{}'.format(inst.id)})

    # if it doesn't, create an image for this machine.
    if not matching_images:
        img_id = create_image(conn, inst.id)
    else:
        assert len(matching_images) == 1
        img_id = matching_images[0].id

    img = conn.get_image(img_id)

    # make sure the AMI is ready to be consumed.
    _wait_while_pending(img, 'ami: {}')
    assert img.state == 'available'

    # start new instance from AMI using the same data from the original instance.
    res = conn.run_instances(
        img_id,
        security_group_ids=[g.id for g in inst.groups],
        user_data=b64decode(inst.get_attribute('userData')['userData']),
        instance_type=inst.get_attribute('instanceType')['instanceType'],
        key_name=inst.key_name)

    assert len(res.instances) == 1
    new_inst = res.instances[0]

    # wait for instance to be running.
    _wait_while_pending(new_inst, 'instance {}')
    print(new_inst.state)
    assert new_inst.state == 'running'

    # attempts to SSH into machine, this is because Security Groups are applied
    # after an instance runs.
    assert _wait_for_ssh('ubuntu@{}'.format(new_inst.ip_address))

    return new_inst


def update_instance(inst):
    """Refresh the Docker image on ``inst`` and restart its upstart service.

    Connects as ``ubuntu@<inst.ip_address>`` with ``warn_only=True`` and, in
    order: stops the service, removes all containers, removes dangling
    images, pulls ``env.DOCKER_IMAGE_NAME`` and starts the service again.
    Sleeps five seconds afterwards so the container can come up.
    """
    target_host = 'ubuntu@{}'.format(inst.ip_address)

    with settings(host_string=target_host, warn_only=True):
        stop_cmd = "service {} stop".format(env.UPSTART_SERVICE_NAME)
        sudo(stop_cmd)

        # clear out old containers and dangling images before pulling.
        for cleanup_cmd in (
                "docker rm $(sudo docker ps -aq)",
                "docker rmi $(sudo docker images --filter dangling=true --quiet)"):
            sudo(cleanup_cmd)

        pull_cmd = "docker pull {}".format(env.DOCKER_IMAGE_NAME)
        sudo(pull_cmd)

        start_cmd = "service {} start".format(env.UPSTART_SERVICE_NAME)
        sudo(start_cmd)

        # wait for the container to spin up.
        time.sleep(5)


def health_check(ip, timeout=10):
    """Return True when the service at ``ip`` answers with a status below 500.

    Parameters:
        ip: value substituted into ``env.HEALTH_CHECK_FMT`` to build the URL.
        timeout: seconds to wait for the HTTP request before giving up.

    Returns:
        True for any response with ``status_code < 500``; False for a 5xx
        response or when the request itself fails (connection refused,
        timeout, ...), so callers can poll while the service is still
        starting instead of crashing on the first refused connection.
    """
    url = env.HEALTH_CHECK_FMT.format(ip)
    try:
        # without a timeout requests may wait on an unresponsive host forever.
        resp = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return False
    return resp.status_code < 500


def remap_elastic_ip(conn, inst):
    """Associate the staging Elastic IP with ``inst``.

    Parameters:
        conn: EC2 connection used to make the call.
        inst: instance that should receive ``env.STAGING_IP``.

    Returns:
        Whatever ``conn.associate_address`` returns.
    """
    # The address lives on Fabric's ``env``; a bare ``STAGING_IP`` is not
    # defined anywhere and raised NameError.
    return conn.associate_address(
        inst.id,
        public_ip=env.STAGING_IP,
        allow_reassociation=True)


def migrate():
    """Placeholder task for migrations and other follow-up operations.

    Requires ``env.hosts`` to be set (provided by the ``staging`` task) and
    then always raises NotImplementedError.
    """
    host_providers = [staging]
    require("hosts", provided_by=host_providers)

    raise NotImplementedError()

def updates():
    """Replace the staging instance with an updated copy and move the Elastic IP.

    updates assumes the presence of these variables passed in through drone.
    'STAGING_IP' which is the AWS's ELASTIC IP for staging
    'REGION' which is the region the instance is in.
    'AWS_ACCESS_KEY'
    'AWS_SECRET_KEY'
    'IMAGE_TAG' The AWS AMI tag name, which will prepend the current date, git-hash and instance_id
    'UPSTART_SERVICE_NAME' the name of the service on the instance
    'DOCKER_IMAGE_NAME' the name of the docker image.
    'DOCKER_CONTAINER_NAME' the name of the docker container
    'HEALTH_CHECK_FMT' url use in an HTTP GET request to check the health of the service.

    Flow: find the instance behind STAGING_IP, start a copy of it, update the
    copy, and health-check it. A healthy copy is imaged, health-checked again
    (up to 12 tries, 10 seconds apart), given the Elastic IP, and the old
    instance is terminated. An unhealthy copy is terminated instead.

    Raises:
        AssertionError: a required variable is empty, the EC2 connection
            could not be made, the copy stops answering after imaging, or
            the Elastic IP could not be remapped.
        Exception: the freshly updated copy fails its first health check.
    """
    required_settings = (
        'STAGING_IP',
        'REGION',
        'AWS_ACCESS_KEY',
        'AWS_SECRET_KEY',
        'IMAGE_TAG',
        'UPSTART_SERVICE_NAME',
        'DOCKER_IMAGE_NAME',
        'DOCKER_CONTAINER_NAME',
        'HEALTH_CHECK_FMT',
    )
    for setting in required_settings:
        assert getattr(env, setting), '{} must be set'.format(setting)

    conn = ec2.connect_to_region(
        env.REGION,
        aws_access_key_id=env.AWS_ACCESS_KEY,
        aws_secret_access_key=env.AWS_SECRET_KEY)

    assert conn

    inst = get_instance(conn)
    new_inst = copy_instance(conn, inst)
    update_instance(new_inst)

    # `healthy` used to be read here before it was ever assigned, which raised
    # UnboundLocalError; run the health check that decides the branch.
    if not health_check(new_inst.ip_address):
        new_inst.terminate()
        raise Exception("Health check failure, new instance terminated.")

    create_image(conn, new_inst.id)
    time.sleep(30)  # give some time for new image to boot up.

    # health_check formats its argument into a URL, so it needs the address
    # of the instance, not its instance id.
    healthy = False
    for _ in range(12):
        if health_check(new_inst.ip_address):
            healthy = True
            break
        time.sleep(10)

    # NOTE: when this assertion fails the new instance is left running.
    assert healthy
    assert remap_elastic_ip(conn, new_inst)
    inst.terminate()

After setting the environment variables, run `fab updates`. If any migrations or additional operations need to be run, they can be implemented in the `migrate` function and run with `fab staging migrate`.

[1] Currently, Fabric is only available for Python 2.7.