diff options
Diffstat (limited to '')
-rw-r--r-- | heartbeat/gcp-vpc-move-route.in | 490 |
1 files changed, 490 insertions, 0 deletions
diff --git a/heartbeat/gcp-vpc-move-route.in b/heartbeat/gcp-vpc-move-route.in new file mode 100644 index 0000000..3f543fe --- /dev/null +++ b/heartbeat/gcp-vpc-move-route.in @@ -0,0 +1,490 @@ +#!@PYTHON@ -tt +# - *- coding: utf- 8 - *- +# +# +# OCF resource agent to move an IP address within a VPC in GCP +# +# License: GNU General Public License (GPL) +# Copyright (c) 2018 Hervé Werner (MFG Labs) +# Copyright 2018 Google Inc. +# Based on code from Markus Guertler (aws-vpc-move-ip) +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# + + +####################################################################### + +import atexit +import logging +import os +import sys +import time + +OCF_FUNCTIONS_DIR = os.environ.get("OCF_FUNCTIONS_DIR", "%s/lib/heartbeat" % os.environ.get("OCF_ROOT")) +sys.path.append(OCF_FUNCTIONS_DIR) + +from ocf import * + +try: + import googleapiclient.discovery + import pyroute2 + try: + from google.oauth2.service_account import Credentials as ServiceAccountCredentials + except ImportError: + from oauth2client.service_account import ServiceAccountCredentials +except ImportError: + pass + +if sys.version_info >= (3, 0): + # Python 3 imports. + import urllib.parse as urlparse + import urllib.request as urlrequest +else: + # Python 2 imports. + import urllib as urlparse + import urllib2 as urlrequest + + +GCP_API_URL_PREFIX = 'https://www.googleapis.com/compute/v1' +METADATA_SERVER = 'http://metadata.google.internal/computeMetadata/v1/' +METADATA_HEADERS = {'Metadata-Flavor': 'Google'} +METADATA = \ +'''<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="gcp-vpc-move-route" version="1.0"> +<version>1.0</version> +<longdesc lang="en"> +Resource Agent that can move a floating IP addresse within a GCP VPC by changing an +entry in the routing table. This agent also configures the floating IP locally +on the instance OS. +Requirements : +- IP forwarding must be enabled on all instances in order to be able to +terminate the route +- The floating IP address must be chosen so that it is outside all existing +subnets in the VPC network +- IAM permissions +(see https://cloud.google.com/compute/docs/access/iam-permissions) : +1) compute.routes.delete, compute.routes.get and compute.routes.update on the +route +2) compute.networks.updatePolicy on the network (to add a new route) +3) compute.networks.get on the network (to check the VPC network existence) +4) compute.routes.list on the project (to check conflicting routes) +</longdesc> +<shortdesc lang="en">Move IP within a GCP VPC</shortdesc> + +<parameters> + +<parameter name="ip" unique="1" required="1"> +<longdesc lang="en"> +Floating IP address. Note that this IP must be chosen outside of all existing +subnet ranges +</longdesc> +<shortdesc lang="en">Floating IP</shortdesc> +<content type="string" /> +</parameter> + +<parameter name="vpc_network" required="0"> +<longdesc lang="en"> +Name of the VPC network +</longdesc> +<shortdesc lang="en">VPC network</shortdesc> +<content type="string" default="default" /> +</parameter> + +<parameter name="project"> +<longdesc lang="en"> +Project ID of the instance. It can be useful to set this attribute if +the instance is in a shared service project. Otherwise, the agent should +be able to determine the project ID automatically. +</longdesc> +<shortdesc lang="en">Project ID</shortdesc> +<content type="string" default="default" /> +</parameter> + +<parameter name="interface"> +<longdesc lang="en"> +Name of the network interface +</longdesc> +<shortdesc lang="en">Network interface name</shortdesc> +<content type="string" default="eth0" /> +</parameter> + +<parameter name="route_name" unique="1"> +<longdesc lang="en"> +Route name +</longdesc> +<shortdesc lang="en">Route name</shortdesc> +<content type="string" default="ra-%s" /> +</parameter> + +<parameter name="serviceaccount"> +<longdesc lang="en">Path to Service account JSON file</longdesc> +<shortdesc lang="en">Service account JSONfile</shortdesc> +<content type="string" default="" /> +</parameter> + +<parameter name="stackdriver_logging" unique="0" required="0"> +<longdesc lang="en">If enabled (set to true), IP failover logs will be posted to stackdriver logging</longdesc> +<shortdesc lang="en">Stackdriver-logging support</shortdesc> +<content type="boolean" default="" /> +</parameter> +</parameters> + +<actions> +<action name="start" timeout="180s" /> +<action name="stop" timeout="180s" /> +<action name="monitor" depth="0" timeout="30s" interval="60s" /> +<action name="validate-all" timeout="5s" /> +<action name="meta-data" timeout="5s" /> +</actions> +</resource-agent> +''' % os.path.basename(sys.argv[0]) + + +class Context(object): + __slots__ = 'conn', 'iface_idx', 'instance', 'instance_url', 'interface', \ + 'ip', 'iproute', 'project', 'route_name', 'vpc_network', \ + 'vpc_network_url', 'zone' + + +def wait_for_operation(ctx, response): + """Blocks until operation completes. + Code from GitHub's GoogleCloudPlatform/python-docs-samples + + Args: + response: dict, a request's response + """ + def _OperationGetter(response): + operation = response[u'name'] + if response.get(u'zone'): + return ctx.conn.zoneOperations().get( + project=ctx.project, zone=ctx.zone, operation=operation) + else: + return ctx.conn.globalOperations().get( + project=ctx.project, operation=operation) + + while True: + result = _OperationGetter(response).execute() + + if result['status'] == 'DONE': + if 'error' in result: + raise Exception(result['error']) + return result + + time.sleep(1) + + +def get_metadata(metadata_key, params=None, timeout=None): + """Performs a GET request with the metadata headers. + + Args: + metadata_key: string, the metadata to perform a GET request on. + params: dictionary, the query parameters in the GET request. + timeout: int, timeout in seconds for metadata requests. + + Returns: + HTTP response from the GET request. + + Raises: + urlerror.HTTPError: raises when the GET request fails. + """ + timeout = timeout or 60 + metadata_url = os.path.join(METADATA_SERVER, metadata_key) + params = urlparse.urlencode(params or {}) + url = '%s?%s' % (metadata_url, params) + request = urlrequest.Request(url, headers=METADATA_HEADERS) + request_opener = urlrequest.build_opener(urlrequest.ProxyHandler({})) + return request_opener.open(request, timeout=timeout * 1.1).read().decode("utf-8") + + +def validate(ctx): + if os.geteuid() != 0: + logger.error('You must run this agent as root') + sys.exit(OCF_ERR_PERM) + + try: + serviceaccount = os.environ.get("OCF_RESKEY_serviceaccount") + if not serviceaccount: + try: + from googleapiclient import _auth + credentials = _auth.default_credentials(); + except: + credentials = GoogleCredentials.get_application_default() + logging.debug("using application default credentials") + else: + scope = ['https://www.googleapis.com/auth/cloud-platform'] + logging.debug("using credentials from service account") + try: + credentials = ServiceAccountCredentials.from_service_account_file(filename=serviceaccount, scopes=scope) + except AttributeError: + credentials = ServiceAccountCredentials.from_json_keyfile_name(serviceaccount, scope) + except Exception as e: + logging.error(str(e)) + sys.exit(OCF_ERR_GENERIC) + ctx.conn = googleapiclient.discovery.build('compute', 'v1', credentials=credentials, cache_discovery=False) + except Exception as e: + logger.error('Couldn\'t connect with google api: ' + str(e)) + sys.exit(OCF_ERR_GENERIC) + + ctx.ip = os.environ.get('OCF_RESKEY_ip') + if not ctx.ip: + logger.error('Missing ip parameter') + sys.exit(OCF_ERR_CONFIGURED) + + try: + ctx.instance = get_metadata('instance/name') + ctx.zone = get_metadata('instance/zone').split('/')[-1] + ctx.project = os.environ.get( + 'OCF_RESKEY_project', get_metadata('project/project-id')) + except Exception as e: + logger.error( + 'Instance information not found. Is this a GCE instance ?: %s', str(e)) + sys.exit(OCF_ERR_GENERIC) + + ctx.instance_url = '%s/projects/%s/zones/%s/instances/%s' % ( + GCP_API_URL_PREFIX, ctx.project, ctx.zone, ctx.instance) + ctx.vpc_network = os.environ.get('OCF_RESKEY_vpc_network', 'default') + ctx.vpc_network_url = '%s/projects/%s/global/networks/%s' % ( + GCP_API_URL_PREFIX, ctx.project, ctx.vpc_network) + ctx.interface = os.environ.get('OCF_RESKEY_interface', 'eth0') + ctx.route_name = os.environ.get( + 'OCF_RESKEY_route_name', 'ra-%s' % os.path.basename(sys.argv[0])) + ctx.iproute = pyroute2.IPRoute() + atexit.register(ctx.iproute.close) + idxs = ctx.iproute.link_lookup(ifname=ctx.interface) + if not idxs: + logger.error('Network interface not found') + sys.exit(OCF_ERR_GENERIC) + ctx.iface_idx = idxs[0] + + +def check_conflicting_routes(ctx): + fl = '(destRange = "%s*") AND (network = "%s") AND (name != "%s")' % ( + ctx.ip, ctx.vpc_network_url, ctx.route_name) + try: + request = ctx.conn.routes().list(project=ctx.project, filter=fl) + response = request.execute() + except googleapiclient.errors.HttpError as e: + if e.resp.status == 404: + logger.error('VPC network not found') + if 'stop' in sys.argv[1]: + sys.exit(OCF_SUCCESS) + else: + sys.exit(OCF_ERR_CONFIGURED) + else: + raise + + route_list = response.get('items', None) + if route_list: + logger.error( + 'Conflicting unnmanaged routes for destination %s/32 in VPC %s found : %s', + ctx.ip, ctx.vpc_network, str(route_list)) + sys.exit(OCF_ERR_CONFIGURED) + + +def route_release(ctx): + request = ctx.conn.routes().delete(project=ctx.project, route=ctx.route_name) + wait_for_operation(ctx, request.execute()) + + +def ip_monitor(ctx): + logger.info('IP monitor: checking local network configuration') + + def address_filter(addr): + for attr in addr['attrs']: + if attr[0] == 'IFA_LOCAL': + if attr[1] == ctx.ip: + return True + else: + return False + + route = ctx.iproute.get_addr( + index=ctx.iface_idx, match=address_filter) + if not route: + logger.warning( + 'The floating IP %s is not locally configured on this instance (%s)', + ctx.ip, ctx.instance) + return OCF_NOT_RUNNING + + logger.debug( + 'The floating IP %s is correctly configured on this instance (%s)', + ctx.ip, ctx.instance) + return OCF_SUCCESS + + +def ip_release(ctx): + ctx.iproute.addr('del', index=ctx.iface_idx, address=ctx.ip, mask=32) + + +def ip_and_route_start(ctx): + logger.info('Bringing up the floating IP %s', ctx.ip) + + # Add a new entry in the routing table + # If the route entry exists and is pointing to another instance, take it over + + # Ensure that there is no route that we are not aware of that is also handling our IP + check_conflicting_routes(ctx) + + # There is no replace API, We need to first delete the existing route if any + try: + request = ctx.conn.routes().get(project=ctx.project, route=ctx.route_name) + request.execute() + # TODO: check specific exception for 404 + except googleapiclient.errors.HttpError as e: + if e.resp.status != 404: + raise + else: + route_release(ctx) + + route_body = { + 'name': ctx.route_name, + 'network': ctx.vpc_network_url, + 'destRange': '%s/32' % ctx.ip, + 'nextHopInstance': ctx.instance_url, + } + try: + request = ctx.conn.routes().insert(project=ctx.project, body=route_body) + wait_for_operation(ctx, request.execute()) + except googleapiclient.errors.HttpError: + try: + request = ctx.conn.networks().get( + project=ctx.project, network=ctx.vpc_network) + request.execute() + except googleapiclient.errors.HttpError as e: + if e.resp.status == 404: + logger.error('VPC network not found') + sys.exit(OCF_ERR_CONFIGURED) + else: + raise + else: + raise + + # Configure the IP address locally + # We need to release the IP first + if ip_monitor(ctx) == OCF_SUCCESS: + ip_release(ctx) + + ctx.iproute.addr('add', index=ctx.iface_idx, address=ctx.ip, mask=32) + ctx.iproute.link('set', index=ctx.iface_idx, state='up') + logger.info('Successfully brought up the floating IP %s', ctx.ip) + + +def route_monitor(ctx): + logger.info('GCP route monitor: checking route table') + + # Ensure that there is no route that we are not aware of that is also handling our IP + check_conflicting_routes(ctx) + + try: + request = ctx.conn.routes().get(project=ctx.project, route=ctx.route_name) + response = request.execute() + except googleapiclient.errors.HttpError as e: + if e.resp.status == 404: + return OCF_NOT_RUNNING + elif 'Insufficient Permission' in e.content: + return OCF_ERR_PERM + else: + raise + + routed_to_instance = response.get('nextHopInstance', '<unknown>') + instance_url = '%s/projects/%s/zones/%s/instances/%s' % ( + GCP_API_URL_PREFIX, ctx.project, ctx.zone, ctx.instance) + if routed_to_instance != instance_url: + logger.warning( + 'The floating IP %s is not routed to this instance (%s) but to instance %s', + ctx.ip, ctx.instance, routed_to_instance.split('/')[-1]) + return OCF_NOT_RUNNING + + logger.debug( + 'The floating IP %s is correctly routed to this instance (%s)', + ctx.ip, ctx.instance) + return OCF_SUCCESS + + +def ip_and_route_stop(ctx): + logger.info('Bringing down the floating IP %s', ctx.ip) + + # Delete the route entry + # If the route entry exists and is pointing to another instance, don't touch it + if route_monitor(ctx) == OCF_NOT_RUNNING: + logger.info( + 'The floating IP %s is already not routed to this instance (%s)', + ctx.ip, ctx.instance) + else: + route_release(ctx) + + if ip_monitor(ctx) == OCF_NOT_RUNNING: + logger.info('The floating IP %s is already down', ctx.ip) + else: + ip_release(ctx) + + +def configure_logs(ctx): + # Prepare logging + global logger + logging.getLogger('googleapiclient').setLevel(logging.WARN) + logging_env = os.environ.get('OCF_RESKEY_stackdriver_logging') + if logging_env: + logging_env = logging_env.lower() + if any(x in logging_env for x in ['yes', 'true', 'enabled']): + try: + import google.cloud.logging.handlers + client = google.cloud.logging.Client() + handler = google.cloud.logging.handlers.CloudLoggingHandler( + client, name=ctx.instance) + handler.setLevel(logging.INFO) + formatter = logging.Formatter('gcp:route "%(message)s"') + handler.setFormatter(formatter) + log.addHandler(handler) + logger = logging.LoggerAdapter(log, {'OCF_RESOURCE_INSTANCE': OCF_RESOURCE_INSTANCE}) + except ImportError: + logger.error('Couldn\'t import google.cloud.logging, ' + 'disabling Stackdriver-logging support') + + +def main(): + if 'meta-data' in sys.argv[1]: + print(METADATA) + return + + ctx = Context() + + validate(ctx) + if 'validate-all' in sys.argv[1]: + return + + configure_logs(ctx) + if 'start' in sys.argv[1]: + ip_and_route_start(ctx) + elif 'stop' in sys.argv[1]: + ip_and_route_stop(ctx) + elif 'status' in sys.argv[1] or 'monitor' in sys.argv[1]: + sys.exit(ip_monitor(ctx)) + else: + usage = 'usage: %s {start|stop|monitor|status|meta-data|validate-all}' % \ + os.path.basename(sys.argv[0]) + logger.error(usage) + sys.exit(OCF_ERR_UNIMPLEMENTED) + + +if __name__ == "__main__": + main() |