diff options
Diffstat (limited to 'agents/ibmz')
-rw-r--r-- | agents/ibmz/fence_ibmz.py | 566 |
1 files changed, 566 insertions, 0 deletions
diff --git a/agents/ibmz/fence_ibmz.py b/agents/ibmz/fence_ibmz.py new file mode 100644 index 0000000..d477ade --- /dev/null +++ b/agents/ibmz/fence_ibmz.py @@ -0,0 +1,566 @@ +#!@PYTHON@ -tt + +# Copyright (c) 2020 IBM Corp. +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library. If not, see +# <http://www.gnu.org/licenses/>. + +import atexit +import logging +import time +import sys + +import requests +from requests.packages import urllib3 + +sys.path.append("@FENCEAGENTSLIBDIR@") +from fencing import * +from fencing import fail_usage, run_delay, EC_GENERIC_ERROR + +DEFAULT_POWER_TIMEOUT = '300' +ERROR_NOT_FOUND = ("{obj_type} {obj_name} not found in this HMC. " + "Attention: names are case-sensitive.") + +class ApiClientError(Exception): + """ + Base exception for all API Client related errors. + """ + +class ApiClientRequestError(ApiClientError): + """ + Raised when an API request ends in error + """ + + def __init__(self, req_method, req_uri, status, reason, message): + self.req_method = req_method + self.req_uri = req_uri + self.status = status + self.reason = reason + self.message = message + super(ApiClientRequestError, self).__init__() + + def __str__(self): + return ( + "API request failed, details:\n" + "HTTP Request : {req_method} {req_uri}\n" + "HTTP Response status: {status}\n" + "Error reason: {reason}\n" + "Error message: {message}\n".format( + req_method=self.req_method, req_uri=self.req_uri, + status=self.status, reason=self.reason, message=self.message) + ) + +class APIClient(object): + DEFAULT_CONFIG = { + # how many connection-related errors to retry on + 'connect_retries': 3, + # how many times to retry on read errors (after request was sent to the + # server) + 'read_retries': 3, + # http methods that should be retried + 'method_whitelist': ['HEAD', 'GET', 'OPTIONS'], + # limit of redirects to perform to avoid loops + 'redirect': 5, + # how long to wait while establishing a connection + 'connect_timeout': 30, + # how long to wait for asynchronous operations (jobs) to complete + 'operation_timeout': 900, + # how long to wait between bytes sent by the remote side + 'read_timeout': 300, + # default API port + 'port': 6794, + # validate ssl certificates + 'ssl_verify': False, + # load on activate is set in the HMC activation profile and therefore + # no additional load is executed by the fence agent + 'load_on_activate': False + } + LABEL_BY_OP_MODE = { + 'classic': { + 'nodes': 'logical-partitions', + 'state-on': 'operating', + 'start': 'load', + 'stop': 'deactivate' + }, + 'dpm': { + 'nodes': 'partitions', + 'state-on': 'active', + 'start': 'start', + 'stop': 'stop' + } + } + def __init__(self, host, user, passwd, config=None): + self.host = host + if not passwd: + raise ValueError('Password cannot be empty') + self.passwd = passwd + if not user: + raise ValueError('Username cannot be empty') + self.user = user + self._cpc_cache = {} + self._session = None + self._config = self.DEFAULT_CONFIG.copy() + # apply user defined values + if config: + self._config.update(config) + + def _create_session(self): + """ + Create a new requests session and apply config values + """ + session = requests.Session() + retry_obj = urllib3.Retry( + # setting a total is necessary to cover SSL related errors + total=max(self._config['connect_retries'], + self._config['read_retries']), + connect=self._config['connect_retries'], + read=self._config['read_retries'], + method_whitelist=self._config['method_whitelist'], + redirect=self._config['redirect'] + ) + session.mount('http://', requests.adapters.HTTPAdapter( + max_retries=retry_obj)) + session.mount('https://', requests.adapters.HTTPAdapter( + max_retries=retry_obj)) + return session + + def _get_mode_labels(self, cpc): + """ + Return the map of labels that corresponds to the cpc operation mode + """ + if self.is_dpm_enabled(cpc): + return self.LABEL_BY_OP_MODE['dpm'] + return self.LABEL_BY_OP_MODE['classic'] + + def _get_partition(self, cpc, partition): + """ + Return the properties of the specified partition. Raises ValueError if + it cannot be found. + """ + # HMC API's documentation says it'll return an empty array when no + # matches are found but for a CPC in classic mode it returns in fact + # 404, so we handle this accordingly. Remove the extra handling below + # once this behavior has been fixed on the API's side. + label_map = self._get_mode_labels(cpc) + resp = self._request('get', '{}/{}?name={}'.format( + self._cpc_cache[cpc]['object-uri'], label_map['nodes'], partition), + valid_codes=[200, 404]) + + if label_map['nodes'] not in resp or not resp[label_map['nodes']]: + raise ValueError(ERROR_NOT_FOUND.format( + obj_type='LPAR/Partition', obj_name=partition)) + return resp[label_map['nodes']][0] + + def _partition_switch_power(self, cpc, partition, action): + """ + Perform the API request to start (power on) or stop (power off) the + target partition and wait for the job to finish. + """ + # retrieve partition's uri + part_uri = self._get_partition(cpc, partition)['object-uri'] + label_map = self._get_mode_labels(cpc) + + # in dpm mode the request must have empty body + if self.is_dpm_enabled(cpc): + body = None + # in classic mode we make sure the operation is executed + # even if the partition is already on + else: + body = {'force': True} + # when powering on the partition must be activated first + if action == 'start': + op_uri = '{}/operations/activate'.format(part_uri) + job_resp = self._request( + 'post', op_uri, body=body, valid_codes=[202]) + # always wait for activate otherwise the load (start) + # operation will fail + if self._config['operation_timeout'] == 0: + timeout = self.DEFAULT_CONFIG['operation_timeout'] + else: + timeout = self._config['operation_timeout'] + logging.debug( + 'waiting for activate (timeout %s secs)', timeout) + self._wait_for_job('post', op_uri, job_resp['job-uri'], + timeout=timeout) + if self._config['load_on_activate']: + return + + # trigger the start job + op_uri = '{}/operations/{}'.format(part_uri, label_map[action]) + job_resp = self._request('post', op_uri, body=body, valid_codes=[202]) + if self._config['operation_timeout'] == 0: + return + logging.debug('waiting for %s (timeout %s secs)', + label_map[action], self._config['operation_timeout']) + self._wait_for_job('post', op_uri, job_resp['job-uri'], + timeout=self._config['operation_timeout']) + + def _request(self, method, uri, body=None, headers=None, valid_codes=None): + """ + Perform a request to the HMC API + """ + assert method in ('delete', 'head', 'get', 'post', 'put') + + url = 'https://{host}:{port}{uri}'.format( + host=self.host, port=self._config['port'], uri=uri) + if not headers: + headers = {} + + if self._session is None: + raise ValueError('You need to log on first') + method = getattr(self._session, method) + timeout = ( + self._config['connect_timeout'], self._config['read_timeout']) + response = method(url, json=body, headers=headers, + verify=self._config['ssl_verify'], timeout=timeout) + + if valid_codes and response.status_code not in valid_codes: + reason = '(no reason)' + message = '(no message)' + if response.headers.get('content-type') == 'application/json': + try: + json_resp = response.json() + except ValueError: + pass + else: + reason = json_resp.get('reason', reason) + message = json_resp.get('message', message) + else: + message = '{}...'.format(response.text[:500]) + raise ApiClientRequestError( + response.request.method, response.request.url, + response.status_code, reason, message) + + if response.status_code == 204: + return dict() + try: + json_resp = response.json() + except ValueError: + raise ApiClientRequestError( + response.request.method, response.request.url, + response.status_code, '(no reason)', + 'Invalid JSON content in response') + + return json_resp + + def _update_cpc_cache(self, cpc_props): + self._cpc_cache[cpc_props['name']] = { + 'object-uri': cpc_props['object-uri'], + 'dpm-enabled': cpc_props.get('dpm-enabled', False) + } + + def _wait_for_job(self, req_method, req_uri, job_uri, timeout): + """ + Perform API requests to check for job status until it has completed + or the specified timeout is reached + """ + op_timeout = time.time() + timeout + while time.time() < op_timeout: + job_resp = self._request("get", job_uri) + if job_resp['status'] == 'complete': + if job_resp['job-status-code'] in (200, 201, 204): + return + raise ApiClientRequestError( + req_method, req_uri, + job_resp.get('job-status-code', '(no status)'), + job_resp.get('job-reason-code', '(no reason)'), + job_resp.get('job-results', {}).get( + 'message', '(no message)') + ) + time.sleep(1) + raise ApiClientError('Timed out while waiting for job completion') + + def cpc_list(self): + """ + Return a list of CPCs in the format {'name': 'cpc-name', 'status': + 'operating'} + """ + list_resp = self._request("get", "/api/cpcs", valid_codes=[200]) + ret = [] + for cpc_props in list_resp['cpcs']: + self._update_cpc_cache(cpc_props) + ret.append({ + 'name': cpc_props['name'], 'status': cpc_props['status']}) + return ret + + def is_dpm_enabled(self, cpc): + """ + Return True if CPC is in DPM mode, False for classic mode + """ + if cpc in self._cpc_cache: + return self._cpc_cache[cpc]['dpm-enabled'] + list_resp = self._request("get", "/api/cpcs?name={}".format(cpc), + valid_codes=[200]) + if not list_resp['cpcs']: + raise ValueError(ERROR_NOT_FOUND.format( + obj_type='CPC', obj_name=cpc)) + self._update_cpc_cache(list_resp['cpcs'][0]) + return self._cpc_cache[cpc]['dpm-enabled'] + + def logon(self): + """ + Open a session with the HMC API and store its ID + """ + self._session = self._create_session() + logon_body = {"userid": self.user, "password": self.passwd} + logon_resp = self._request("post", "/api/sessions", body=logon_body, + valid_codes=[200, 201]) + self._session.headers["X-API-Session"] = logon_resp['api-session'] + + def logoff(self): + """ + Close/delete the HMC API session + """ + if self._session is None: + return + self._request("delete", "/api/sessions/this-session", + valid_codes=[204]) + self._cpc_cache = {} + self._session = None + + def partition_list(self, cpc): + """ + Return a list of partitions in the format {'name': 'part-name', + 'status': 'on'} + """ + label_map = self._get_mode_labels(cpc) + list_resp = self._request( + 'get', '{}/{}'.format( + self._cpc_cache[cpc]['object-uri'], label_map['nodes']), + valid_codes=[200]) + status_map = {label_map['state-on']: 'on'} + return [{'name': part['name'], + 'status': status_map.get(part['status'].lower(), 'off')} + for part in list_resp[label_map['nodes']]] + + def partition_start(self, cpc, partition): + """ + Power on a partition + """ + self._partition_switch_power(cpc, partition, 'start') + + def partition_status(self, cpc, partition): + """ + Return the current status of a partition (on or off) + """ + label_map = self._get_mode_labels(cpc) + + part_props = self._get_partition(cpc, partition) + if part_props['status'].lower() == label_map['state-on']: + return 'on' + return 'off' + + def partition_stop(self, cpc, partition): + """ + Power off a partition + """ + self._partition_switch_power(cpc, partition, 'stop') + +def parse_plug(options): + """ + Extract cpc and partition from specified plug value + """ + try: + cpc, partition = options['--plug'].strip().split('/', 1) + except ValueError: + fail_usage('Please specify nodename in format cpc/partition') + cpc = cpc.strip() + if not cpc or not partition: + fail_usage('Please specify nodename in format cpc/partition') + return cpc, partition + +def get_power_status(conn, options): + logging.debug('executing get_power_status') + status = conn.partition_status(*parse_plug(options)) + return status + +def set_power_status(conn, options): + logging.debug('executing set_power_status') + if options['--action'] == 'on': + conn.partition_start(*parse_plug(options)) + elif options['--action'] == 'off': + conn.partition_stop(*parse_plug(options)) + else: + fail_usage('Invalid action {}'.format(options['--action'])) + +def get_outlet_list(conn, options): + logging.debug('executing get_outlet_list') + result = {} + for cpc in conn.cpc_list(): + for part in conn.partition_list(cpc['name']): + result['{}/{}'.format(cpc['name'], part['name'])] = ( + part['name'], part['status']) + return result + +def disconnect(conn): + """ + Close the API session + """ + try: + conn.logoff() + except Exception as exc: + logging.exception('Logoff failed: ') + sys.exit(str(exc)) + +def set_opts(): + """ + Define the options supported by this agent + """ + device_opt = [ + "ipaddr", + "ipport", + "login", + "passwd", + "port", + "connect_retries", + "connect_timeout", + "operation_timeout", + "read_retries", + "read_timeout", + "ssl_secure", + "load_on_activate", + ] + + all_opt["ipport"]["default"] = APIClient.DEFAULT_CONFIG['port'] + all_opt["power_timeout"]["default"] = DEFAULT_POWER_TIMEOUT + port_desc = ("Physical plug id in the format cpc-name/partition-name " + "(case-sensitive)") + all_opt["port"]["shortdesc"] = port_desc + all_opt["port"]["help"] = ( + "-n, --plug=[id] {}".format(port_desc)) + all_opt["connect_retries"] = { + "getopt" : ":", + "longopt" : "connect-retries", + "help" : "--connect-retries=[number] How many times to " + "retry on connection errors", + "default" : APIClient.DEFAULT_CONFIG['connect_retries'], + "type" : "integer", + "required" : "0", + "shortdesc" : "How many times to retry on connection errors", + "order" : 2 + } + all_opt["read_retries"] = { + "getopt" : ":", + "longopt" : "read-retries", + "help" : "--read-retries=[number] How many times to " + "retry on errors related to reading from server", + "default" : APIClient.DEFAULT_CONFIG['read_retries'], + "type" : "integer", + "required" : "0", + "shortdesc" : "How many times to retry on read errors", + "order" : 2 + } + all_opt["connect_timeout"] = { + "getopt" : ":", + "longopt" : "connect-timeout", + "help" : "--connect-timeout=[seconds] How long to wait to " + "establish a connection", + "default" : APIClient.DEFAULT_CONFIG['connect_timeout'], + "type" : "second", + "required" : "0", + "shortdesc" : "How long to wait to establish a connection", + "order" : 2 + } + all_opt["operation_timeout"] = { + "getopt" : ":", + "longopt" : "operation-timeout", + "help" : "--operation-timeout=[seconds] How long to wait for " + "power operation to complete (0 = do not wait)", + "default" : APIClient.DEFAULT_CONFIG['operation_timeout'], + "type" : "second", + "required" : "0", + "shortdesc" : "How long to wait for power operation to complete", + "order" : 2 + } + all_opt["read_timeout"] = { + "getopt" : ":", + "longopt" : "read-timeout", + "help" : "--read-timeout=[seconds] How long to wait " + "to read data from server", + "default" : APIClient.DEFAULT_CONFIG['read_timeout'], + "type" : "second", + "required" : "0", + "shortdesc" : "How long to wait for server data", + "order" : 2 + } + all_opt["load_on_activate"] = { + "getopt" : "", + "longopt" : "load-on-activate", + "help" : "--load-on-activate Rely on the HMC to perform " + "a load operation on activation", + "required" : "0", + "order" : 3 + } + return device_opt + +def main(): + """ + Agent entry point + """ + # register exit handler used by pacemaker + atexit.register(atexit_handler) + + # prepare accepted options + device_opt = set_opts() + + # parse options provided on input + options = check_input(device_opt, process_input(device_opt)) + + docs = { + "shortdesc": "Fence agent for IBM z LPARs", + "longdesc": ( + "fence_ibmz is a power fencing agent which uses the HMC Web " + "Services API to fence IBM z LPARs."), + "vendorurl": "http://www.ibm.com" + } + show_docs(options, docs) + + run_delay(options) + + # set underlying library's logging and ssl config according to specified + # options + requests_log = logging.getLogger("urllib3") + requests_log.propagate = True + if "--verbose" in options: + requests_log.setLevel(logging.DEBUG) + if "--ssl-insecure" in options: + urllib3.disable_warnings( + category=urllib3.exceptions.InsecureRequestWarning) + + hmc_address = options["--ip"] + hmc_userid = options["--username"] + hmc_password = options["--password"] + config = { + 'connect_retries': int(options['--connect-retries']), + 'read_retries': int(options['--read-retries']), + 'operation_timeout': int(options['--operation-timeout']), + 'connect_timeout': int(options['--connect-timeout']), + 'read_timeout': int(options['--read-timeout']), + 'port': int(options['--ipport']), + 'ssl_verify': bool('--ssl-insecure' not in options), + 'load_on_activate': bool('--load-on-activate' in options), + } + try: + conn = APIClient(hmc_address, hmc_userid, hmc_password, config) + conn.logon() + atexit.register(disconnect, conn) + result = fence_action(conn, options, set_power_status, + get_power_status, get_outlet_list) + except Exception: + logging.exception('Exception occurred: ') + result = EC_GENERIC_ERROR + sys.exit(result) + +if __name__ == "__main__": + main() |