Diffstat
-rwxr-xr-x | heartbeat/podman | 628
1 file changed, 628 insertions, 0 deletions
diff --git a/heartbeat/podman b/heartbeat/podman
new file mode 100755
index 0000000..53867bf
--- /dev/null
+++ b/heartbeat/podman
@@ -0,0 +1,628 @@
+#!/bin/sh
+#
+# The podman HA resource agent creates and launches a podman container
+# based on a supplied podman image. Containers managed by this agent
+# are both created and removed upon the agent's start and stop actions.
+#
+# Copyright (c) 2014 David Vossel <davidvossel@gmail.com>
+#                    Michele Baldessari <michele@acksyn.org>
+# All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# Further, this software is distributed without any warranty that it is
+# free of the rightful claim of any third person regarding infringement
+# or the like. Any license provided herein, whether implied or
+# otherwise, applies only to this software file. Patent licenses, if
+# any, provided herein do not apply to combinations of this program with
+# other software, or any other product whatsoever.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
+#
+
+#######################################################################
+# Initialization:
+
+: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
+. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
+
+# Parameter defaults
+
+OCF_RESKEY_reuse_default="0"
+
+: ${OCF_RESKEY_reuse=${OCF_RESKEY_reuse_default}}
+
+#######################################################################
+
+meta_data()
+{
+    cat <<END
+<?xml version="1.0"?>
+<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
+<resource-agent name="podman" version="1.0">
+<version>1.0</version>
+
+<longdesc lang="en">
+The podman HA resource agent creates and launches a podman container
+based on a supplied podman image. Containers managed by this agent
+are both created and removed upon the agent's start and stop actions.
+</longdesc>
+<shortdesc lang="en">Podman container resource agent.</shortdesc>
+
+<parameters>
+<parameter name="image" required="1" unique="0">
+<longdesc lang="en">
+The podman image to base this container on.
+</longdesc>
+<shortdesc lang="en">podman image</shortdesc>
+<content type="string"/>
+</parameter>
+
+<parameter name="name" required="0" unique="0">
+<longdesc lang="en">
+The name to give the created container. By default this will
+be the resource's instance name.
+</longdesc>
+<shortdesc lang="en">podman container name</shortdesc>
+<content type="string"/>
+</parameter>
+
+<parameter name="allow_pull" unique="0">
+<longdesc lang="en">
+Allow the image to be pulled from the configured podman registry when
+the image does not exist locally. NOTE: this can drastically increase
+the time required to start the container if the image repository is
+pulled over the network.
+</longdesc>
+<shortdesc lang="en">Allow pulling non-local images</shortdesc>
+<content type="boolean"/>
+</parameter>
+
+<parameter name="run_opts" required="0" unique="0">
+<longdesc lang="en">
+Options to be appended to the 'podman run' command which is used
+when creating the container during the start action. This option allows
+users to do things such as setting a custom entry point and injecting
+environment variables into the newly created container. Note the '-d'
+option is supplied regardless of this value to force containers to run
+in the background.
+
+NOTE: Do not explicitly specify the --name argument in the run_opts. This
+agent will set --name using either the resource's instance name or the
+name provided via the 'name' parameter of this agent.
+</longdesc>
+<shortdesc lang="en">run options</shortdesc>
+<content type="string"/>
+</parameter>
+
+<parameter name="run_cmd" required="0" unique="0">
+<longdesc lang="en">
+Specify a command to launch within the container once
+it has initialized.
+</longdesc>
+<shortdesc lang="en">run command</shortdesc>
+<content type="string"/>
+</parameter>
+
+<parameter name="mount_points" required="0" unique="0">
+<longdesc lang="en">
+A comma-separated list of directories that the container expects to use.
+The agent will ensure they exist by running 'mkdir -p'.
+</longdesc>
+<shortdesc lang="en">Required mount points</shortdesc>
+<content type="string"/>
+</parameter>
+
+<parameter name="monitor_cmd" required="0" unique="0">
+<longdesc lang="en">
+Specify the full path of a command to launch within the container to check
+the health of the container. This command must return 0 to indicate that
+the container is healthy. A non-zero return code indicates that the
+container has failed and should be recovered.
+
+Note: Using this method for monitoring processes inside a container
+is not recommended, as the container runtime tries to track processes
+running inside the container and does not deal well with many
+short-lived processes being spawned. Ensure that your container monitors
+its own processes and terminates on fatal error rather than invoking
+a command from the outside.
+</longdesc>
+<shortdesc lang="en">monitor command</shortdesc>
+<content type="string"/>
+</parameter>
+
+<parameter name="force_kill" required="0" unique="0">
+<longdesc lang="en">
+Kill a container immediately rather than waiting for it to gracefully
+shut down.
+</longdesc>
+<shortdesc lang="en">force kill</shortdesc>
+<content type="boolean"/>
+</parameter>
+
+<parameter name="reuse" required="0" unique="0">
+<longdesc lang="en">
+Allow the container to be reused once it is stopped. By default,
+containers are removed once they are stopped. Enable this option
+to have the container persist instead.
+</longdesc>
+<shortdesc lang="en">reuse container</shortdesc>
+<content type="boolean" default="${OCF_RESKEY_reuse_default}"/>
+</parameter>
+
+<parameter name="drop_in_dependency" required="0" unique="0">
+<longdesc lang="en">
+Use transient drop-in files to add extra dependencies to the systemd
+scopes associated with the container. During reboot, this prevents
+systemd from stopping the container before pacemaker.
+</longdesc>
+<shortdesc lang="en">drop-in dependency</shortdesc>
+<content type="boolean"/>
+</parameter>
+</parameters>
+
+<actions>
+<action name="start" timeout="90s" />
+<action name="stop" timeout="90s" />
+<action name="monitor" timeout="30s" interval="30s" depth="0" />
+<action name="meta-data" timeout="5s" />
+<action name="validate-all" timeout="30s" />
+</actions>
+</resource-agent>
+END
+}
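+
+# A minimal usage sketch (hypothetical resource name, image and options;
+# exact pcs syntax may vary by version):
+#
+#   pcs resource create web-podman ocf:heartbeat:podman \
+#       image=docker.io/library/nginx run_opts="-p 80:80" \
+#       monitor_cmd="/usr/bin/curl -fs http://localhost/" \
+#       op monitor interval=30s timeout=30s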
+
+#######################################################################
+REQUIRE_IMAGE_PULL=0
+
+podman_usage()
+{
+    cat <<END
+usage: $0 {start|stop|monitor|validate-all|meta-data}
+
+Expects to have a fully populated OCF RA-compliant environment set.
+END
+}
+
+
+monitor_cmd_exec()
+{
+    local rc=$OCF_SUCCESS
+    local out
+
+    out=$(podman exec ${CONTAINER} $OCF_RESKEY_monitor_cmd 2>&1)
+    rc=$?
+    # 125: no container with name or ID ${CONTAINER} found
+    # 126: container state improper (not running)
+    # 127: any other error
+    # 255: podman 2+: container not running
+    case "$rc" in
+        125|126|255)
+            rc=$OCF_NOT_RUNNING
+            ;;
+        0)
+            ocf_log debug "monitor cmd passed: exit code = $rc"
+            ;;
+        *)
+            ocf_exit_reason "monitor cmd failed (rc=$rc), output: $out"
+            rc=$OCF_ERR_GENERIC
+            ;;
+    esac
+
+    return $rc
+}
+
+container_exists()
+{
+    local rc
+    local out
+
+    out=$(podman exec ${CONTAINER} $OCF_RESKEY_monitor_cmd 2>&1)
+    rc=$?
+    # 125: no container with name or ID ${CONTAINER} found
+    if [ $rc -ne 125 ]; then
+        return 0
+    fi
+    return 1
+}
+
+remove_container()
+{
+    local rc
+    local execids
+
+    if ocf_is_true "$OCF_RESKEY_reuse"; then
+        # never remove the container if we have reuse enabled.
+        return 0
+    fi
+
+    container_exists
+    if [ $? -ne 0 ]; then
+        # don't attempt to remove a container that doesn't exist
+        return 0
+    fi
+    ocf_log notice "Cleaning up inactive container, ${CONTAINER}."
+    ocf_run podman rm -v $CONTAINER
+    rc=$?
+    if [ $rc -ne 0 ]; then
+        # due to a podman bug (rhbz#1841485), sometimes a stopped
+        # container can still be associated with Exec sessions, in
+        # which case the "podman rm" has to be forced
+        execids=$(podman inspect $CONTAINER --format '{{len .ExecIDs}}')
+        if [ "$execids" -ne "0" ]; then
+            ocf_log warn "Inactive container ${CONTAINER} has lingering exec sessions. Force-removing it."
+            ocf_run podman rm -f $CONTAINER
+            rc=$?
+        fi
+    fi
+    return $rc
+}
+
+podman_simple_status()
+{
+    local rc
+
+    # simple status is implemented via podman exec;
+    # everything besides success is considered "not running"
+    monitor_cmd_exec
+    rc=$?
+    if [ $rc -ne $OCF_SUCCESS ]; then
+        rc=$OCF_NOT_RUNNING;
+    fi
+    return $rc
+}
+
+podman_monitor()
+{
+    # We rely on running podman exec to monitor the container
+    # state because that command seems to be less prone to
+    # performance issues under IO load.
+    #
+    # For probes to work, we expect cmd_exec to be able to report
+    # when a container is not running. Here, we're not interested
+    # in distinguishing whether it's stopped or non-existing
+    # (there's the container_exists function for that)
+    monitor_cmd_exec
+    return $?
+}
+
+podman_create_mounts() {
+    oldIFS="$IFS"
+    IFS=","
+    for directory in $OCF_RESKEY_mount_points; do
+        mkdir -p "$directory"
+    done
+    IFS="$oldIFS"
+}
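+# e.g. mount_points="/var/log/myapp,/var/lib/myapp" (illustrative paths)
+# makes the agent run "mkdir -p" on both directories before the
+# container is launched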
+
+podman_container_id()
+{
+    # Retrieve the container ID by doing a "podman ps" rather than
+    # a "podman inspect", because the latter has performance issues
+    # under IO load.
+    # We could have run "podman start $CONTAINER" to get the ID back,
+    # but if the container is stopped, the command returns a name
+    # instead of a container ID, which would break this logic.
+    podman ps --no-trunc --format '{{.ID}} {{.Names}}' | grep -F -w -m1 "$CONTAINER" | cut -d' ' -f1
+}
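+# The pipeline above consumes "podman ps" lines of the form
+# (ID and name illustrative):
+#   4e5f6a...b7c8d9 my-container
+# grep -F -w -m1 keeps the first whole-word match on the container name
+# and cut extracts the ID column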
+
+
+create_transient_drop_in_dependency()
+{
+    local cid=$1
+    local rc=$OCF_SUCCESS
+
+    if [ -z "$cid" ]; then
+        ocf_exit_reason "Container ID not found for \"$CONTAINER\". Not creating drop-in dependency"
+        return $OCF_ERR_GENERIC
+    fi
+
+    ocf_log info "Creating drop-in dependency for \"$CONTAINER\" ($cid)"
+    for scope in "libpod-$cid.scope.d" "libpod-conmon-$cid.scope.d"; do
+        if [ $rc -eq $OCF_SUCCESS ] && [ ! -d /run/systemd/transient/"$scope" ]; then
+            mkdir -p /run/systemd/transient/"$scope" && \
+            printf "[Unit]\nBefore=pacemaker.service" > /run/systemd/transient/"$scope"/dep.conf && \
+            chmod ago+r /run/systemd/transient/"$scope" /run/systemd/transient/"$scope"/dep.conf
+            rc=$?
+        fi
+    done
+
+    if [ $rc -ne $OCF_SUCCESS ]; then
+        ocf_log error "Could not create drop-in dependency for \"$CONTAINER\" ($cid)"
+    else
+        systemctl daemon-reload
+        rc=$?
+        if [ $rc -ne $OCF_SUCCESS ]; then
+            ocf_log error "Could not refresh service definition after creating drop-in for \"$CONTAINER\""
+        fi
+    fi
+
+    return $rc
+}
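+# The drop-ins written above land in paths like (cid illustrative):
+#   /run/systemd/transient/libpod-<cid>.scope.d/dep.conf
+# and contain:
+#   [Unit]
+#   Before=pacemaker.service
+# so that on shutdown systemd tears the container scopes down only
+# after pacemaker.service has stopped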
+
+
+run_new_container()
+{
+    local opts=$1
+    local image=$2
+    local cmd=$3
+    local rc
+
+    ocf_log info "running container $CONTAINER for the first time"
+    out=$(podman run $opts $image $cmd 2>&1)
+    rc=$?
+
+    if [ -n "$out" ]; then
+        out="$(echo "$out" | tr -s ' \t\r\n' ' ')"
+        if [ $rc -eq 0 ]; then
+            ocf_log info "$out"
+        else
+            ocf_log err "$out"
+        fi
+    fi
+
+    if [ $rc -eq 125 ]; then
+        # If an internal podman error occurred, it might be because
+        # the internal storage layer still references an old container
+        # with the same name, even though podman itself thinks there
+        # is no such container. If so, purge the storage layer to try
+        # to clean the corruption and try again.
+        if echo "$out" | grep -q "unknown.*flag"; then
+            ocf_exit_reason "$out"
+            return $rc
+        fi
+
+        ocf_log warn "Internal podman error while creating new container $CONTAINER. Retrying."
+        ocf_run podman rm --storage $CONTAINER
+        ocf_run podman run $opts $image $cmd
+        rc=$?
+    elif [ $rc -eq 127 ]; then
+        # rhbz#1972209: podman 3.0.x seems to be hit by a race
+        # where the cgroup is not yet set up properly when the OCI
+        # runtime configures the container. If that happens, recreate
+        # the container as long as we get the same error code, or
+        # until the start timeout preempts us.
+        while [ $rc -eq 127 ] && (echo "$out" | grep -q "cgroup.*scope not found"); do
+            ocf_log warn "Internal podman error while assigning cgroup. Retrying."
+            # Arbitrary sleep to prevent consuming all CPU while looping
+            sleep 1
+            podman rm -f "$CONTAINER"
+            out=$(podman run $opts $image $cmd 2>&1)
+            rc=$?
+        done
+        # Log the created container ID if it succeeded
+        if [ $rc -eq 0 ]; then
+            ocf_log info "$out"
+        fi
+    fi
+
+    return $rc
+}
+
+
+podman_start()
+{
+    local cid
+    local rc
+
+    podman_create_mounts
+    local run_opts="-d --name=${CONTAINER}"
+    # check to see if the container has already started
+    podman_simple_status
+    if [ $? -eq $OCF_SUCCESS ]; then
+        return $OCF_SUCCESS
+    fi
+
+    if [ -n "$OCF_RESKEY_run_opts" ]; then
+        run_opts="$run_opts $OCF_RESKEY_run_opts"
+    fi
+
+    if [ $REQUIRE_IMAGE_PULL -eq 1 ]; then
+        ocf_log notice "Beginning pull of image, ${OCF_RESKEY_image}"
+        podman pull "${OCF_RESKEY_image}"
+        if [ $? -ne 0 ]; then
+            ocf_exit_reason "failed to pull image ${OCF_RESKEY_image}"
+            return $OCF_ERR_GENERIC
+        fi
+    fi
+
+    if ocf_is_true "$OCF_RESKEY_reuse" && container_exists; then
+        ocf_log info "starting existing container $CONTAINER."
+        ocf_run podman start $CONTAINER
+        rc=$?
+    else
+        # make sure any previous container matching our container name is cleaned up first.
+        # we already know at this point it wouldn't be running
+        remove_container
+        run_new_container "$run_opts" $OCF_RESKEY_image "$OCF_RESKEY_run_cmd"
+        rc=$?
+        if [ $rc -eq 125 ]; then
+            return $OCF_ERR_GENERIC
+        fi
+    fi
+
+    # if the container was stopped or didn't exist before, systemd
+    # removed the libpod* scopes, so always try to recreate the drop-ins
+    if [ $rc -eq 0 ] && ocf_is_true "$OCF_RESKEY_drop_in_dependency"; then
+        cid=$(podman_container_id)
+        create_transient_drop_in_dependency "$cid"
+        rc=$?
+    fi
+
+    if [ $rc -ne 0 ]; then
+        ocf_exit_reason "podman failed to launch container (rc: $rc)"
+        return $OCF_ERR_GENERIC
+    fi
+
+    # wait for the monitor to pass before declaring that the container is started
+    while true; do
+        podman_simple_status
+        if [ $? -ne $OCF_SUCCESS ]; then
+            ocf_exit_reason "Newly created podman container exited after start"
+            return $OCF_ERR_GENERIC
+        fi
+
+        monitor_cmd_exec
+        if [ $? -eq $OCF_SUCCESS ]; then
+            ocf_log notice "Container $CONTAINER started successfully"
+            return $OCF_SUCCESS
+        fi
+
+        ocf_exit_reason "waiting on monitor_cmd to pass after start"
+        sleep 1
+    done
+}
+
+podman_stop()
+{
+    local timeout=60
+    local rc
+
+    podman_simple_status
+    if [ $? -eq $OCF_NOT_RUNNING ]; then
+        remove_container
+        return $OCF_SUCCESS
+    fi
+
+    if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
+        timeout=$((($OCF_RESKEY_CRM_meta_timeout/1000) -10 ))
+        if [ $timeout -lt 10 ]; then
+            timeout=10
+        fi
+    fi
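+    # e.g. a 90000 ms meta timeout yields (90000/1000)-10 = 80 s of
+    # grace for "podman stop" before the operation itself times out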
+
+    if ocf_is_true "$OCF_RESKEY_force_kill"; then
+        ocf_run podman kill $CONTAINER
+        rc=$?
+    else
+        ocf_log debug "waiting $timeout second[s] before killing container"
+        ocf_run podman stop -t=$timeout $CONTAINER
+        rc=$?
+        # on stop, systemd will automatically delete any transient
+        # drop-in conf that was created earlier
+    fi
+
+    if [ $rc -ne 0 ]; then
+        # If the stop failed, it could be because the controlling conmon
+        # process died unexpectedly. If so, a generic error code is returned
+        # but the associated container exit code is -1. If that's the case,
+        # assume there's no failure and continue with the rm as usual.
+        if [ $rc -eq 125 ] && \
+           podman inspect --format '{{.State.Status}}:{{.State.ExitCode}}' $CONTAINER | grep -wq "stopped:-1"; then
+            ocf_log warn "Container ${CONTAINER} had an unexpected stop outcome. Trying to remove it anyway."
+        else
+            ocf_exit_reason "Failed to stop container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}."
+            return $OCF_ERR_GENERIC
+        fi
+    fi
+
+    remove_container
+    if [ $? -ne 0 ]; then
+        ocf_exit_reason "Failed to remove stopped container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}."
+        return $OCF_ERR_GENERIC
+    fi
+
+    return $OCF_SUCCESS
+}
+
+image_exists()
+{
+    podman image exists "${OCF_RESKEY_image}"
+    if [ $? -eq 0 ]; then
+        # image found
+        return 0
+    fi
+
+    if ocf_is_true "$OCF_RESKEY_allow_pull"; then
+        REQUIRE_IMAGE_PULL=1
+        ocf_log notice "Image (${OCF_RESKEY_image}) does not exist locally but will be pulled during start"
+        return 0
+    fi
+    # image not found.
+    return 1
+}
+
+podman_validate()
+{
+    check_binary podman
+    if [ -z "$OCF_RESKEY_image" ]; then
+        ocf_exit_reason "'image' option is required"
+        exit $OCF_ERR_CONFIGURED
+    fi
+
+    image_exists
+    if [ $? -ne 0 ]; then
+        ocf_exit_reason "base image, ${OCF_RESKEY_image}, could not be found."
+        exit $OCF_ERR_CONFIGURED
+    fi
+
+    return $OCF_SUCCESS
+}
+
+# TODO:
+# When a user runs multiple globally-unique clone instances on one node,
+# they cannot specify a distinct name parameter per instance.
+# When 'reuse' is set, the agent cannot map multiple clone instances to
+# a single container.
+
+if ocf_is_true "$OCF_RESKEY_CRM_meta_globally_unique"; then
+    if [ -n "$OCF_RESKEY_name" ]; then
+        if [ -n "$OCF_RESKEY_CRM_meta_clone_node_max" ] && [ "$OCF_RESKEY_CRM_meta_clone_node_max" -ne 1 ]
+        then
+            ocf_exit_reason "Cannot run multiple clone instances with the same 'name' parameter."
+            exit $OCF_ERR_CONFIGURED
+        fi
+        if [ -n "$OCF_RESKEY_CRM_meta_master_node_max" ] && [ "$OCF_RESKEY_CRM_meta_master_node_max" -ne 1 ]
+        then
+            ocf_exit_reason "Cannot run multiple masters with the same 'name' parameter."
+            exit $OCF_ERR_CONFIGURED
+        fi
+    fi
+    : ${OCF_RESKEY_name=`echo ${OCF_RESOURCE_INSTANCE} | tr ':' '-'`}
+else
+    : ${OCF_RESKEY_name=${OCF_RESOURCE_INSTANCE}}
+fi
+
+CONTAINER=$OCF_RESKEY_name
+
+# Note: we currently monitor podman containers with the "podman exec"
+# command, so make sure that invocation is always valid by enforcing a
+# non-empty exec command
+: ${OCF_RESKEY_monitor_cmd:=/bin/true}
+
+# When OCF_RESKEY_drop_in_dependency is not populated, we
+# look at another, file-based way of enabling the option.
+# Otherwise, consider it disabled.
+if [ -z "$OCF_RESKEY_drop_in_dependency" ]; then
+    if [ -f "/etc/sysconfig/podman_drop_in" ] || \
+       [ -f "/etc/default/podman_drop_in" ]; then
+        OCF_RESKEY_drop_in_dependency=yes
+    fi
+fi
+
+case $__OCF_ACTION in
+meta-data) meta_data
+    exit $OCF_SUCCESS;;
+start)
+    podman_validate
+    podman_start;;
+stop) podman_stop;;
+monitor) podman_monitor;;
+validate-all) podman_validate;;
+usage|help) podman_usage
+    exit $OCF_SUCCESS
+    ;;
+*) podman_usage
+    exit $OCF_ERR_UNIMPLEMENTED
+    ;;
+esac
+rc=$?
+ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
+exit $rc