summaryrefslogtreecommitdiffstats
path: root/agents/ocf/controld.in
blob: ace49332ea0eff8679af5c59b3d5fc35cb53b345 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
#!/bin/sh
#
# ocf:pacemaker:controld resource agent
#
# Copyright 2008-2023 the Pacemaker project contributors
#
# The version control history for this file may have further details.
#
# This source code is licensed under the GNU General Public License version 2
# (GPLv2) WITHOUT ANY WARRANTY.

#
# Manages the DLM controld process
#

#######################################################################
# Initialization:

# Load the OCF shell function library. The location can be overridden through
# OCF_FUNCTIONS; otherwise it is derived from OCF_ROOT.
: "${OCF_FUNCTIONS:=${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs}"
. "${OCF_FUNCTIONS}"

# The requested action defaults to the first command-line argument
: "${__OCF_ACTION:=$1}"

# Explicitly list all environment variables used, to make static analysis happy.
# The expansions are quoted so that the no-op ":" never word-splits or globs
# the resulting values.
: "${OCF_RESKEY_CRM_meta_globally_unique:=false}"
: "${OCF_RESKEY_allow_stonith_disabled:=false}"
: "${OCF_RESKEY_sctp:=false}"
: "${OCF_RESOURCE_INSTANCE:=}"

# Pick default daemon and arguments from the resource instance name: a name
# containing "gfs" (any case) selects gfs_controld, anything else dlm_controld.
#
# "args" uses "=" (default only when unset) so an explicitly empty value is
# preserved, while "daemon" uses ":=" (default when unset or empty).
case "$OCF_RESOURCE_INSTANCE" in
    *[gG][fF][sS]*)
        : "${OCF_RESKEY_args=-g 0}"
        : "${OCF_RESKEY_daemon:=gfs_controld}"
        ;;
    *[dD][lL][mM]*)
        : "${OCF_RESKEY_args=-s 0}"
        : "${OCF_RESKEY_daemon:=dlm_controld}"
        ;;
    *)
        : "${OCF_RESKEY_args=-s 0}"
        : "${OCF_RESKEY_daemon:=dlm_controld}"
        ;;
esac


#######################################################################

# Prefer the heartbeat-provided controld agent when one is installed: pass the
# requested action on to it and exit with whatever status it reports.
if test -e "$OCF_ROOT/resource.d/heartbeat/controld"
then
    ocf_log info "Using heartbeat controld agent"
    "$OCF_ROOT/resource.d/heartbeat/controld" "$1"
    # A bare "exit" propagates the status of the command just run
    exit
fi

# Print the agent's OCF metadata (XML) on standard output.
#
# The document is emitted verbatim; the delimiter is quoted because nothing in
# the body is meant to be expanded by the shell.
meta_data() {
    cat <<'END_OF_METADATA'
<?xml version="1.0"?>
<resource-agent name="controld" version="@VERSION@">
<version>1.1</version>

<longdesc lang="en">
This Resource Agent can control the dlm_controld services needed by cluster-aware file systems.
It assumes that dlm_controld is in your default PATH.
In most cases, it should be run as an anonymous clone.
</longdesc>
<shortdesc lang="en">DLM Agent for cluster file systems</shortdesc>

<parameters>

<parameter name="args">
<longdesc lang="en">
Any additional options to start the dlm_controld service with
</longdesc>
<shortdesc lang="en">DLM Options</shortdesc>
<content type="string" default="-s 0" />
</parameter>

<parameter name="daemon" unique-group="daemon">
<longdesc lang="en">
The daemon to start - supports gfs_controld and dlm_controld
</longdesc>
<shortdesc lang="en">The daemon to start</shortdesc>
<content type="string" default="dlm_controld" />
</parameter>

<parameter name="allow_stonith_disabled">
<longdesc lang="en">
Allow DLM start-up even if STONITH/fencing is disabled in the cluster.

Setting this option to true will cause cluster malfunction and hangs on
fail-over for DLM clients that require fencing (such as GFS2, OCFS2, and
cLVM2).

This option is advanced use only.
</longdesc>
<shortdesc lang="en">Allow start-up even without STONITH/fencing</shortdesc>
<content type="string" default="false" />
</parameter>

</parameters>

<actions>
<action name="start"        timeout="90s" />
<action name="stop"         timeout="100s" />
<action name="monitor"      timeout="20s" interval="10s" depth="0" start-delay="0s" />
<action name="meta-data"    timeout="5s" />
<action name="validate-all"   timeout="30s" depth="0" />
</actions>
</resource-agent>
END_OF_METADATA
}

#######################################################################

# Kernel filesystem locations used by the agent:
#   CONFIGFS_DIR      where configfs is expected to be mounted
#   DLM_CONFIGFS_DIR  the DLM directory below the configfs mount point
#   DLM_SYSFS_DIR     the DLM directory in sysfs, listed to detect lockspaces
CONFIGFS_DIR=/sys/kernel/config
DLM_CONFIGFS_DIR="$CONFIGFS_DIR/dlm"
DLM_SYSFS_DIR=/sys/kernel/dlm

# Print a short usage message, naming the script as it was invoked ($0), on
# standard output.
controld_usage() {
    printf 'usage: %s {start|stop|monitor|validate-all|meta-data}\n' "$0"
    printf '\n'
    printf '%s\n' 'Expects to have a fully populated OCF RA-compliant environment set.'
}

# Fence the local node when DLM lockspaces are present in $DLM_SYSFS_DIR.
#
# Returns 0 when the directory cannot be listed or is empty. Otherwise logs an
# error, asks stonith_admin to reboot this node (as named by "crm_node -n"),
# and exits the agent with OCF_ERR_GENERIC.
check_uncontrolled_locks()
{
    # A failed listing (for example, a missing directory) means there is
    # nothing to check
    CUL_TMP=$(ls "$DLM_SYSFS_DIR" 2>&1) || return 0

    # An empty listing means no lockspace was left behind
    [ -n "$CUL_TMP" ] || return 0

    ocf_log err "Uncontrolled lockspace exists, system must reboot. Executing suicide fencing"
    stonith_admin --reboot="$(crm_node -n)" --tag controld

    exit $OCF_ERR_GENERIC
}

# Start the DLM control daemon and wait for it to become ready.
#
# Steps, in order:
#   1. Return OCF_SUCCESS at once if controld_monitor already reports success;
#      return OCF_ERR_GENERIC if it reports anything but "not running".
#   2. Make sure @runstatedir@/cluster, configfs and the DLM configfs directory
#      are available, loading kernel modules where needed
#      (OCF_ERR_INSTALLED if either configfs path stays unavailable).
#   3. Return OCF_ERR_CONFIGURED when stonith-enabled is not true, unless the
#      allow_stonith_disabled parameter is true.
#   4. Warn if no-quorum-policy is not "freeze".
#   5. Launch the daemon, then poll controld_monitor once per second until it
#      reports success and a comms addr_list with content exists
#      (OCF_SUCCESS), or reports a failure (OCF_NOT_RUNNING / OCF_ERR_GENERIC).
#
# The polling loop itself has no timeout.
controld_start() {
    controld_monitor
    rc=$?

    if [ "$rc" = "$OCF_SUCCESS" ]; then
        # Already up and healthy
        return $OCF_SUCCESS
    elif [ "$rc" != "$OCF_NOT_RUNNING" ]; then
        return $OCF_ERR_GENERIC
    fi

    # Ensure @runstatedir@/cluster exists
    if [ ! -d "@runstatedir@/cluster" ]; then
        mkdir "@runstatedir@/cluster"
    fi

    # Ensure configfs is mounted
    [ -e "$CONFIGFS_DIR" ] || modprobe configfs
    if [ ! -e "$CONFIGFS_DIR" ]; then
        ocf_log err "$CONFIGFS_DIR not available"
        return $OCF_ERR_INSTALLED
    fi
    if ! mount -t configfs | grep " $CONFIGFS_DIR " >/dev/null 2>&1; then
        mount -t configfs none "$CONFIGFS_DIR"
    fi

    # Ensure DLM is available
    [ -e "$DLM_CONFIGFS_DIR" ] || modprobe dlm
    if [ ! -e "$DLM_CONFIGFS_DIR" ]; then
        ocf_log err "$DLM_CONFIGFS_DIR not available"
        return $OCF_ERR_INSTALLED
    fi

    # Fencing must be enabled unless the administrator explicitly opted out;
    # the cluster property is only queried when there was no opt-out
    if ocf_is_true "$OCF_RESKEY_allow_stonith_disabled"; then
        :
    elif ! ocf_is_true "$(crm_attribute --type=crm_config --name=stonith-enabled --query --quiet --default=true)"; then
        ocf_log err "The cluster property stonith-enabled may not be deactivated to use the DLM"
        return $OCF_ERR_CONFIGURED
    fi

    # Warn when no-quorum-policy is unset or set to anything but freeze
    if ! crm_attribute --type=crm_config --name=no-quorum-policy --query | grep value=freeze >/dev/null 2>&1; then
        ocf_log warn "The DLM cluster best practice suggests to set the cluster property \"no-quorum-policy=freeze\""
    fi

    # The arguments are deliberately left unquoted so that they are split
    # into separate words
    "${OCF_RESKEY_daemon}" $OCF_RESKEY_args

    while :
    do
        sleep 1

        controld_monitor
        rc=$?

        if [ "$rc" = "$OCF_SUCCESS" ]; then
            # Ready only once at least one comms address list has content
            if CS_ADDR_LIST="$(cat "${DLM_CONFIGFS_DIR}"/cluster/comms/*/addr_list 2>/dev/null)" \
                && [ -n "$CS_ADDR_LIST" ]; then
                return $OCF_SUCCESS
            fi
        elif [ "$rc" = "$OCF_NOT_RUNNING" ]; then
            return $OCF_NOT_RUNNING
        else
            return $OCF_ERR_GENERIC
        fi

        ocf_log debug "Waiting for ${OCF_RESKEY_daemon} to be ready"
    done
}

# Stop the DLM control daemon.
#
# Sends SIGTERM to ${OCF_RESKEY_daemon} with killall, then polls
# controld_monitor once per second for as long as it keeps reporting success.
#
# Returns:
#   OCF_SUCCESS      the daemon was not running to begin with, or the monitor
#                    reports "not running" after the signal
#   OCF_ERR_GENERIC  killall reported a failure
#   otherwise        the non-success status last reported by controld_monitor
#
# Note: controld_monitor calls check_uncontrolled_locks on any non-success
# result, which may exit the agent instead of returning here.
controld_stop() {
    controld_monitor; rc=$?

    if [ "$rc" -eq "$OCF_NOT_RUNNING" ]; then
        return $OCF_SUCCESS
    fi

    killall -TERM "${OCF_RESKEY_daemon}"; rc=$?

    if [ "$rc" -ne 0 ]; then
        return $OCF_ERR_GENERIC
    fi

    # Wait for the daemon to go away. Sleep only while it is still reported
    # as running, so that no extra second is spent once it is gone.
    while :; do
        controld_monitor; rc=$?
        if [ "$rc" -ne "$OCF_SUCCESS" ]; then
            break
        fi
        sleep 1
    done

    if [ "$rc" -eq "$OCF_NOT_RUNNING" ]; then
        rc=$OCF_SUCCESS
    fi

    return $rc
}

# Report the state of the DLM control daemon.
#
# Probes for a process named ${OCF_RESKEY_daemon} with "killall -0". When one
# exists, dlm_tool output is inspected for conditions treated as failures.
#
# Returns:
#   OCF_SUCCESS      the process exists and no failure condition was found
#   OCF_NOT_RUNNING  killall -0 exited with status 1
#   OCF_ERR_GENERIC  killall -0 exited with any other non-zero status, or
#                    stateful_merge_wait is 1, or (when no stateful_merge_wait
#                    value is reported) "dlm_tool ls" shows "wait fencing"
#                    while the fencing history has no pending entry
#
# For every result other than OCF_SUCCESS, check_uncontrolled_locks is called
# before returning; it may fence the node and exit the agent.
controld_monitor() {
    # Quoted so that the daemon name is always passed as a single argument,
    # as it is in controld_stop
    killall -0 "${OCF_RESKEY_daemon}" >/dev/null 2>&1 ; CM_RC=$?

    case $CM_RC in
      0) smw=$(dlm_tool status -v | grep "stateful_merge_wait=" | cut -d= -f2)
         # Quoted so that an unexpected multi-word value cannot change the
         # shape of the test expression
         if [ -n "$smw" ] && [ "$smw" -eq 1 ]; then
             ocf_log err "DLM status is: stateful_merge_wait"
             CM_RC=$OCF_ERR_GENERIC
         elif [ -z "$smw" ] && dlm_tool ls | grep -q "wait fencing" && \
              ! stonith_admin -H '*' --output-as xml | grep -q "extended-status=\"pending\""; then
             ocf_log err "DLM status is: wait fencing"
             CM_RC=$OCF_ERR_GENERIC
         else
             CM_RC=$OCF_SUCCESS
         fi
         ;;
      1) CM_RC=$OCF_NOT_RUNNING;;
      *) CM_RC=$OCF_ERR_GENERIC;;
    esac

    # If the DLM is not running successfully but DLM lockspace bits are left
    # over, we must fence ourselves.
    if [ "$CM_RC" -ne "$OCF_SUCCESS" ]; then
        check_uncontrolled_locks
    fi

    return $CM_RC
}

# Validate the resource configuration.
#
# Logs an error and exits the agent with OCF_ERR_CONFIGURED when the
# globally-unique meta attribute holds a true-like value. When
# OCF_CHECK_LEVEL is "10", additionally runs check_binary for killall and for
# the configured daemon. Returns OCF_SUCCESS otherwise.
controld_validate() {
    case "${OCF_RESKEY_CRM_meta_globally_unique}" in
        1 | [Yy] | [Yy][Ee][Ss] | [Oo][Nn] | [Tt][Rr][Uu][Ee])
            msg="The globally-unique meta attribute must not be enabled for $OCF_RESOURCE_INSTANCE"
            ocf_log err "$msg"
            exit $OCF_ERR_CONFIGURED
            ;;
    esac

    # Host-specific checks
    case "$OCF_CHECK_LEVEL" in
        10)
            check_binary killall
            check_binary "${OCF_RESKEY_daemon}"
            ;;
    esac

    return $OCF_SUCCESS
}

# Dispatch the requested action. meta-data, usage and help exit with
# OCF_SUCCESS after printing; an unrecognised action prints the usage message
# and exits with OCF_ERR_UNIMPLEMENTED. start and monitor validate the
# configuration before doing their work; stop does not.
case "$__OCF_ACTION" in
    meta-data)
        meta_data
        exit $OCF_SUCCESS
        ;;
    start)
        controld_validate
        controld_start
        ;;
    stop)
        controld_stop
        ;;
    monitor)
        controld_validate
        controld_monitor
        ;;
    validate-all)
        controld_validate
        ;;
    usage | help)
        controld_usage
        exit $OCF_SUCCESS
        ;;
    *)
        controld_usage
        exit $OCF_ERR_UNIMPLEMENTED
        ;;
esac

# Exit with the status of the last command the selected action ran
exit $?

# vim: set filetype=sh expandtab tabstop=4 softtabstop=4 shiftwidth=4 textwidth=80: