summaryrefslogtreecommitdiffstats
path: root/heartbeat/db2
blob: 95447ab6cbf7d1c469d63892d4be46954af77060 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
#!/bin/sh
#
# db2
#
# Resource agent that manages a DB2 LUW database in Standard role 
# or HADR configuration in promotable configuration.
# Multi partition is supported as well.
#
# Copyright (c) 2011 Holger Teutsch <holger.teutsch@web.de>
#
# This agent incorporates code of a previous release created by
# Alan Robertson and the community.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like.  Any license provided herein, whether implied or
# otherwise, applies only to this software file.  Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#

#######################################################################
# Initialization:

# Pull in the OCF shell function library. OCF_FUNCTIONS_DIR is only given
# this default when it is unset ("=" form, not ":="), so a value supplied
# by the environment wins.
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

# Parameter defaults

OCF_RESKEY_instance_default=""
OCF_RESKEY_admin_default=""
OCF_RESKEY_dbpartitionnum_default="0"

# Apply the defaults to every resource parameter that is unset.
# Note: because dbpartitionnum is defaulted to "0" here, later tests of the
# form [ -z "$OCF_RESKEY_dbpartitionnum" ] are only true when the parameter
# was explicitly configured as an empty string.
: ${OCF_RESKEY_instance=${OCF_RESKEY_instance_default}}
: ${OCF_RESKEY_admin=${OCF_RESKEY_admin_default}}
: ${OCF_RESKEY_dbpartitionnum=${OCF_RESKEY_dbpartitionnum_default}}

#######################################################################


#
# Print the one-line synopsis of the actions this agent accepts.
#
db2_usage() {
    printf '%s\n' "db2 start|stop|monitor|promote|demote|notify|validate-all|meta-data"
}

db2_meta_data() {
# Emit the resource agent meta-data XML on stdout.
# The here-document delimiter is unquoted, so the
# ${OCF_RESKEY_*_default} references inside the XML are expanded by the
# shell with the defaults defined at the top of this file.
cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="db2" version="1.0">
<version>1.0</version>
<longdesc lang="en">
Resource Agent that manages an IBM DB2 LUW databases in Standard role as primitive or in HADR roles in promotable configuration. Multiple partitions are supported.

Standard mode:

An instance including all or selected databases is made highly available.
Configure each partition as a separate primitive resource.

HADR mode:

A single database in HADR configuration is made highly available by automating takeover operations.
Configure a promotable resource with notifications enabled and an
additional monitoring operation with role "Promoted".

In case of HADR be very deliberate in specifying intervals/timeouts. The detection of a failure including promote must complete within HADR_PEER_WINDOW.

In addition to honoring requirements for crash recovery etc. for your specific database use the following relations as guidance:

"monitor interval" &lt; HADR_PEER_WINDOW - (appr 30 sec)

"promote timeout" &lt; HADR_PEER_WINDOW + (appr 20 sec)

For further information and examples consult http://www.linux-ha.org/wiki/db2_(resource_agent)
</longdesc>
<shortdesc lang="en">Resource Agent that manages an IBM DB2 LUW databases in Standard role as primitive or in HADR roles as promotable configuration. Multiple partitions are supported.</shortdesc>

<parameters>
<parameter name="instance" unique="1" required="1">
<longdesc lang="en">
The instance of the database(s).
</longdesc>
<shortdesc lang="en">instance</shortdesc>
<content type="string" default="${OCF_RESKEY_instance_default}" />
</parameter>
<parameter name="dblist" unique="0" required="0">
<longdesc lang="en">
List of databases to be managed, e.g "db1 db2".
Defaults to all databases in the instance. Specify one db for HADR mode.
</longdesc>
<shortdesc lang="en">List of databases to be managed</shortdesc>
<content type="string"/>
</parameter>
<parameter name="admin" unique="0" required="0">
<longdesc lang="en">
DEPRECATED: The admin user of the instance.
</longdesc>
<shortdesc lang="en">DEPRECATED: admin</shortdesc>
<content type="string" default="${OCF_RESKEY_admin_default}" />
</parameter>
<parameter name="dbpartitionnum" unique="0" required="0">
<longdesc lang="en">
The number of the partition (DBPARTITIONNUM) to be managed.
</longdesc>
<shortdesc lang="en">database partition number (DBPARTITIONNUM)</shortdesc>
<content type="string" default="${OCF_RESKEY_dbpartitionnum_default}" />
</parameter>
</parameters>

<actions>
<action name="start" timeout="120s"/>
<action name="stop" timeout="120s"/>
<action name="promote" timeout="120s"/>
<action name="demote" timeout="120s"/>
<action name="notify" timeout="10s"/>
<action name="monitor" depth="0" timeout="60s" interval="20s"/>
<action name="monitor" depth="0" timeout="60s" role="Promoted" interval="22s"/>
<action name="validate-all" timeout="5s"/>
<action name="meta-data" timeout="5s"/>
</actions>
</resource-agent>
END
}

#
# validate
# .. and set global variables
#
# exit on error
#
db2_validate() {
    # Validate the resource configuration and publish the globals the other
    # actions rely on: instance, db2node, db2sql, db2profile, db2bin,
    # STATE_FILE and (except for "stop") dblist.
    #
    # Every validation failure terminates the agent with `exit`; callers do
    # not inspect the return value, so returning an error code would let the
    # requested action run against an invalid configuration.
    # Returns $OCF_SUCCESS when the configuration is usable.
    #
    # db2sql is intentionally NOT local: db2_start and db2_stop_bg test for
    # $db2sql/db2nodes.cfg and must see the path computed here.
    local db2home db2instance

    # db2 uses korn shell
    check_binary "ksh"

    # check required instance vars
    if [ -z "$OCF_RESKEY_instance" ]
    then
        ocf_log err "DB2 required parameter instance is not set!"
        exit $OCF_ERR_CONFIGURED
    fi

    instance=$OCF_RESKEY_instance
    if [ -n "$OCF_RESKEY_admin" ]
    then
        ocf_log warn "DB2 deprecated parameter admin is set, using $OCF_RESKEY_admin as instance."
        instance=$OCF_RESKEY_admin
    fi

    db2node=${OCF_RESKEY_dbpartitionnum:-0}

    # let a shell resolve the instance owner's home directory (~user)
    db2home=$(sh -c "echo ~$instance")
    db2sql=$db2home/sqllib
    db2profile=$db2sql/db2profile
    db2bin=$db2sql/bin

    STATE_FILE=${HA_RSCTMP}/db2-${OCF_RESOURCE_INSTANCE}.state

    #	Let's make sure a few important things are there...
    if ! [ -d "$db2sql" -a  -d "$db2bin" -a -f "$db2profile" -a \
           -x "$db2profile" -a -x "$db2bin/db2" ]
    then
        # a probe on a node without DB2 simply reports "not running"
        ocf_is_probe && exit $OCF_NOT_RUNNING
        ocf_log err "DB2 required directories and/or files not found"
        exit $OCF_ERR_INSTALLED
    fi

    # the profile must select the very instance we were asked to manage
    db2instance=$(runasdb2 'echo $DB2INSTANCE')
    if [ "$db2instance" != "$instance" ]
    then
        ocf_is_probe && exit $OCF_NOT_RUNNING
        ocf_log err "DB2 parameter instance \"$instance\" != DB2INSTANCE \"$db2instance\""
        exit $OCF_ERR_CONFIGURED
    fi

    # enough checking for stop to succeed
    [ "$__OCF_ACTION" = stop ] && return $OCF_SUCCESS

    dblist=$OCF_RESKEY_dblist
    if [ -n "$dblist" ]
    then
        # support , as separator as well
        dblist=$(echo "$dblist" | sed -e 's/[,]/ /g')
    else
        if ! dblist=$(db2_dblist)
        then
            ocf_log err "DB2 $instance($db2node): cannot retrieve db directory"
            exit $OCF_ERR_INSTALLED
        fi
    fi

    # check requirements for the HADR case
    if ocf_is_ms
    then
        # count the words in dblist via the positional parameters
        set -- $dblist
        if [ $# != 1 ]
        then
            ocf_log err "DB2 resource $OCF_RESOURCE_INSTANCE must have exactly one name in dblist"
            exit $OCF_ERR_CONFIGURED
        fi

        if [ "$db2node" != 0 ]
        then
            ocf_log err "DB2 resource $OCF_RESOURCE_INSTANCE must have dbpartitionnum=0"
            exit $OCF_ERR_CONFIGURED
        fi
    fi

    return $OCF_SUCCESS
}

#
# Thin wrapper around crm_master: forwards all arguments unchanged.
# Silently does nothing when the crm_master binary is not available.
#
master_score()
{
    if ! have_binary "crm_master"; then
        return
    fi

    # "$@" keeps each argument intact; an unquoted $* would re-split
    # arguments containing whitespace and expand glob characters.
    crm_master "$@"
}

#
# Run the given command as db2 instance user
#
runasdb2() {
    # All arguments are joined into one command line that is executed by
    # the instance user's shell after sourcing the DB2 profile.
    local cmdline="$*"

    su $instance -c ". $db2profile; $cmdline"
}

#
# Run a command as the DB2 admin, and log the output
#
logasdb2() {
    # Runs the command via runasdb2, logs its captured output at "info"
    # level on success and at "err" level on failure, and hands the
    # command's exit status back to the caller.
    local output rc severity

    output=$(runasdb2 $*)
    rc=$?

    severity=err
    [ $rc -eq 0 ] && severity=info

    ocf_log $severity "$output"
    return $rc
}


#
# maintain the fal (first active log) attribute
# db2_fal_attrib DB {set val|get}
#
db2_fal_attrib() {
    # Usage: db2_fal_attrib DB set VALUE
    #        db2_fal_attrib DB get
    #
    # set: store VALUE in the node attribute db2hadr_<instance>_<DB>_fal on
    #      every *other* cluster member. Returns 0 when all updates
    #      succeeded, otherwise the exit status of the first failing
    #      crm_attribute call (remaining nodes are skipped).
    # get: print the attribute (stderr merged into stdout) and return the
    #      exit status of crm_attribute.
    # Any other sub-command terminates the agent with $OCF_ERR_CONFIGURED.
    local db=$1
    local attr rc id node member me

    attr=db2hadr_${instance}_${db}_fal

    case "$2" in
        set)
        me=$(ocf_local_nodename)

        # loop over all member nodes and set attribute.
        # The loop sits on the receiving end of a pipeline and therefore
        # runs in a subshell: variables assigned there are lost. Use an
        # explicit subshell and carry the result out via its exit status.
        crm_node -l |
        (
            rc=0
            while read id node member
            do
                [ "$member" = member -a "$node" != "$me" ] || continue
                crm_attribute -l forever --node=$node -n $attr -v "$3"
                rc=$?
                ocf_log info "DB2 instance $instance($db2node/$db: setting attrib for FAL to $3 @ $node"
                [ $rc != 0 ] && break
            done
            exit $rc
        )
        rc=$?
        ;;

        get)
        crm_attribute -l forever -n $attr -G --quiet 2>&1
        rc=$?
        # a failing lookup is only worth a warning when notifications are
        # not enabled for this resource
        if ! ocf_is_true "$OCF_RESKEY_CRM_meta_notify" && [ $rc != 0 ]
        then
            ocf_log warn "DB2 instance $instance($db2node/$db: can't retrieve attribute $attr, are you sure notifications are enabled ?"
        fi
        ;;

        *)
        exit $OCF_ERR_CONFIGURED
    esac

    return $rc
}

#
# unfortunately a first connect after a crash may need several minutes
# for some internal cleanup stuff in DB2.
# We run a connect in background so other connects (i.e. monitoring!) may proceed.
#
db2_run_connect() {
    # $1: database name. Connects once and immediately terminates the
    # connection again; the outcome is logged by logasdb2.
    logasdb2 "db2 connect to $1; db2 terminate"
}

#
# get some data from the database config
# sets HADR_ROLE HADR_TIMEOUT HADR_PEER_WINDOW
#
db2_get_cfg() {
    # $1: database name.
    # Runs "db2 get db cfg for <db>" as the instance user and extracts the
    # last field of the matching lines into the globals HADR_ROLE,
    # HADR_TIMEOUT, FIRST_ACTIVE_LOG and HADR_PEER_WINDOW.
    # Returns $OCF_ERR_GENERIC when the command fails or when HADR_ROLE or
    # HADR_TIMEOUT could not be determined, $OCF_SUCCESS otherwise.
    local db=$1

    local output hadr_vars

    output=$(runasdb2 db2 get db cfg for $db)
    [ $? != 0 ] && return $OCF_ERR_GENERIC

    # Human readable summary of the parsed values; it is only used in the
    # error message below. The single quotes around %s close and re-open
    # the shell quoting, so awk sees the format strings without them.
    hadr_vars=$(echo "$output" |
        awk '/HADR database role/ {printf "HADR_ROLE='%s'; ", $NF;}
            /HADR_TIMEOUT/ {printf "HADR_TIMEOUT='%s'; ", $NF;}
            /First active log file/ {printf "FIRST_ACTIVE_LOG='%s'\n", $NF;}
            /HADR_PEER_WINDOW/ {printf "HADR_PEER_WINDOW='%s'\n", $NF;}')

    # sets HADR_ROLE HADR_TIMEOUT FIRST_ACTIVE_LOG HADR_PEER_WINDOW
    HADR_ROLE=$(echo "$output" | awk '/HADR database role/ {print $NF;}')
    HADR_TIMEOUT=$(echo "$output" | awk '/HADR_TIMEOUT/ {print $NF;}')
    FIRST_ACTIVE_LOG=$(echo "$output" | awk '/First active log file/ {print $NF;}')
    HADR_PEER_WINDOW=$(echo "$output" | awk '/HADR_PEER_WINDOW/ {print $NF;}')

    # HADR_PEER_WINDOW comes with V9 and is checked later
    if [ -z "$HADR_ROLE" -o -z "$HADR_TIMEOUT" ]
    then
        ocf_log error "DB2 cfg values invalid for $instance($db2node)/$db: $hadr_vars"
        return $OCF_ERR_GENERIC
    fi

    return $OCF_SUCCESS
}

#
# return the list of databases in the instance
#
db2_dblist() {
    # Prints one database name per line, taken from the "Database name"
    # entries of "db2 list database directory".
    # Returns $OCF_ERR_GENERIC if the directory cannot be listed.
    local output

    if ! output=$(runasdb2 db2 list database directory)
    then
        return $OCF_ERR_GENERIC
    fi

    # keep only the "Database name ... = X" lines, then strip up to "= "
    echo "$output" |
        grep -i 'Database name.*=' |
        sed 's%.*= *%%'
}

#
# Delayed check of the compatibility of DB2 instance and pacemaker
# config.
# Logically this belongs to validate but certain parameters can only
# be retrieved once the instance is started.
#
db2_check_config_compatibility() {
    # $1: database name. Uses the global HADR_ROLE / HADR_PEER_WINDOW
    # values and the result of ocf_is_ms to cross-check the database
    # configuration against the cluster resource type.
    # Exits with $OCF_ERR_INSTALLED for a promotable resource whose
    # database is Standard or lacks HADR_PEER_WINDOW; a HADR database in a
    # non-promotable resource is only logged.
    local db=$1
    local is_ms

    ocf_is_ms
    is_ms=$?

    case "$is_ms" in
        0)
        # promotable (M/S) resource: the database must be HADR enabled
        if [ "$HADR_ROLE" = STANDARD ]
        then
            ocf_log err "DB2 database $instance/$db is not in a HADR configuration but I am a M/S resource"
            exit $OCF_ERR_INSTALLED
        elif [ -z "$HADR_PEER_WINDOW" ]
        then
            ocf_log err "DB2 database $instance: release to old, need HADR_PEER_WINDOW (>=V9)"
            exit $OCF_ERR_INSTALLED
        fi
        ;;

        1)
        # primitive resource: a Standard database is the expected case
        if [ "$HADR_ROLE" != STANDARD ]
        then
            ocf_log err "DB2 database $instance/$db is in a HADR configuration but I must be a M/S resource"
        fi
        ;;
    esac

}

#
# Start instance and DB.
# Standard mode is through "db2 activate" in order to start in previous
# mode (Standby/Primary).
# If the database is a primary AND we can determine that the running master
# has a higher "first active log" we conclude that we come up after a crash
# and the previous Standby is now Primary.
# The db is then started as Standby.
#
# Other cases: danger of split brain, log error and do nothing.
#
db2_start() {
    # Start the instance partition $db2node and, on partition 0 only,
    # activate every database in $dblist.
    # Returns $OCF_SUCCESS, or $OCF_ERR_GENERIC (or the status of
    # db2_get_cfg) on failure. On success the state file is set to SLAVE.
    local output start_cmd db
    local start_opts="dbpartitionnum $db2node"

    # If we detect that db partitions are not in use, and no
    # partition is explicitly specified, activate without
    # partition information. This allows db2 instances without
    # partition support to be managed.
    # NOTE(review): OCF_RESKEY_dbpartitionnum is defaulted to "0" at the top
    # of this file when unset, so -z only holds for an explicitly empty
    # value. Also confirm that db2_validate publishes $db2sql as a global;
    # if it is local there, this test looks at /db2nodes.cfg instead.
    if [ -z "$OCF_RESKEY_dbpartitionnum" ] && ! [ -e "$db2sql/db2nodes.cfg" ]; then
        start_opts=""
    fi

    if output=$(runasdb2 db2start $start_opts)
    then
        ocf_log info "DB2 instance $instance($db2node) started: $output"
    else
        case $output in
            # db2start failed with SQL1026N: treated as "already running"
            *SQL1026N*)
            ocf_log info "DB2 instance $instance($db2node) already running: $output"
            ;;

            *)
            ocf_log err "$output"
            return $OCF_ERR_GENERIC
        esac
    fi

    if ! db2_instance_status
    then
        ocf_log err "DB2 instance $instance($db2node) is not active!"
        return $OCF_ERR_GENERIC
    fi

    [ $db2node = 0 ] || return $OCF_SUCCESS
    # activate DB only on node 0

    for db in $dblist
    do
        # sets HADR_ROLE HADR_TIMEOUT HADR_PEER_WINDOW FIRST_ACTIVE_LOG
        db2_get_cfg $db || return $?

        # Better late than never: can only check this when the instance is already up
        db2_check_config_compatibility $db

        start_cmd="db2 activate db $db"

        if [ $HADR_ROLE = PRIMARY ]
        then
            local master_fal

            # communicate our FAL to other nodes the might start concurrently
            db2_fal_attrib $db set $FIRST_ACTIVE_LOG

            # The '>' is quoted so the test performs a string comparison of
            # the two log file names instead of a redirection.
            # ignore false positive:
            # error: Can't use > in [ ]. Escape it or use [[..]]. [SC2073]
            # see https://github.com/koalaman/shellcheck/issues/691
            # shellcheck disable=SC2073
            if master_fal=$(db2_fal_attrib $db get) && [ "$master_fal" '>' $FIRST_ACTIVE_LOG ]
            then
                ocf_log info "DB2 database $instance($db2node)/$db is Primary and outdated, starting as secondary"
                start_cmd="db2 start hadr on db $db as standby"
                HADR_ROLE=STANDBY
            fi
        fi

        if output=$(runasdb2 $start_cmd)
        then
            ocf_log info "DB2 database $instance($db2node)/$db started/activated"
            # first connect may be slow (see db2_run_connect), so it is
            # run in the background and only for non-standby databases
            [ $HADR_ROLE != STANDBY ] && db2_run_connect $db &
        else
            # the patterns below are anchored at the start of the output
            case $output in
                SQL1490W*|SQL1494W*|SQL1497W*|SQL1777N*)
                ocf_log info "DB2 database $instance($db2node)/$db already activated: $output"
                ;;

                SQL1768N*"Reason code = \"7\""*)
                ocf_log err "DB2 database $instance($db2node)/$db is a Primary and the Standby is down"
                ocf_log err "Possible split brain ! Manual intervention required."
                ocf_log err "If this DB is outdated use \"db2 start hadr on db $db as standby\""
                ocf_log err "If this DB is the surviving primary use \"db2 start hadr on db $db as primary by force\""

                # might be the Standby is not yet there
                # might be a timing problem because "First active log" is delayed
                #    on the next start attempt we might succeed when FAL was advanced
                # might be manual intervention is required
                # ... so let pacemaker give it another try and we will succeed then
                return $OCF_ERR_GENERIC
                ;;

                *)
                ocf_log err "DB2 database $instance($db2node)/$db didn't start: $output"
                return $OCF_ERR_GENERIC
            esac
        fi
    done

    # come here with success
    # Even if we are a db2 Primary pacemaker requires start to end up in slave mode
    echo SLAVE > $STATE_FILE
    return $OCF_SUCCESS
}

#
# helper function to be spawned
# so we can detect a hang of the db2stop command
#
db2_stop_bg() {
    # Issue "db2stop force" for this partition and translate the outcome:
    # success, or a failure mentioning SQL1032N, yields $OCF_SUCCESS;
    # every other failure is logged and yields $OCF_ERR_GENERIC.
    local output
    local stop_opts=""

    # partition arguments are passed unless the parameter is empty and no
    # db2nodes.cfg exists
    if [ -n "$OCF_RESKEY_dbpartitionnum" ] || [ -e "$db2sql/db2nodes.cfg" ]; then
        stop_opts="dbpartitionnum $db2node"
    fi

    if output=$(runasdb2 db2stop force $stop_opts)
    then
        ocf_log info "DB2 instance $instance($db2node) stopped: $output"
        return $OCF_SUCCESS
    fi

    case $output in
        *SQL1032N*)
        #SQL1032N  No start database manager command was issued
        ocf_log info "$output"
        return $OCF_SUCCESS
        ;;
    esac

    ocf_log err "DB2 instance $instance($db2node) stop failed: $output"
    return $OCF_ERR_GENERIC
}

#
# Stop the given db2 database instance
#
db2_stop() {
    # Stop the instance partition. db2stop is run in the background so a
    # hang can be detected; if it fails, hangs, or leaves processes behind
    # the instance is killed and we wait until no process is left.
    # Always returns $OCF_SUCCESS (or never returns and is timed out by the
    # caller).
    #
    # stoprc is kept local like the other working variables so it does not
    # leak into the global scope of the agent.
    local stop_timeout grace_timeout stop_bg_pid i must_kill stoprc

    # remove master score
    master_score -D -l reboot

    # be very early here in order to avoid stale data
    rm -f $STATE_FILE

    db2_instance_status
    if [ $? -eq $OCF_NOT_RUNNING ]; then
        ocf_log info "DB2 instance $instance already stopped"
        return $OCF_SUCCESS
    fi

    stop_timeout=${OCF_RESKEY_CRM_meta_timeout:-20000}

    # grace_time is 4/5 (unit is ms)
    # i.e. milliseconds / 1250 = 0.8 * seconds
    grace_timeout=$((stop_timeout/1250))

    # start db2stop in background as this may hang
    db2_stop_bg &
    stop_bg_pid=$!

    # wait for grace_timeout
    i=0
    while [ $i -lt $grace_timeout ]
    do
        # kill -0 only probes whether the background job still exists
        kill -0 $stop_bg_pid 2>/dev/null || break;
        sleep 1
        i=$((i+1))
    done

    # collect exit status but don't hang
    if kill -0 $stop_bg_pid 2>/dev/null
    then
        # still running after the grace period: treat as failure
        stoprc=1
        kill -9 $stop_bg_pid 2>/dev/null
    else
        wait $stop_bg_pid
        stoprc=$?
    fi

    must_kill=0

    if [ $stoprc -ne 0 ]
    then
        ocf_log warn "DB2 instance $instance($db2node): db2stop failed, using db2nkill"
        must_kill=1
    elif ! db2_instance_dead
    then
        ocf_log warn "DB2 instance $instance($db2node): db2stop indicated success but there a still processes, using db2nkill"
        must_kill=1
    fi

    if [ $must_kill -eq 1 ]
    then
        # db2nkill kills *all* partitions on the node
        if [ -x $db2bin/db2nkill ]
        then
            logasdb2 $db2bin/db2nkill $db2node
        elif [ -x $db2bin/db2_kill ]
        then
            logasdb2 $db2bin/db2_kill
        fi

        # loop forever (or lrmd kills us due to timeout) until the
        # instance is dead
        while ! db2_instance_dead
        do
            ocf_log info "DB2 instance $instance($db2node): waiting for processes to exit"
            sleep 1
        done

        ocf_log info "DB2 instance $instance($db2node) is now dead"
    fi

    return $OCF_SUCCESS
}

#
# check whether 'enough' processes for a healthy instance are up
# 
db2_instance_status() {
    # Counts the db2* entries reported by db2nps for this partition:
    #   4 or more -> $OCF_SUCCESS
    #   1 to 3    -> $OCF_ERR_GENERIC
    #   none      -> $OCF_NOT_RUNNING
    local pscount

    pscount=$(runasdb2 $db2bin/db2nps $db2node | cut -c9- |  grep ' db2[^ ]' | wc -l)

    [ $pscount -ge 4 ] && return $OCF_SUCCESS
    [ $pscount -ge 1 ] && return $OCF_ERR_GENERIC

    return $OCF_NOT_RUNNING
}

#
# is the given db2 instance dead?
# 
db2_instance_dead() {
    # Succeeds (status 0) only when db2nps reports no db2* process at all
    # for this partition.
    local pscount

    pscount=$(runasdb2 $db2bin/db2nps $db2node | cut -c9- |  grep ' db2[^ ]' | wc -l)

    [ $pscount -eq 0 ]
}

#
# return the status of the db as "Role/Status"
# e.g. Primary/Peer, Standby/RemoteCatchupPending
#
# If not in HADR configuration return "Standard/Standalone"
#
db2_hadr_status() {
    # $1: database name.
    # Prints the HADR status derived from "db2pd -hadr -db <db>".
    # If db2pd fails, prints "Down/Off" and returns 1.
    local db=$1
    local output

    if ! output=$(runasdb2 db2pd -hadr -db $db)
    then
        echo "Down/Off"
        return 1 
    fi

    # The awk program understands both db2pd layouts it has rules for:
    # "HADR_ROLE = / HADR_STATE = / HADR_CONNECT_STATUS =" key/value lines
    # and a "Role State" header followed by a value line.
    echo "$output" |
    awk '/^\s+HADR_(ROLE|STATE) =/ {printf $3"/"}
         /^\s+HADR_CONNECT_STATUS =/ {print $3; exit; }
         /^HADR is not active/ {print "Standard/Standalone"; exit; }
         /^Role *State */ {getline; printf "%s/%s\n", $1, $2; exit; }'
}

#
# Monitor the db
# And as side effect set crm_master / FAL attribute
#
db2_monitor() {
    # Check the instance and, on partition 0, every database in $dblist.
    # Exits with the status of db2_instance_status when the instance is not
    # healthy. Otherwise returns $OCF_ERR_GENERIC on a failed database
    # check, $OCF_RUNNING_MASTER when the state file contains MASTER, and
    # $OCF_SUCCESS in all remaining cases.
    local CMD output hadr db
    local rc

    db2_instance_status
    rc=$?
    if [ $rc -ne $OCF_SUCCESS ]; then
        # instance is dead remove master score
        master_score -D -l reboot
        exit $rc
    fi

    [ $db2node = 0 ] || return 0
    # monitoring only for partition 0

    for db in $dblist
    do
        hadr=$(db2_hadr_status $db) || return $OCF_ERR_GENERIC
        ocf_log debug "Monitor: DB2 database $instance($db2node)/$db has HADR status $hadr"

        # set master preference accordingly
        case "$hadr" in
            PRIMARY/*|Primary/*|Standard/*)
            # perform  a basic health check
            # The command text is executed by the instance user's shell
            # (see runasdb2); \$ and \* are escaped so they reach that
            # shell instead of being interpreted here.
            CMD="if db2 connect to $db;
            then 
                db2 select \* from sysibm.sysversions ; rc=\$?;
                db2 terminate;
            else
                rc=\$?;
            fi;
            exit \$rc"

            if ! output=$(runasdb2 $CMD)
            then
                case "$output" in
                    SQL1776N*)
                    # can't connect/select on standby, may be spurious turing takeover
                    ;;

                    *)
                    ocf_log err "DB2 database $instance($db2node)/$db is not working"
                    ocf_log err "DB2 message: $output"

                    # dead primary, remove master score
                    master_score -D -l reboot
                    return $OCF_ERR_GENERIC
                esac
            fi

            ocf_log debug "DB2 database $instance($db2node)/$db appears to be working"
            # only a promotable resource advertises a promotion score
            ocf_is_ms && master_score -v 10000 -l reboot
            ;;

            # standby in peer state: promotable, but scored below a primary
            STANDBY/*PEER/*|Standby/*Peer)
            master_score -v 8000 -l reboot
            ;;

            STANDBY/*|Standby/*)
            ocf_log warn "DB2 database $instance($db2node)/$db in status $hadr can never be promoted"
            master_score -D -l reboot
            ;;

            *)
            return $OCF_ERR_GENERIC
        esac
    done

    # everything OK, return if running as slave
    grep MASTER $STATE_FILE >/dev/null 2>&1 || return $OCF_SUCCESS

    return $OCF_RUNNING_MASTER
}

#
# Promote db to Primary
#
db2_promote() {
    # Promote the (single) HADR database to Primary.
    # Returns $OCF_SUCCESS when the database is, or has become, Primary and
    # $OCF_ERR_GENERIC otherwise. On success the state file is set to
    # MASTER.
    #
    # validate ensured that dblist contains only one entry
    local db=$dblist
    local i hadr output force

    # we run this twice as after a crash of the other node
    # within HADR_TIMEOUT the status may be still reported as Peer
    # although a connection no longer exists

    for i in 1 2
    do
        # The force clause must reflect the status seen in *this* attempt;
        # without the reset a clause chosen in the first attempt would be
        # silently reused for the second one.
        force=""

        hadr=$(db2_hadr_status $db) || return $OCF_ERR_GENERIC
        ocf_log info "DB2 database $instance($db2node)/$db has HADR status $hadr and will be promoted"

        case "$hadr" in
            Standard/Standalone)
            # this case only to keep ocf-tester happy
            return $OCF_SUCCESS
            ;;

            PRIMARY/PEER/*|PRIMARY/REMOTE_CATCHUP/*|PRIMARY/REMOTE_CATCHUP_PENDING/CONNECTED|Primary/Peer)
            # nothing to do, only update pacemaker's view
            echo MASTER > $STATE_FILE
            return $OCF_SUCCESS
            ;;

            STANDBY/PEER/CONNECTED|Standby/Peer)
            # must take over
            ;;

            STANDBY/*PEER/DISCONNECTED|Standby/DisconnectedPeer)
            # must take over by force peer window only
            force="by force peer window only"
            ;;

            # must take over by force
            STANDBY/REMOTE_CATCHUP_PENDING/DISCONNECTED)
            force="by force"
            ;;

            *)
            return $OCF_ERR_GENERIC
        esac

        # $force is left unquoted on purpose: it expands to zero or more
        # separate words of the takeover command
        if output=$(runasdb2 db2 takeover hadr on db $db $force)
        then
            # update pacemaker's view
            echo MASTER > $STATE_FILE

            # turn the log so we rapidly get a new FAL
            logasdb2 "db2 archive log for db $db"
            return $OCF_SUCCESS
        fi

        case "$output" in
            SQL1770N*"Reason code = \"7\""*)
            # expected, HADR_TIMEOUT is now expired
            # go for the second try
            continue
            ;;

            *)
            ocf_log err "DB2 database $instance($db2node)/$db promote failed: $output"
            return $OCF_ERR_GENERIC
        esac
    done

    return $OCF_ERR_GENERIC
}

#
# Demote db to standby
#
db2_demote() {
    # Record the SLAVE state for this resource, log the current HADR
    # status and report the result of a regular monitor run.
    # Returns $OCF_ERR_GENERIC when the HADR status cannot be determined.
    #
    # validate ensured that dblist contains only one entry
    local hadr
    local db=$dblist

    # house keeping, set pacemaker's view to slave
    echo SLAVE > $STATE_FILE

    if ! hadr=$(db2_hadr_status $dblist)
    then
        return $OCF_ERR_GENERIC
    fi

    ocf_log info "DB2 database $instance($db2node)/$db has HADR status $hadr and will be demoted"

    # the status of db2_monitor is the status of the demote
    db2_monitor
}

#
# handle pre start notification
# We record our first active log on the other nodes.
# If two primaries come up after a crash they can safely determine who is
# the outdated one.
#
db2_notify() {
    # Handle a notification. Everything except "pre start" is ignored and
    # reported as success. For "pre start" the first active log of the
    # database is published to the other nodes via db2_fal_attrib.
    # Returns $OCF_SUCCESS for ignored notifications, the status of
    # db2_get_cfg or $OCF_ERR_GENERIC on failure, and exits with
    # $OCF_SUCCESS after the attribute was set.

    # only interested in pre-start
    # (the variables are quoted so that an unset value compares as an
    # empty string instead of breaking the test expression)
    [ "$OCF_RESKEY_CRM_meta_notify_type" = pre ] &&
    [ "$OCF_RESKEY_CRM_meta_notify_operation" = start ] || return $OCF_SUCCESS

    # gets FIRST_ACTIVE_LOG
    db2_get_cfg $dblist || return $?

    db2_fal_attrib $dblist set $FIRST_ACTIVE_LOG || return $OCF_ERR_GENERIC
    exit $OCF_SUCCESS
}

########
# Main #
########
# Dispatch on the requested OCF action. Every branch terminates the agent
# with an explicit exit status.
case "$__OCF_ACTION" in
    meta-data)
    db2_meta_data
    exit $OCF_SUCCESS
    ;;

    usage)
    db2_usage
    exit $OCF_SUCCESS
    ;;

    start)
    # a successful start is confirmed by an immediate monitor run
    db2_validate
    db2_start || exit $?
    db2_monitor
    exit $?
    ;;

    stop|promote|demote|notify|monitor)
    # these actions map 1:1 onto a db2_<action> function
    db2_validate
    db2_${__OCF_ACTION}
    exit $?
    ;;

    validate-all)
    db2_validate
    exit $?
    ;;

    *)
    db2_usage
    exit $OCF_ERR_UNIMPLEMENTED
    ;;
esac