Diffstat (limited to 'tests/sentinel')
21 files changed, 1305 insertions, 0 deletions
diff --git a/tests/sentinel/run.tcl b/tests/sentinel/run.tcl
new file mode 100644
index 0000000..98c4c11
--- /dev/null
+++ b/tests/sentinel/run.tcl
@@ -0,0 +1,35 @@
+# Sentinel test suite. Copyright (C) 2014 Salvatore Sanfilippo antirez@gmail.com
+# This software is released under the BSD License. See the COPYING file for
+# more information.
+
+cd tests/sentinel
+source ../instances.tcl
+
+set ::instances_count 5 ; # How many instances we use at max.
+set ::tlsdir "../../tls"
+
+proc main {} {
+    parse_options
+    if {$::leaked_fds_file != ""} {
+        set ::env(LEAKED_FDS_FILE) $::leaked_fds_file
+    }
+    spawn_instance sentinel $::sentinel_base_port $::instances_count {
+        "sentinel deny-scripts-reconfig no"
+        "enable-protected-configs yes"
+        "enable-debug-command yes"
+    } "../tests/includes/sentinel.conf"
+
+    spawn_instance redis $::redis_base_port $::instances_count {
+        "enable-protected-configs yes"
+        "enable-debug-command yes"
+    }
+    run_tests
+    cleanup
+    end_tests
+}
+
+if {[catch main e]} {
+    puts $::errorInfo
+    cleanup
+    exit 1
+}
diff --git a/tests/sentinel/tests/00-base.tcl b/tests/sentinel/tests/00-base.tcl
new file mode 100644
index 0000000..761ee82
--- /dev/null
+++ b/tests/sentinel/tests/00-base.tcl
@@ -0,0 +1,139 @@
+# Check the basic monitoring and failover capabilities.
+source "../tests/includes/start-init-tests.tcl"
+source "../tests/includes/init-tests.tcl"
+
+foreach_sentinel_id id {
+    S $id sentinel debug default-down-after 1000
+}
+
+if {$::simulate_error} {
+    test "This test will fail" {
+        fail "Simulated error"
+    }
+}
+
+test "Sentinel commands sanity check" {
+    foreach_sentinel_id id {
+        assert_equal {72} [llength [S $id command list]]
+        assert_equal {15} [S $id command count]
+    }
+}
+
+test "Basic failover works if the master is down" {
+    set old_port [RPort $master_id]
+    set addr [S 0 SENTINEL GET-MASTER-ADDR-BY-NAME mymaster]
+    assert {[lindex $addr 1] == $old_port}
+    kill_instance redis $master_id
+    foreach_sentinel_id id {
+        S $id sentinel debug ping-period 500
+        S $id sentinel debug ask-period 500
+        wait_for_condition 1000 100 {
+            [lindex [S $id SENTINEL GET-MASTER-ADDR-BY-NAME mymaster] 1] != $old_port
+        } else {
+            fail "At least one Sentinel did not receive failover info"
+        }
+    }
+    restart_instance redis $master_id
+    set addr [S 0 SENTINEL GET-MASTER-ADDR-BY-NAME mymaster]
+    set master_id [get_instance_id_by_port redis [lindex $addr 1]]
+}
+
+test "New master [join $addr {:}] role matches" {
+    assert {[RI $master_id role] eq {master}}
+}
+
+test "All the other slaves now point to the new master" {
+    foreach_redis_id id {
+        if {$id != $master_id && $id != 0} {
+            wait_for_condition 1000 50 {
+                [RI $id master_port] == [lindex $addr 1]
+            } else {
+                fail "Redis ID $id not configured to replicate with new master"
+            }
+        }
+    }
+}
+
+test "The old master eventually gets reconfigured as a slave" {
+    wait_for_condition 1000 50 {
+        [RI 0 master_port] == [lindex $addr 1]
+    } else {
+        fail "Old master not reconfigured as slave of new master"
+    }
+}
+
+test "ODOWN is not possible without N (quorum) Sentinels reports" {
+    foreach_sentinel_id id {
+        S $id SENTINEL SET mymaster quorum [expr $sentinels+1]
+    }
+    set old_port [RPort $master_id]
+    set addr [S 0 SENTINEL GET-MASTER-ADDR-BY-NAME mymaster]
+    assert {[lindex $addr 1] == $old_port}
+    kill_instance redis $master_id
+
+    # Make sure failover did not happen.
+    set addr [S 0 SENTINEL GET-MASTER-ADDR-BY-NAME mymaster]
+    assert {[lindex $addr 1] == $old_port}
+    restart_instance redis $master_id
+}
+
+test "Failover is not possible without majority agreement" {
+    foreach_sentinel_id id {
+        S $id SENTINEL SET mymaster quorum $quorum
+    }
+
+    # Crash majority of sentinels
+    for {set id 0} {$id < $quorum} {incr id} {
+        kill_instance sentinel $id
+    }
+
+    # Kill the current master
+    kill_instance redis $master_id
+
+    # Make sure failover did not happen.
+    set addr [S $quorum SENTINEL GET-MASTER-ADDR-BY-NAME mymaster]
+    assert {[lindex $addr 1] == $old_port}
+    restart_instance redis $master_id
+
+    # Cleanup: restart Sentinels to monitor the master.
+    for {set id 0} {$id < $quorum} {incr id} {
+        restart_instance sentinel $id
+    }
+}
+
+test "Failover works if we configure for absolute agreement" {
+    foreach_sentinel_id id {
+        S $id SENTINEL SET mymaster quorum $sentinels
+    }
+
+    # Wait for Sentinels to monitor the master again
+    foreach_sentinel_id id {
+        wait_for_condition 1000 100 {
+            [dict get [S $id SENTINEL MASTER mymaster] info-refresh] < 100000
+        } else {
+            fail "At least one Sentinel is not monitoring the master"
+        }
+    }
+
+    kill_instance redis $master_id
+
+    foreach_sentinel_id id {
+        wait_for_condition 1000 100 {
+            [lindex [S $id SENTINEL GET-MASTER-ADDR-BY-NAME mymaster] 1] != $old_port
+        } else {
+            fail "At least one Sentinel did not receive failover info"
+        }
+    }
+    restart_instance redis $master_id
+    set addr [S 0 SENTINEL GET-MASTER-ADDR-BY-NAME mymaster]
+    set master_id [get_instance_id_by_port redis [lindex $addr 1]]
+
+    # Set the min ODOWN agreement back to strict majority.
+    foreach_sentinel_id id {
+        S $id SENTINEL SET mymaster quorum $quorum
+    }
+}
+
+test "New master [join $addr {:}] role matches" {
+    assert {[RI $master_id role] eq {master}}
+}
diff --git a/tests/sentinel/tests/01-conf-update.tcl b/tests/sentinel/tests/01-conf-update.tcl
new file mode 100644
index 0000000..5dca556
--- /dev/null
+++ b/tests/sentinel/tests/01-conf-update.tcl
@@ -0,0 +1,39 @@
+# Test Sentinel configuration consistency after partitions heal.
+
+source "../tests/includes/init-tests.tcl"
+
+test "We can failover with Sentinel 1 crashed" {
+    set old_port [RPort $master_id]
+    set addr [S 0 SENTINEL GET-MASTER-ADDR-BY-NAME mymaster]
+    assert {[lindex $addr 1] == $old_port}
+
+    # Crash Sentinel 1
+    kill_instance sentinel 1
+
+    kill_instance redis $master_id
+    foreach_sentinel_id id {
+        if {$id != 1} {
+            wait_for_condition 1000 50 {
+                [lindex [S $id SENTINEL GET-MASTER-ADDR-BY-NAME mymaster] 1] != $old_port
+            } else {
+                fail "Sentinel $id did not receive failover info"
+            }
+        }
+    }
+    restart_instance redis $master_id
+    set addr [S 0 SENTINEL GET-MASTER-ADDR-BY-NAME mymaster]
+    set master_id [get_instance_id_by_port redis [lindex $addr 1]]
+}
+
+test "After Sentinel 1 is restarted, its config gets updated" {
+    restart_instance sentinel 1
+    wait_for_condition 1000 50 {
+        [lindex [S 1 SENTINEL GET-MASTER-ADDR-BY-NAME mymaster] 1] != $old_port
+    } else {
+        fail "Restarted Sentinel did not receive failover info"
+    }
+}
+
+test "New master [join $addr {:}] role matches" {
+    assert {[RI $master_id role] eq {master}}
+}
diff --git a/tests/sentinel/tests/02-slaves-reconf.tcl b/tests/sentinel/tests/02-slaves-reconf.tcl
new file mode 100644
index 0000000..8196b60
--- /dev/null
+++ b/tests/sentinel/tests/02-slaves-reconf.tcl
@@ -0,0 +1,91 @@
+# Check that slaves are reconfigured at a later time if they are partitioned.
+#
+# Here we should test:
+# 1) That slaves point to the new master after failover.
+# 2) That partitioned slaves point to new master when they are partitioned
+#    away during failover and return at a later time.
+
+source "../tests/includes/init-tests.tcl"
+
+proc 02_test_slaves_replication {} {
+    uplevel 1 {
+        test "Check that slaves replicate from current master" {
+            set master_port [RPort $master_id]
+            foreach_redis_id id {
+                if {$id == $master_id} continue
+                if {[instance_is_killed redis $id]} continue
+                wait_for_condition 1000 50 {
+                    ([RI $id master_port] == $master_port) &&
+                    ([RI $id master_link_status] eq {up})
+                } else {
+                    fail "Redis slave $id is replicating from wrong master"
+                }
+            }
+        }
+    }
+}
+
+proc 02_crash_and_failover {} {
+    uplevel 1 {
+        test "Crash the master and force a failover" {
+            set old_port [RPort $master_id]
+            set addr [S 0 SENTINEL GET-MASTER-ADDR-BY-NAME mymaster]
+            assert {[lindex $addr 1] == $old_port}
+            kill_instance redis $master_id
+            foreach_sentinel_id id {
+                wait_for_condition 1000 50 {
+                    [lindex [S $id SENTINEL GET-MASTER-ADDR-BY-NAME mymaster] 1] != $old_port
+                } else {
+                    fail "At least one Sentinel did not receive failover info"
+                }
+            }
+            restart_instance redis $master_id
+            set addr [S 0 SENTINEL GET-MASTER-ADDR-BY-NAME mymaster]
+            set master_id [get_instance_id_by_port redis [lindex $addr 1]]
+        }
+    }
+}
+
+02_test_slaves_replication
+02_crash_and_failover
+
+foreach_sentinel_id id {
+    S $id sentinel debug info-period 100
+    S $id sentinel debug default-down-after 1000
+    S $id sentinel debug publish-period 100
+}
+
+02_test_slaves_replication
+
+test "Kill a slave instance" {
+    foreach_redis_id id {
+        if {$id == $master_id} continue
+        set killed_slave_id $id
+        kill_instance redis $id
+        break
+    }
+}
+
+02_crash_and_failover
+02_test_slaves_replication
+
+test "Wait for failover to end" {
+    set inprogress 1
+    while {$inprogress} {
+        set inprogress 0
+        foreach_sentinel_id id {
+            if {[dict exists [S $id SENTINEL MASTER mymaster] failover-state]} {
+                incr inprogress
+            }
+        }
+        if {$inprogress} {after 100}
+    }
+}
+
+test "Restart killed slave and test replication of slaves again..." {
+    restart_instance redis $killed_slave_id
+}
+
+# Now we check if the slave rejoining the partition is reconfigured even
+# if the failover finished.
+02_test_slaves_replication
diff --git a/tests/sentinel/tests/03-runtime-reconf.tcl b/tests/sentinel/tests/03-runtime-reconf.tcl
new file mode 100644
index 0000000..bd6eecc
--- /dev/null
+++ b/tests/sentinel/tests/03-runtime-reconf.tcl
@@ -0,0 +1,225 @@
+# Test runtime reconfiguration command SENTINEL SET.
+source "../tests/includes/init-tests.tcl" +set num_sentinels [llength $::sentinel_instances] + +set ::user "testuser" +set ::password "secret" + +proc server_set_password {} { + foreach_redis_id id { + assert_equal {OK} [R $id CONFIG SET requirepass $::password] + assert_equal {OK} [R $id AUTH $::password] + assert_equal {OK} [R $id CONFIG SET masterauth $::password] + } +} + +proc server_reset_password {} { + foreach_redis_id id { + assert_equal {OK} [R $id CONFIG SET requirepass ""] + assert_equal {OK} [R $id CONFIG SET masterauth ""] + } +} + +proc server_set_acl {id} { + assert_equal {OK} [R $id ACL SETUSER $::user on >$::password allchannels +@all] + assert_equal {OK} [R $id ACL SETUSER default off] + + R $id CLIENT KILL USER default SKIPME no + assert_equal {OK} [R $id AUTH $::user $::password] + assert_equal {OK} [R $id CONFIG SET masteruser $::user] + assert_equal {OK} [R $id CONFIG SET masterauth $::password] +} + +proc server_reset_acl {id} { + assert_equal {OK} [R $id ACL SETUSER default on] + assert_equal {1} [R $id ACL DELUSER $::user] + + assert_equal {OK} [R $id CONFIG SET masteruser ""] + assert_equal {OK} [R $id CONFIG SET masterauth ""] +} + +proc verify_sentinel_connect_replicas {id} { + foreach replica [S $id SENTINEL REPLICAS mymaster] { + if {[string match "*disconnected*" [dict get $replica flags]]} { + return 0 + } + } + return 1 +} + +proc wait_for_sentinels_connect_servers { {is_connect 1} } { + foreach_sentinel_id id { + wait_for_condition 1000 50 { + [string match "*disconnected*" [dict get [S $id SENTINEL MASTER mymaster] flags]] != $is_connect + } else { + fail "At least some sentinel can't connect to master" + } + + wait_for_condition 1000 50 { + [verify_sentinel_connect_replicas $id] == $is_connect + } else { + fail "At least some sentinel can't connect to replica" + } + } +} + +test "Sentinels (re)connection following SENTINEL SET mymaster auth-pass" { + # 3 types of sentinels to test: + # (re)started while master changed pwd. Manage to connect only after setting pwd + set sent2re 0 + # (up)dated in advance with master new password + set sent2up 1 + # (un)touched. Yet manage to maintain (old) connection + set sent2un 2 + + wait_for_sentinels_connect_servers + kill_instance sentinel $sent2re + server_set_password + assert_equal {OK} [S $sent2up SENTINEL SET mymaster auth-pass $::password] + restart_instance sentinel $sent2re + + # Verify sentinel that restarted failed to connect master + wait_for_condition 100 50 { + [string match "*disconnected*" [dict get [S $sent2re SENTINEL MASTER mymaster] flags]] != 0 + } else { + fail "Expected to be disconnected from master due to wrong password" + } + + # Update restarted sentinel with master password + assert_equal {OK} [S $sent2re SENTINEL SET mymaster auth-pass $::password] + + # All sentinels expected to connect successfully + wait_for_sentinels_connect_servers + + # remove requirepass and verify sentinels manage to connect servers + server_reset_password + wait_for_sentinels_connect_servers + # Sanity check + verify_sentinel_auto_discovery +} + +test "Sentinels (re)connection following master ACL change" { + # Three types of sentinels to test during ACL change: + # 1. (re)started Sentinel. Manage to connect only after setting new pwd + # 2. (up)dated Sentinel, get just before ACL change the new password + # 3. 
(un)touched Sentinel that kept old connection with master and didn't + # set new ACL password won't persist ACL pwd change (unlike legacy auth-pass) + set sent2re 0 + set sent2up 1 + set sent2un 2 + + wait_for_sentinels_connect_servers + # kill sentinel 'sent2re' and restart it after ACL change + kill_instance sentinel $sent2re + + # Update sentinel 'sent2up' with new user and pwd + assert_equal {OK} [S $sent2up SENTINEL SET mymaster auth-user $::user] + assert_equal {OK} [S $sent2up SENTINEL SET mymaster auth-pass $::password] + + foreach_redis_id id { + server_set_acl $id + } + + restart_instance sentinel $sent2re + + # Verify sentinel that restarted failed to reconnect master + wait_for_condition 100 50 { + [string match "*disconnected*" [dict get [S $sent2re SENTINEL MASTER mymaster] flags]] != 0 + } else { + fail "Expected: Restarted sentinel to be disconnected from master due to obsolete password" + } + + # Verify sentinel with updated password managed to connect (wait for sentinelTimer to reconnect) + wait_for_condition 100 50 { + [string match "*disconnected*" [dict get [S $sent2up SENTINEL MASTER mymaster] flags]] == 0 + } else { + fail "Expected: Sentinel to be connected to master" + } + + # Verify sentinel untouched gets failed to connect master + wait_for_condition 100 50 { + [string match "*disconnected*" [dict get [S $sent2un SENTINEL MASTER mymaster] flags]] != 0 + } else { + fail "Expected: Sentinel to be disconnected from master due to obsolete password" + } + + # Now update all sentinels with new password + foreach_sentinel_id id { + assert_equal {OK} [S $id SENTINEL SET mymaster auth-user $::user] + assert_equal {OK} [S $id SENTINEL SET mymaster auth-pass $::password] + } + + # All sentinels expected to connect successfully + wait_for_sentinels_connect_servers + + # remove requirepass and verify sentinels manage to connect servers + foreach_redis_id id { + server_reset_acl $id + } + + wait_for_sentinels_connect_servers + # Sanity check + verify_sentinel_auto_discovery +} + +test "Set parameters in normal case" { + + set info [S 0 SENTINEL master mymaster] + set origin_quorum [dict get $info quorum] + set origin_down_after_milliseconds [dict get $info down-after-milliseconds] + set update_quorum [expr $origin_quorum+1] + set update_down_after_milliseconds [expr $origin_down_after_milliseconds+1000] + + assert_equal [S 0 SENTINEL SET mymaster quorum $update_quorum] "OK" + assert_equal [S 0 SENTINEL SET mymaster down-after-milliseconds $update_down_after_milliseconds] "OK" + + set update_info [S 0 SENTINEL master mymaster] + assert {[dict get $update_info quorum] != $origin_quorum} + assert {[dict get $update_info down-after-milliseconds] != $origin_down_after_milliseconds} + + #restore to origin config parameters + assert_equal [S 0 SENTINEL SET mymaster quorum $origin_quorum] "OK" + assert_equal [S 0 SENTINEL SET mymaster down-after-milliseconds $origin_down_after_milliseconds] "OK" +} + +test "Set parameters in normal case with bad format" { + + set info [S 0 SENTINEL master mymaster] + set origin_down_after_milliseconds [dict get $info down-after-milliseconds] + + assert_error "ERR Invalid argument '-20' for SENTINEL SET 'down-after-milliseconds'*" {S 0 SENTINEL SET mymaster down-after-milliseconds -20} + assert_error "ERR Invalid argument 'abc' for SENTINEL SET 'down-after-milliseconds'*" {S 0 SENTINEL SET mymaster down-after-milliseconds "abc"} + + set current_info [S 0 SENTINEL master mymaster] + assert {[dict get $current_info down-after-milliseconds] == 
$origin_down_after_milliseconds} +} + +test "Sentinel Set with other error situations" { + + # non-existing script + assert_error "ERR Notification script seems non existing*" {S 0 SENTINEL SET mymaster notification-script test.txt} + + # wrong parameter number + assert_error "ERR wrong number of arguments for 'sentinel|set' command" {S 0 SENTINEL SET mymaster fakeoption} + + # unknown parameter option + assert_error "ERR Unknown option or number of arguments for SENTINEL SET 'fakeoption'" {S 0 SENTINEL SET mymaster fakeoption fakevalue} + + # save new config to disk failed + set info [S 0 SENTINEL master mymaster] + set origin_quorum [dict get $info quorum] + set update_quorum [expr $origin_quorum+1] + set sentinel_id 0 + set configfilename [file join "sentinel_$sentinel_id" "sentinel.conf"] + set configfilename_bak [file join "sentinel_$sentinel_id" "sentinel.conf.bak"] + + file rename $configfilename $configfilename_bak + file mkdir $configfilename + + catch {[S 0 SENTINEL SET mymaster quorum $update_quorum]} err + + file delete $configfilename + file rename $configfilename_bak $configfilename + + assert_match "ERR Failed to save config file*" $err +} diff --git a/tests/sentinel/tests/04-slave-selection.tcl b/tests/sentinel/tests/04-slave-selection.tcl new file mode 100644 index 0000000..3d2ca64 --- /dev/null +++ b/tests/sentinel/tests/04-slave-selection.tcl @@ -0,0 +1,5 @@ +# Test slave selection algorithm. +# +# This unit should test: +# 1) That when there are no suitable slaves no failover is performed. +# 2) That among the available slaves, the one with better offset is picked. diff --git a/tests/sentinel/tests/05-manual.tcl b/tests/sentinel/tests/05-manual.tcl new file mode 100644 index 0000000..a0004eb --- /dev/null +++ b/tests/sentinel/tests/05-manual.tcl @@ -0,0 +1,50 @@ +# Test manual failover + +source "../tests/includes/init-tests.tcl" + +foreach_sentinel_id id { + S $id sentinel debug info-period 2000 + S $id sentinel debug default-down-after 6000 + S $id sentinel debug publish-period 1000 +} + +test "Manual failover works" { + set old_port [RPort $master_id] + set addr [S 0 SENTINEL GET-MASTER-ADDR-BY-NAME mymaster] + assert {[lindex $addr 1] == $old_port} + catch {S 0 SENTINEL FAILOVER mymaster} reply + assert {$reply eq "OK"} + foreach_sentinel_id id { + wait_for_condition 1000 50 { + [lindex [S $id SENTINEL GET-MASTER-ADDR-BY-NAME mymaster] 1] != $old_port + } else { + fail "At least one Sentinel did not receive failover info" + } + } + set addr [S 0 SENTINEL GET-MASTER-ADDR-BY-NAME mymaster] + set master_id [get_instance_id_by_port redis [lindex $addr 1]] +} + +test "New master [join $addr {:}] role matches" { + assert {[RI $master_id role] eq {master}} +} + +test "All the other slaves now point to the new master" { + foreach_redis_id id { + if {$id != $master_id && $id != 0} { + wait_for_condition 1000 50 { + [RI $id master_port] == [lindex $addr 1] + } else { + fail "Redis ID $id not configured to replicate with new master" + } + } + } +} + +test "The old master eventually gets reconfigured as a slave" { + wait_for_condition 1000 50 { + [RI 0 master_port] == [lindex $addr 1] + } else { + fail "Old master not reconfigured as slave of new master" + } +} diff --git a/tests/sentinel/tests/06-ckquorum.tcl b/tests/sentinel/tests/06-ckquorum.tcl new file mode 100644 index 0000000..36c3dc6 --- /dev/null +++ b/tests/sentinel/tests/06-ckquorum.tcl @@ -0,0 +1,42 @@ +# Test for the SENTINEL CKQUORUM command + +source "../tests/includes/init-tests.tcl" +set num_sentinels [llength 
$::sentinel_instances] + +test "CKQUORUM reports OK and the right amount of Sentinels" { + foreach_sentinel_id id { + assert_match "*OK $num_sentinels usable*" [S $id SENTINEL CKQUORUM mymaster] + } +} + +test "CKQUORUM detects quorum cannot be reached" { + set orig_quorum [expr {$num_sentinels/2+1}] + S 0 SENTINEL SET mymaster quorum [expr {$num_sentinels+1}] + catch {[S 0 SENTINEL CKQUORUM mymaster]} err + assert_match "*NOQUORUM*" $err + S 0 SENTINEL SET mymaster quorum $orig_quorum +} + +test "CKQUORUM detects failover authorization cannot be reached" { + set orig_quorum [expr {$num_sentinels/2+1}] + S 0 SENTINEL SET mymaster quorum 1 + for {set i 0} {$i < $orig_quorum} {incr i} { + kill_instance sentinel [expr {$i + 1}] + } + + # We need to make sure that other sentinels are in `DOWN` state + # from the point of view of S 0 before we executing `CKQUORUM`. + wait_for_condition 300 50 { + [catch {S 0 SENTINEL CKQUORUM mymaster}] == 1 + } else { + fail "At least $orig_quorum sentinels did not enter the down state." + } + + assert_error "*NOQUORUM*" {S 0 SENTINEL CKQUORUM mymaster} + + S 0 SENTINEL SET mymaster quorum $orig_quorum + for {set i 0} {$i < $orig_quorum} {incr i} { + restart_instance sentinel [expr {$i + 1}] + } +} + diff --git a/tests/sentinel/tests/07-down-conditions.tcl b/tests/sentinel/tests/07-down-conditions.tcl new file mode 100644 index 0000000..bb24d6d --- /dev/null +++ b/tests/sentinel/tests/07-down-conditions.tcl @@ -0,0 +1,102 @@ +# Test conditions where an instance is considered to be down + +source "../tests/includes/init-tests.tcl" +source "../../../tests/support/cli.tcl" + +foreach_sentinel_id id { + S $id sentinel debug info-period 1000 + S $id sentinel debug ask-period 100 + S $id sentinel debug default-down-after 3000 + S $id sentinel debug publish-period 200 + S $id sentinel debug ping-period 100 +} + +set ::alive_sentinel [expr {$::instances_count/2+2}] +proc ensure_master_up {} { + S $::alive_sentinel sentinel debug info-period 1000 + S $::alive_sentinel sentinel debug ping-period 100 + S $::alive_sentinel sentinel debug ask-period 100 + S $::alive_sentinel sentinel debug publish-period 100 + wait_for_condition 1000 50 { + [dict get [S $::alive_sentinel sentinel master mymaster] flags] eq "master" + } else { + fail "Master flags are not just 'master'" + } +} + +proc ensure_master_down {} { + S $::alive_sentinel sentinel debug info-period 1000 + S $::alive_sentinel sentinel debug ping-period 100 + S $::alive_sentinel sentinel debug ask-period 100 + S $::alive_sentinel sentinel debug publish-period 100 + wait_for_condition 1000 50 { + [string match *down* \ + [dict get [S $::alive_sentinel sentinel master mymaster] flags]] + } else { + fail "Master is not flagged SDOWN" + } +} + +test "Crash the majority of Sentinels to prevent failovers for this unit" { + for {set id 0} {$id < $quorum} {incr id} { + kill_instance sentinel $id + } +} + +test "SDOWN is triggered by non-responding but not crashed instance" { + ensure_master_up + set master_addr [S $::alive_sentinel SENTINEL GET-MASTER-ADDR-BY-NAME mymaster] + set master_id [get_instance_id_by_port redis [lindex $master_addr 1]] + + set pid [get_instance_attrib redis $master_id pid] + exec kill -SIGSTOP $pid + ensure_master_down + exec kill -SIGCONT $pid + ensure_master_up +} + +test "SDOWN is triggered by crashed instance" { + lassign [S $::alive_sentinel SENTINEL GET-MASTER-ADDR-BY-NAME mymaster] host port + ensure_master_up + kill_instance redis 0 + ensure_master_down + restart_instance redis 0 + 
ensure_master_up +} + +test "SDOWN is triggered by masters advertising as slaves" { + ensure_master_up + R 0 slaveof 127.0.0.1 34567 + ensure_master_down + R 0 slaveof no one + ensure_master_up +} + +test "SDOWN is triggered by misconfigured instance replying with errors" { + ensure_master_up + set orig_dir [lindex [R 0 config get dir] 1] + set orig_save [lindex [R 0 config get save] 1] + # Set dir to / and filename to "tmp" to make sure it will fail. + R 0 config set dir / + R 0 config set dbfilename tmp + R 0 config set save "1000000 1000000" + after 5000 + R 0 bgsave + after 5000 + ensure_master_down + R 0 config set save $orig_save + R 0 config set dir $orig_dir + R 0 config set dbfilename dump.rdb + R 0 bgsave + ensure_master_up +} + +# We use this test setup to also test command renaming, as a side +# effect of the master going down if we send PONG instead of PING +test "SDOWN is triggered if we rename PING to PONG" { + ensure_master_up + S $::alive_sentinel SENTINEL SET mymaster rename-command PING PONG + ensure_master_down + S $::alive_sentinel SENTINEL SET mymaster rename-command PING PING + ensure_master_up +} diff --git a/tests/sentinel/tests/08-hostname-conf.tcl b/tests/sentinel/tests/08-hostname-conf.tcl new file mode 100644 index 0000000..263b06f --- /dev/null +++ b/tests/sentinel/tests/08-hostname-conf.tcl @@ -0,0 +1,69 @@ +source "../tests/includes/utils.tcl" + +proc set_redis_announce_ip {addr} { + foreach_redis_id id { + R $id config set replica-announce-ip $addr + } +} + +proc set_sentinel_config {keyword value} { + foreach_sentinel_id id { + S $id sentinel config set $keyword $value + } +} + +proc set_all_instances_hostname {hostname} { + foreach_sentinel_id id { + set_instance_attrib sentinel $id host $hostname + } + foreach_redis_id id { + set_instance_attrib redis $id host $hostname + } +} + +test "(pre-init) Configure instances and sentinel for hostname use" { + set ::host "localhost" + restart_killed_instances + set_all_instances_hostname $::host + set_redis_announce_ip $::host + set_sentinel_config resolve-hostnames yes + set_sentinel_config announce-hostnames yes +} + +source "../tests/includes/init-tests.tcl" + +proc verify_hostname_announced {hostname} { + foreach_sentinel_id id { + # Master is reported with its hostname + if {![string equal [lindex [S $id SENTINEL GET-MASTER-ADDR-BY-NAME mymaster] 0] $hostname]} { + return 0 + } + + # Replicas are reported with their hostnames + foreach replica [S $id SENTINEL REPLICAS mymaster] { + if {![string equal [dict get $replica ip] $hostname]} { + return 0 + } + } + } + return 1 +} + +test "Sentinel announces hostnames" { + # Check initial state + verify_hostname_announced $::host + + # Disable announce-hostnames and confirm IPs are used + set_sentinel_config announce-hostnames no + assert {[verify_hostname_announced "127.0.0.1"] || [verify_hostname_announced "::1"]} +} + +# We need to revert any special configuration because all tests currently +# share the same instances. +test "(post-cleanup) Configure instances and sentinel for IPs" { + set ::host "127.0.0.1" + set_all_instances_hostname $::host + set_redis_announce_ip $::host + set_sentinel_config resolve-hostnames no + set_sentinel_config announce-hostnames no +}
\ No newline at end of file
diff --git a/tests/sentinel/tests/09-acl-support.tcl b/tests/sentinel/tests/09-acl-support.tcl
new file mode 100644
index 0000000..a754dac
--- /dev/null
+++ b/tests/sentinel/tests/09-acl-support.tcl
@@ -0,0 +1,56 @@
+
+source "../tests/includes/init-tests.tcl"
+
+set ::user "testuser"
+set ::password "secret"
+
+proc setup_acl {} {
+    foreach_sentinel_id id {
+        assert_equal {OK} [S $id ACL SETUSER $::user >$::password +@all on]
+        assert_equal {OK} [S $id ACL SETUSER default off]
+
+        S $id CLIENT KILL USER default SKIPME no
+        assert_equal {OK} [S $id AUTH $::user $::password]
+    }
+}
+
+proc teardown_acl {} {
+    foreach_sentinel_id id {
+        assert_equal {OK} [S $id ACL SETUSER default on]
+        assert_equal {1} [S $id ACL DELUSER $::user]
+
+        S $id SENTINEL CONFIG SET sentinel-user ""
+        S $id SENTINEL CONFIG SET sentinel-pass ""
+    }
+}
+
+test "(post-init) Set up ACL configuration" {
+    setup_acl
+    assert_equal $::user [S 1 ACL WHOAMI]
+}
+
+test "SENTINEL CONFIG SET handles on-the-fly credentials reconfiguration" {
+    # Make sure we're starting with a broken state...
+    wait_for_condition 200 50 {
+        [catch {S 1 SENTINEL CKQUORUM mymaster}] == 1
+    } else {
+        fail "Expected: Sentinel to be disconnected from master due to wrong password"
+    }
+    assert_error "*NOQUORUM*" {S 1 SENTINEL CKQUORUM mymaster}
+
+    foreach_sentinel_id id {
+        assert_equal {OK} [S $id SENTINEL CONFIG SET sentinel-user $::user]
+        assert_equal {OK} [S $id SENTINEL CONFIG SET sentinel-pass $::password]
+    }
+
+    wait_for_condition 200 50 {
+        [catch {S 1 SENTINEL CKQUORUM mymaster}] == 0
+    } else {
+        fail "Expected: Sentinel to be connected to master after setting password"
+    }
+    assert_match {*OK*} [S 1 SENTINEL CKQUORUM mymaster]
+}
+
+test "(post-cleanup) Tear down ACL configuration" {
+    teardown_acl
+}
diff --git a/tests/sentinel/tests/10-replica-priority.tcl b/tests/sentinel/tests/10-replica-priority.tcl
new file mode 100644
index 0000000..d3f868a
--- /dev/null
+++ b/tests/sentinel/tests/10-replica-priority.tcl
@@ -0,0 +1,76 @@
+source "../tests/includes/init-tests.tcl"
+
+test "Check acceptable replica-priority values" {
+    foreach_redis_id id {
+        if {$id == $master_id} continue
+
+        # ensure replica-announced accepts yes and no
+        catch {R $id CONFIG SET replica-announced no} e
+        if {$e ne "OK"} {
+            fail "Unable to set replica-announced to no"
+        }
+        catch {R $id CONFIG SET replica-announced yes} e
+        if {$e ne "OK"} {
+            fail "Unable to set replica-announced to yes"
+        }
+
+        # ensure a random value throws an error
+        catch {R $id CONFIG SET replica-announced 321} e
+        if {$e eq "OK"} {
+            fail "Able to set replica-announced with something else than yes or no (321) whereas it should not be possible"
+        }
+        catch {R $id CONFIG SET replica-announced a3b2c1} e
+        if {$e eq "OK"} {
+            fail "Able to set replica-announced with something else than yes or no (a3b2c1) whereas it should not be possible"
+        }
+
+        # test only the first redis replica, no need to double test
+        break
+    }
+}
+
+proc 10_test_number_of_replicas {n_replicas_expected} {
+    test "Check sentinel replies with $n_replicas_expected replicas" {
+        # ensure sentinels reply with the right number of replicas
+        foreach_sentinel_id id {
+            S $id sentinel debug info-period 100
+            S $id sentinel debug default-down-after 1000
+            S $id sentinel debug publish-period 100
+            set len [llength [S $id SENTINEL REPLICAS mymaster]]
+            wait_for_condition 200 100 {
+                [llength [S $id SENTINEL REPLICAS mymaster]] == $n_replicas_expected
+            } else {
+                fail "Sentinel replies with a wrong number of replicas with replica-announced=yes (expected $n_replicas_expected but got $len) on sentinel $id"
+            }
+        }
+    }
+}
+
+proc 10_set_replica_announced {master_id announced n_replicas} {
+    test "Set replica-announced=$announced on $n_replicas replicas" {
+        set i 0
+        foreach_redis_id id {
+            if {$id == $master_id} continue
+            #puts "set replica-announce=$announced on redis #$id"
+            R $id CONFIG SET replica-announced "$announced"
+            incr i
+            if { $n_replicas!="all" && $i >= $n_replicas } { break }
+        }
+    }
+}
+
+# ensure all replicas are announced
+10_set_replica_announced $master_id "yes" "all"
+# ensure all replicas are announced by sentinels
+10_test_number_of_replicas 4
+
+# ensure the first 2 replicas are not announced
+10_set_replica_announced $master_id "no" 2
+# ensure sentinels are not announcing the first 2 replicas that have been set unannounced
+10_test_number_of_replicas 2
+
+# ensure all replicas are announced
+10_set_replica_announced $master_id "yes" "all"
+# ensure all replicas are announced by sentinels again
+10_test_number_of_replicas 4
+
diff --git a/tests/sentinel/tests/11-port-0.tcl b/tests/sentinel/tests/11-port-0.tcl
new file mode 100644
index 0000000..a3e8bdb
--- /dev/null
+++ b/tests/sentinel/tests/11-port-0.tcl
@@ -0,0 +1,33 @@
+source "../tests/includes/init-tests.tcl"
+
+test "Start/Stop sentinel on same port with a different runID should not change the total number of sentinels" {
+    set sentinel_id [expr $::instances_count-1]
+    # Kill sentinel instance
+    kill_instance sentinel $sentinel_id
+
+    # Delete line with myid in sentinels config file
+    set orgfilename [file join "sentinel_$sentinel_id" "sentinel.conf"]
+    set tmpfilename "sentinel.conf_tmp"
+    set dirname "sentinel_$sentinel_id"
+
+    delete_lines_with_pattern $orgfilename $tmpfilename "myid"
+
+    # Get count of total sentinels
+    set a [S 0 SENTINEL master mymaster]
+    set original_count [lindex $a 33]
+
+    # Restart sentinel with the modified config file
+    set pid [exec_instance "sentinel" $dirname $orgfilename]
+    lappend ::pids $pid
+
+    after 1000
+
+    # Get new count of total sentinels
+    set b [S 0 SENTINEL master mymaster]
+    set curr_count [lindex $b 33]
+
+    # If the count is not the same then fail the test
+    if {$original_count != $curr_count} {
+        fail "Sentinel count is incorrect, original count being $original_count and current count is $curr_count"
+    }
+}
diff --git a/tests/sentinel/tests/12-master-reboot.tcl b/tests/sentinel/tests/12-master-reboot.tcl
new file mode 100644
index 0000000..1fdd91d
--- /dev/null
+++ b/tests/sentinel/tests/12-master-reboot.tcl
@@ -0,0 +1,103 @@
+# Check the basic monitoring and failover capabilities.
+source "../tests/includes/init-tests.tcl"
+
+
+if {$::simulate_error} {
+    test "This test will fail" {
+        fail "Simulated error"
+    }
+}
+
+
+# Reboot an instance in a very short time, but do not check if it is loading
+proc reboot_instance {type id} {
+    set dirname "${type}_${id}"
+    set cfgfile [file join $dirname $type.conf]
+    set port [get_instance_attrib $type $id port]
+
+    # Execute the instance with its old setup and append the new pid
+    # file for cleanup.
+    set pid [exec_instance $type $dirname $cfgfile]
+    set_instance_attrib $type $id pid $pid
+    lappend ::pids $pid
+
+    # Check that the instance is running
+    if {[server_is_up 127.0.0.1 $port 100] == 0} {
+        set logfile [file join $dirname log.txt]
+        puts [exec tail $logfile]
+        abort_sentinel_test "Problems starting $type #$id: ping timeout, maybe server start failed, check $logfile"
+    }
+
+    # Connect with it with a fresh link
+    set link [redis 127.0.0.1 $port 0 $::tls]
+    $link reconnect 1
+    set_instance_attrib $type $id link $link
+}
+
+
+test "Master reboot in very short time" {
+    set old_port [RPort $master_id]
+    set addr [S 0 SENTINEL GET-MASTER-ADDR-BY-NAME mymaster]
+    assert {[lindex $addr 1] == $old_port}
+
+    R $master_id debug populate 10000
+    R $master_id bgsave
+    R $master_id config set key-load-delay 1500
+    R $master_id config set loading-process-events-interval-bytes 1024
+    R $master_id config rewrite
+
+    foreach_sentinel_id id {
+        S $id SENTINEL SET mymaster master-reboot-down-after-period 5000
+        S $id sentinel debug ping-period 500
+        S $id sentinel debug ask-period 500
+    }
+
+    kill_instance redis $master_id
+    reboot_instance redis $master_id
+
+    foreach_sentinel_id id {
+        wait_for_condition 1000 100 {
+            [lindex [S $id SENTINEL GET-MASTER-ADDR-BY-NAME mymaster] 1] != $old_port
+        } else {
+            fail "At least one Sentinel did not receive failover info"
+        }
+    }
+
+    set addr [S 0 SENTINEL GET-MASTER-ADDR-BY-NAME mymaster]
+    set master_id [get_instance_id_by_port redis [lindex $addr 1]]
+
+    # Make sure the instance loads the whole dataset
+    while 1 {
+        catch {[$link ping]} retval
+        if {[string match {*LOADING*} $retval]} {
+            after 100
+            continue
+        } else {
+            break
+        }
+    }
+}
+
+test "New master [join $addr {:}] role matches" {
+    assert {[RI $master_id role] eq {master}}
+}
+
+test "All the other slaves now point to the new master" {
+    foreach_redis_id id {
+        if {$id != $master_id && $id != 0} {
+            wait_for_condition 1000 50 {
+                [RI $id master_port] == [lindex $addr 1]
+            } else {
+                fail "Redis ID $id not configured to replicate with new master"
+            }
+        }
+    }
+}
+
+test "The old master eventually gets reconfigured as a slave" {
+    wait_for_condition 1000 50 {
+        [RI 0 master_port] == [lindex $addr 1]
+    } else {
+        fail "Old master not reconfigured as slave of new master"
+    }
+}
\ No newline at end of file
diff --git a/tests/sentinel/tests/13-info-command.tcl b/tests/sentinel/tests/13-info-command.tcl
new file mode 100644
index 0000000..ef9dc01
--- /dev/null
+++ b/tests/sentinel/tests/13-info-command.tcl
@@ -0,0 +1,47 @@
+source "../tests/includes/init-tests.tcl"
+
+test "info command with at most one argument" {
+    set subCommandList {}
+    foreach arg {"" "all" "default" "everything"} {
+        if {$arg == ""} {
+            set info [S 0 info]
+        } else {
+            set info [S 0 info $arg]
+        }
+        assert { [string match "*redis_version*" $info] }
+        assert { [string match "*maxclients*" $info] }
+        assert { [string match "*used_cpu_user*" $info] }
+        assert { [string match "*sentinel_tilt*" $info] }
+        assert { ![string match "*used_memory*" $info] }
+        assert { ![string match "*rdb_last_bgsave*" $info] }
+        assert { ![string match "*master_repl_offset*" $info] }
+        assert { ![string match "*cluster_enabled*" $info] }
+    }
+}
+
+test "info command with one sub-section" {
+    set info [S 0 info cpu]
+    assert { [string match "*used_cpu_user*" $info] }
+    assert { ![string match "*sentinel_tilt*" $info] }
+    assert { ![string match "*redis_version*" $info] }
+
+    set info [S 0 info sentinel]
+    assert { [string match "*sentinel_tilt*" $info] }
+    assert { ![string match "*used_cpu_user*" $info] }
+    assert { ![string match "*redis_version*" $info] }
+}
+
+test "info command with multiple sub-sections" {
+    set info [S 0 info server sentinel replication]
+    assert { [string match "*redis_version*" $info] }
+    assert { [string match "*sentinel_tilt*" $info] }
+    assert { ![string match "*used_memory*" $info] }
+    assert { ![string match "*used_cpu_user*" $info] }
+
+    set info [S 0 info cpu all]
+    assert { [string match "*used_cpu_user*" $info] }
+    assert { [string match "*sentinel_tilt*" $info] }
+    assert { [string match "*redis_version*" $info] }
+    assert { ![string match "*used_memory*" $info] }
+    assert { ![string match "*master_repl_offset*" $info] }
+}
diff --git a/tests/sentinel/tests/helpers/check_leaked_fds.tcl b/tests/sentinel/tests/helpers/check_leaked_fds.tcl
new file mode 100755
index 0000000..482b3e0
--- /dev/null
+++ b/tests/sentinel/tests/helpers/check_leaked_fds.tcl
@@ -0,0 +1,79 @@
+#!/usr/bin/env tclsh
+#
+# This script detects file descriptors that have leaked from a parent process.
+#
+# Our goal is to detect file descriptors that were opened by the parent and
+# not cleaned up prior to exec(), but not file descriptors that were inherited
+# from the grandparent which the parent knows nothing about. To do that, we
+# look up every potential leak and try to match it against open files by the
+# grandparent process.
+
+# Get PID of parent process
+proc get_parent_pid {_pid} {
+    set fd [open "/proc/$_pid/status" "r"]
+    set content [read $fd]
+    close $fd
+
+    if {[regexp {\nPPid:\s+(\d+)} $content _ ppid]} {
+        return $ppid
+    }
+
+    error "failed to get parent pid"
+}
+
+# Read symlink to get info about the specified fd of the specified process.
+# The result can be the file name or an arbitrary string that identifies it.
+# When not able to read, an empty string is returned.
+proc get_fdlink {_pid fd} {
+    if { [catch {set fdlink [file readlink "/proc/$_pid/fd/$fd"]} err] } {
+        return ""
+    }
+    return $fdlink
+}
+
+# Linux only
+set os [exec uname]
+if {$os != "Linux"} {
+    puts "Only Linux is supported."
+    exit 0
+}
+
+if {![info exists env(LEAKED_FDS_FILE)]} {
+    puts "Missing LEAKED_FDS_FILE environment variable."
+    exit 0
+}
+
+set outfile $::env(LEAKED_FDS_FILE)
+set parent_pid [get_parent_pid [pid]]
+set grandparent_pid [get_parent_pid $parent_pid]
+set leaked_fds {}
+
+# Look for fds that were directly inherited from our parent but not from
+# our grandparent (tcl)
+foreach fd [glob -tails -directory "/proc/self/fd" *] {
+    # Ignore stdin/stdout/stderr
+    if {$fd == 0 || $fd == 1 || $fd == 2} {
+        continue
+    }
+
+    set fdlink [get_fdlink "self" $fd]
+    if {$fdlink == ""} {
+        continue
+    }
+
+    # We ignore fds that existed in the grandparent, or fds that don't exist
+    # in our parent (Sentinel process).
+    if {[get_fdlink $grandparent_pid $fd] == $fdlink ||
+        [get_fdlink $parent_pid $fd] != $fdlink} {
+        continue
+    }
+
+    lappend leaked_fds [list $fd $fdlink]
+}
+
+# Produce report only if we found leaks
+if {[llength $leaked_fds] > 0} {
+    set fd [open $outfile "w"]
+    puts $fd [join $leaked_fds "\n"]
+    close $fd
+}
diff --git a/tests/sentinel/tests/includes/init-tests.tcl b/tests/sentinel/tests/includes/init-tests.tcl
new file mode 100644
index 0000000..ddb1319
--- /dev/null
+++ b/tests/sentinel/tests/includes/init-tests.tcl
@@ -0,0 +1,63 @@
+# Initialization tests -- most units will start including this.
+source "../tests/includes/utils.tcl"
+
+test "(init) Restart killed instances" {
+    restart_killed_instances
+}
+
+test "(init) Remove old master entry from sentinels" {
+    foreach_sentinel_id id {
+        catch {S $id SENTINEL REMOVE mymaster}
+    }
+}
+
+set redis_slaves [expr $::instances_count - 1]
+test "(init) Create a master-slaves cluster of [expr $redis_slaves+1] instances" {
+    create_redis_master_slave_cluster [expr {$redis_slaves+1}]
+}
+set master_id 0
+
+test "(init) Sentinels can start monitoring a master" {
+    set sentinels [llength $::sentinel_instances]
+    set quorum [expr {$sentinels/2+1}]
+    foreach_sentinel_id id {
+        S $id SENTINEL MONITOR mymaster \
+            [get_instance_attrib redis $master_id host] \
+            [get_instance_attrib redis $master_id port] $quorum
+    }
+    foreach_sentinel_id id {
+        assert {[S $id sentinel master mymaster] ne {}}
+        S $id SENTINEL SET mymaster down-after-milliseconds 2000
+        S $id SENTINEL SET mymaster failover-timeout 10000
+        S $id SENTINEL debug tilt-period 5000
+        S $id SENTINEL SET mymaster parallel-syncs 10
+        if {$::leaked_fds_file != "" && [exec uname] == "Linux"} {
+            S $id SENTINEL SET mymaster notification-script ../../tests/helpers/check_leaked_fds.tcl
+            S $id SENTINEL SET mymaster client-reconfig-script ../../tests/helpers/check_leaked_fds.tcl
+        }
+    }
+}
+
+test "(init) Sentinels can talk with the master" {
+    foreach_sentinel_id id {
+        wait_for_condition 1000 50 {
+            [catch {S $id SENTINEL GET-MASTER-ADDR-BY-NAME mymaster}] == 0
+        } else {
+            fail "Sentinel $id can't talk with the master."
+        }
+    }
+}
+
+test "(init) Sentinels are able to auto-discover other sentinels" {
+    verify_sentinel_auto_discovery
+}
+
+test "(init) Sentinels are able to auto-discover slaves" {
+    foreach_sentinel_id id {
+        wait_for_condition 1000 50 {
+            [dict get [S $id SENTINEL MASTER mymaster] num-slaves] == $redis_slaves
+        } else {
+            fail "At least some sentinel can't detect some slave"
+        }
+    }
+}
diff --git a/tests/sentinel/tests/includes/sentinel.conf b/tests/sentinel/tests/includes/sentinel.conf
new file mode 100644
index 0000000..1275236
--- /dev/null
+++ b/tests/sentinel/tests/includes/sentinel.conf
@@ -0,0 +1,9 @@
+# assume master is down after being unresponsive for 20s
+sentinel down-after-milliseconds setmaster 20000
+# reconfigure one slave at a time
+sentinel parallel-syncs setmaster 2
+# wait for 4m before assuming failover went wrong
+sentinel failover-timeout setmaster 240000
+# monitoring set
+sentinel monitor setmaster 10.0.0.1 30000 2
+
diff --git a/tests/sentinel/tests/includes/start-init-tests.tcl b/tests/sentinel/tests/includes/start-init-tests.tcl
new file mode 100644
index 0000000..b052350
--- /dev/null
+++ b/tests/sentinel/tests/includes/start-init-tests.tcl
@@ -0,0 +1,18 @@
+test "(start-init) Flush config and compare rewrite config file lines" {
+    foreach_sentinel_id id {
+        assert_match "OK" [S $id SENTINEL FLUSHCONFIG]
+        set file1 ../tests/includes/sentinel.conf
+        set file2 [file join "sentinel_${id}" "sentinel.conf"]
+        set fh1 [open $file1 r]
+        set fh2 [open $file2 r]
+        while {[gets $fh1 line1]} {
+            if {[gets $fh2 line2]} {
+                assert [string equal $line1 $line2]
+            } else {
+                fail "sentinel config file rewrite sequence changed"
+            }
+        }
+        close $fh1
+        close $fh2
+    }
+}
\ No newline at end of file
diff --git a/tests/sentinel/tests/includes/utils.tcl b/tests/sentinel/tests/includes/utils.tcl
new file mode 100644
index 0000000..adfd91c
--- /dev/null
+++ b/tests/sentinel/tests/includes/utils.tcl
@@ -0,0 +1,22 @@
+proc restart_killed_instances {} {
+    foreach type {redis sentinel} {
+        foreach_${type}_id id {
+            if {[get_instance_attrib $type $id pid] == -1} {
+                puts -nonewline "$type/$id "
+                flush stdout
+                restart_instance $type $id
+            }
+        }
+    }
+}
+
+proc verify_sentinel_auto_discovery {} {
+    set sentinels [llength $::sentinel_instances]
+    foreach_sentinel_id id {
+        wait_for_condition 1000 50 {
+            [dict get [S $id SENTINEL MASTER mymaster] num-other-sentinels] == ($sentinels-1)
+        } else {
+            fail "At least some sentinel can't detect some other sentinel"
+        }
+    }
+}
diff --git a/tests/sentinel/tmp/.gitignore b/tests/sentinel/tmp/.gitignore
new file mode 100644
index 0000000..f581f73
--- /dev/null
+++ b/tests/sentinel/tmp/.gitignore
@@ -0,0 +1,2 @@
+redis_*
+sentinel_*