From bb12c1fd00eb51118749bbbc69c5596835fcbd3b Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 4 May 2024 19:31:02 +0200 Subject: Adding upstream version 5:7.0.15. Signed-off-by: Daniel Baumann --- tests/cluster/cluster.tcl | 288 +++++++++++++++++++++ tests/cluster/run.tcl | 31 +++ tests/cluster/tests/00-base.tcl | 95 +++++++ tests/cluster/tests/01-faildet.tcl | 38 +++ tests/cluster/tests/02-failover.tcl | 65 +++++ tests/cluster/tests/03-failover-loop.tcl | 117 +++++++++ tests/cluster/tests/04-resharding.tcl | 196 ++++++++++++++ tests/cluster/tests/05-slave-selection.tcl | 179 +++++++++++++ tests/cluster/tests/06-slave-stop-cond.tcl | 77 ++++++ tests/cluster/tests/07-replica-migration.tcl | 103 ++++++++ tests/cluster/tests/08-update-msg.tcl | 90 +++++++ tests/cluster/tests/09-pubsub.tcl | 40 +++ tests/cluster/tests/10-manual-failover.tcl | 192 ++++++++++++++ tests/cluster/tests/11-manual-takeover.tcl | 71 +++++ tests/cluster/tests/12-replica-migration-2.tcl | 75 ++++++ tests/cluster/tests/12.1-replica-migration-3.tcl | 65 +++++ tests/cluster/tests/13-no-failover-option.tcl | 61 +++++ tests/cluster/tests/14-consistency-check.tcl | 124 +++++++++ tests/cluster/tests/15-cluster-slots.tcl | 128 +++++++++ tests/cluster/tests/16-transactions-on-replica.tcl | 79 ++++++ tests/cluster/tests/17-diskless-load-swapdb.tcl | 86 ++++++ tests/cluster/tests/18-info.tcl | 45 ++++ tests/cluster/tests/19-cluster-nodes-slots.tcl | 50 ++++ tests/cluster/tests/20-half-migrated-slot.tcl | 98 +++++++ tests/cluster/tests/21-many-slot-migration.tcl | 64 +++++ tests/cluster/tests/22-replica-in-sync.tcl | 146 +++++++++++ .../cluster/tests/23-multiple-slot-operations.tcl | 115 ++++++++ tests/cluster/tests/24-links.tcl | 114 ++++++++ .../tests/25-pubsubshard-slot-migration.tcl | 171 ++++++++++++ tests/cluster/tests/26-pubsubshard.tcl | 94 +++++++ tests/cluster/tests/27-endpoints.tcl | 219 ++++++++++++++++ tests/cluster/tests/28-cluster-shards.tcl | 202 +++++++++++++++ tests/cluster/tests/29-slot-migration-response.tcl | 50 ++++ tests/cluster/tests/helpers/onlydots.tcl | 16 ++ tests/cluster/tests/includes/init-tests.tcl | 75 ++++++ tests/cluster/tests/includes/utils.tcl | 36 +++ tests/cluster/tmp/.gitignore | 2 + 37 files changed, 3697 insertions(+) create mode 100644 tests/cluster/cluster.tcl create mode 100644 tests/cluster/run.tcl create mode 100644 tests/cluster/tests/00-base.tcl create mode 100644 tests/cluster/tests/01-faildet.tcl create mode 100644 tests/cluster/tests/02-failover.tcl create mode 100644 tests/cluster/tests/03-failover-loop.tcl create mode 100644 tests/cluster/tests/04-resharding.tcl create mode 100644 tests/cluster/tests/05-slave-selection.tcl create mode 100644 tests/cluster/tests/06-slave-stop-cond.tcl create mode 100644 tests/cluster/tests/07-replica-migration.tcl create mode 100644 tests/cluster/tests/08-update-msg.tcl create mode 100644 tests/cluster/tests/09-pubsub.tcl create mode 100644 tests/cluster/tests/10-manual-failover.tcl create mode 100644 tests/cluster/tests/11-manual-takeover.tcl create mode 100644 tests/cluster/tests/12-replica-migration-2.tcl create mode 100644 tests/cluster/tests/12.1-replica-migration-3.tcl create mode 100644 tests/cluster/tests/13-no-failover-option.tcl create mode 100644 tests/cluster/tests/14-consistency-check.tcl create mode 100644 tests/cluster/tests/15-cluster-slots.tcl create mode 100644 tests/cluster/tests/16-transactions-on-replica.tcl create mode 100644 tests/cluster/tests/17-diskless-load-swapdb.tcl create mode 100644 
tests/cluster/tests/18-info.tcl create mode 100644 tests/cluster/tests/19-cluster-nodes-slots.tcl create mode 100644 tests/cluster/tests/20-half-migrated-slot.tcl create mode 100644 tests/cluster/tests/21-many-slot-migration.tcl create mode 100644 tests/cluster/tests/22-replica-in-sync.tcl create mode 100644 tests/cluster/tests/23-multiple-slot-operations.tcl create mode 100644 tests/cluster/tests/24-links.tcl create mode 100644 tests/cluster/tests/25-pubsubshard-slot-migration.tcl create mode 100644 tests/cluster/tests/26-pubsubshard.tcl create mode 100644 tests/cluster/tests/27-endpoints.tcl create mode 100644 tests/cluster/tests/28-cluster-shards.tcl create mode 100644 tests/cluster/tests/29-slot-migration-response.tcl create mode 100644 tests/cluster/tests/helpers/onlydots.tcl create mode 100644 tests/cluster/tests/includes/init-tests.tcl create mode 100644 tests/cluster/tests/includes/utils.tcl create mode 100644 tests/cluster/tmp/.gitignore (limited to 'tests/cluster') diff --git a/tests/cluster/cluster.tcl b/tests/cluster/cluster.tcl new file mode 100644 index 0000000..9c669e1 --- /dev/null +++ b/tests/cluster/cluster.tcl @@ -0,0 +1,288 @@ +# Cluster-specific test functions. +# +# Copyright (C) 2014 Salvatore Sanfilippo antirez@gmail.com +# This software is released under the BSD License. See the COPYING file for +# more information. + +# Track cluster configuration as created by create_cluster below +set ::cluster_master_nodes 0 +set ::cluster_replica_nodes 0 + +# Returns a parsed CLUSTER NODES output as a list of dictionaries. +proc get_cluster_nodes id { + set lines [split [R $id cluster nodes] "\r\n"] + set nodes {} + foreach l $lines { + set l [string trim $l] + if {$l eq {}} continue + set args [split $l] + set node [dict create \ + id [lindex $args 0] \ + addr [lindex $args 1] \ + flags [split [lindex $args 2] ,] \ + slaveof [lindex $args 3] \ + ping_sent [lindex $args 4] \ + pong_recv [lindex $args 5] \ + config_epoch [lindex $args 6] \ + linkstate [lindex $args 7] \ + slots [lrange $args 8 end] \ + ] + lappend nodes $node + } + return $nodes +} + +# Test node for flag. +proc has_flag {node flag} { + expr {[lsearch -exact [dict get $node flags] $flag] != -1} +} + +# Returns the parsed myself node entry as a dictionary. +proc get_myself id { + set nodes [get_cluster_nodes $id] + foreach n $nodes { + if {[has_flag $n myself]} {return $n} + } + return {} +} + +# Get a specific node by ID by parsing the CLUSTER NODES output +# of the instance Number 'instance_id' +proc get_node_by_id {instance_id node_id} { + set nodes [get_cluster_nodes $instance_id] + foreach n $nodes { + if {[dict get $n id] eq $node_id} {return $n} + } + return {} +} + +# Return the value of the specified CLUSTER INFO field. +proc CI {n field} { + get_info_field [R $n cluster info] $field +} + +# Return the value of the specified INFO field. +proc s {n field} { + get_info_field [R $n info] $field +} + +# Assuming nodes are reset, this function performs slots allocation. +# Only the first 'n' nodes are used. +proc cluster_allocate_slots {n} { + set slot 16383 + while {$slot >= 0} { + # Allocate successive slots to random nodes. + set node [randomInt $n] + lappend slots_$node $slot + incr slot -1 + } + for {set j 0} {$j < $n} {incr j} { + R $j cluster addslots {*}[set slots_${j}] + } +} + +# Check that cluster nodes agree about "state", or raise an error. 
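For orientation, the cluster_state value that the next helper polls comes from CLUSTER INFO through the CI wrapper defined above; a minimal ad-hoc check against a single node might look like this (the instance number is illustrative):

    # Sketch only: one-shot check of a single node's view of the cluster.
    if {[CI 3 cluster_state] ne "ok"} {
        puts "instance 3 reports cluster_state:[CI 3 cluster_state]"
    }

The proc that follows generalizes this by polling every reachable node until the requested state is reported or a timeout expires.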
+proc assert_cluster_state {state} { + foreach_redis_id id { + if {[instance_is_killed redis $id]} continue + wait_for_condition 1000 50 { + [CI $id cluster_state] eq $state + } else { + fail "Cluster node $id cluster_state:[CI $id cluster_state]" + } + } +} + +# Search the first node starting from ID $first that is not +# already configured as a slave. +proc cluster_find_available_slave {first} { + foreach_redis_id id { + if {$id < $first} continue + if {[instance_is_killed redis $id]} continue + set me [get_myself $id] + if {[dict get $me slaveof] eq {-}} {return $id} + } + fail "No available slaves" +} + +# Add 'slaves' slaves to a cluster composed of 'masters' masters. +# It assumes that masters are allocated sequentially from instance ID 0 +# to N-1. +proc cluster_allocate_slaves {masters slaves} { + for {set j 0} {$j < $slaves} {incr j} { + set master_id [expr {$j % $masters}] + set slave_id [cluster_find_available_slave $masters] + set master_myself [get_myself $master_id] + R $slave_id cluster replicate [dict get $master_myself id] + } +} + +# Create a cluster composed of the specified number of masters and slaves. +proc create_cluster {masters slaves} { + cluster_allocate_slots $masters + if {$slaves} { + cluster_allocate_slaves $masters $slaves + } + assert_cluster_state ok + + set ::cluster_master_nodes $masters + set ::cluster_replica_nodes $slaves +} + +proc cluster_allocate_with_continuous_slots {n} { + set slot 16383 + set avg [expr ($slot+1) / $n] + while {$slot >= 0} { + set node [expr $slot/$avg >= $n ? $n-1 : $slot/$avg] + lappend slots_$node $slot + incr slot -1 + } + for {set j 0} {$j < $n} {incr j} { + R $j cluster addslots {*}[set slots_${j}] + } +} + +# Create a cluster composed of the specified number of masters and slaves, +# but with a continuous slot range. +proc cluster_create_with_continuous_slots {masters slaves} { + cluster_allocate_with_continuous_slots $masters + if {$slaves} { + cluster_allocate_slaves $masters $slaves + } + assert_cluster_state ok + + set ::cluster_master_nodes $masters + set ::cluster_replica_nodes $slaves +} + + +# Set the cluster node-timeout to all the reachalbe nodes. +proc set_cluster_node_timeout {to} { + foreach_redis_id id { + catch {R $id CONFIG SET cluster-node-timeout $to} + } +} + +# Check if the cluster is writable and readable. Use node "id" +# as a starting point to talk with the cluster. +proc cluster_write_test {id} { + set prefix [randstring 20 20 alpha] + set port [get_instance_attrib redis $id port] + set cluster [redis_cluster 127.0.0.1:$port] + for {set j 0} {$j < 100} {incr j} { + $cluster set key.$j $prefix.$j + } + for {set j 0} {$j < 100} {incr j} { + assert {[$cluster get key.$j] eq "$prefix.$j"} + } + $cluster close +} + +# Check if cluster configuration is consistent. +proc cluster_config_consistent {} { + for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} { + if {$j == 0} { + set base_cfg [R $j cluster slots] + } else { + set cfg [R $j cluster slots] + if {$cfg != $base_cfg} { + return 0 + } + } + } + + return 1 +} + +# Wait for cluster configuration to propagate and be consistent across nodes. 
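Since cluster_config_consistent above simply compares every node's raw CLUSTER SLOTS reply against node 0's, this wait is the natural call to make right after an operation that changes slot ownership; a minimal usage sketch, where the slot number and $target_id are hypothetical:

    # Sketch only: move a slot, then wait for every node to agree on it.
    R 0 cluster setslot 100 node $target_id
    wait_for_cluster_propagation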
+proc wait_for_cluster_propagation {} { + wait_for_condition 50 100 { + [cluster_config_consistent] eq 1 + } else { + fail "cluster config did not reach a consistent state" + } +} + +# Check if cluster's view of hostnames is consistent +proc are_hostnames_propagated {match_string} { + for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} { + set cfg [R $j cluster slots] + foreach node $cfg { + for {set i 2} {$i < [llength $node]} {incr i} { + if {! [string match $match_string [lindex [lindex [lindex $node $i] 3] 1]] } { + return 0 + } + } + } + } + return 1 +} + +# Returns a parsed CLUSTER LINKS output of the instance identified +# by the given `id` as a list of dictionaries, with each dictionary +# corresponds to a link. +proc get_cluster_links id { + set lines [R $id cluster links] + set links {} + foreach l $lines { + if {$l eq {}} continue + assert_equal [llength $l] 12 + assert_equal [lindex $l 0] "direction" + set dir [lindex $l 1] + assert_equal [lindex $l 2] "node" + set node [lindex $l 3] + assert_equal [lindex $l 4] "create-time" + set create_time [lindex $l 5] + assert_equal [lindex $l 6] "events" + set events [lindex $l 7] + assert_equal [lindex $l 8] "send-buffer-allocated" + set send_buffer_allocated [lindex $l 9] + assert_equal [lindex $l 10] "send-buffer-used" + set send_buffer_used [lindex $l 11] + set link [dict create \ + dir $dir \ + node $node \ + create_time $create_time \ + events $events \ + send_buffer_allocated $send_buffer_allocated \ + send_buffer_used $send_buffer_used \ + ] + lappend links $link + } + return $links +} + +proc get_links_with_peer {this_instance_id peer_nodename} { + set links [get_cluster_links $this_instance_id] + set links_with_peer {} + foreach l $links { + if {[dict get $l node] eq $peer_nodename} { + lappend links_with_peer $l + } + } + return $links_with_peer +} + +# Return the entry in CLUSTER LINKS output by instance identified by `this_instance_id` that +# corresponds to the link established toward a peer identified by `peer_nodename` +proc get_link_to_peer {this_instance_id peer_nodename} { + set links_with_peer [get_links_with_peer $this_instance_id $peer_nodename] + foreach l $links_with_peer { + if {[dict get $l dir] eq "to"} { + return $l + } + } + return {} +} + +# Return the entry in CLUSTER LINKS output by instance identified by `this_instance_id` that +# corresponds to the link accepted from a peer identified by `peer_nodename` +proc get_link_from_peer {this_instance_id peer_nodename} { + set links_with_peer [get_links_with_peer $this_instance_id $peer_nodename] + foreach l $links_with_peer { + if {[dict get $l dir] eq "from"} { + return $l + } + } + return {} +} \ No newline at end of file diff --git a/tests/cluster/run.tcl b/tests/cluster/run.tcl new file mode 100644 index 0000000..c81d8f3 --- /dev/null +++ b/tests/cluster/run.tcl @@ -0,0 +1,31 @@ +# Cluster test suite. Copyright (C) 2014 Salvatore Sanfilippo antirez@gmail.com +# This software is released under the BSD License. See the COPYING file for +# more information. + +cd tests/cluster +source cluster.tcl +source ../instances.tcl +source ../../support/cluster.tcl ; # Redis Cluster client. + +set ::instances_count 20 ; # How many instances we use at max. 
+set ::tlsdir "../../tls" + +proc main {} { + parse_options + spawn_instance redis $::redis_base_port $::instances_count { + "cluster-enabled yes" + "appendonly yes" + "enable-protected-configs yes" + "enable-debug-command yes" + } + run_tests + cleanup + end_tests +} + +if {[catch main e]} { + puts $::errorInfo + if {$::pause_on_error} pause_on_error + cleanup + exit 1 +} diff --git a/tests/cluster/tests/00-base.tcl b/tests/cluster/tests/00-base.tcl new file mode 100644 index 0000000..e9e99ba --- /dev/null +++ b/tests/cluster/tests/00-base.tcl @@ -0,0 +1,95 @@ +# Check the basic monitoring and failover capabilities. + +source "../tests/includes/init-tests.tcl" + +if {$::simulate_error} { + test "This test will fail" { + fail "Simulated error" + } +} + +test "Different nodes have different IDs" { + set ids {} + set numnodes 0 + foreach_redis_id id { + incr numnodes + # Every node should just know itself. + set nodeid [dict get [get_myself $id] id] + assert {$nodeid ne {}} + lappend ids $nodeid + } + set numids [llength [lsort -unique $ids]] + assert {$numids == $numnodes} +} + +test "It is possible to perform slot allocation" { + cluster_allocate_slots 5 +} + +test "After the join, every node gets a different config epoch" { + set trynum 60 + while {[incr trynum -1] != 0} { + # We check that this condition is true for *all* the nodes. + set ok 1 ; # Will be set to 0 every time a node is not ok. + foreach_redis_id id { + set epochs {} + foreach n [get_cluster_nodes $id] { + lappend epochs [dict get $n config_epoch] + } + if {[lsort $epochs] != [lsort -unique $epochs]} { + set ok 0 ; # At least one collision! + } + } + if {$ok} break + after 1000 + puts -nonewline . + flush stdout + } + if {$trynum == 0} { + fail "Config epoch conflict resolution is not working." + } +} + +test "Nodes should report cluster_state is ok now" { + assert_cluster_state ok +} + +test "Sanity for CLUSTER COUNTKEYSINSLOT" { + set reply [R 0 CLUSTER COUNTKEYSINSLOT 0] + assert {$reply eq 0} +} + +test "It is possible to write and read from the cluster" { + cluster_write_test 0 +} + +test "Function no-cluster flag" { + R 1 function load {#!lua name=test + redis.register_function{function_name='f1', callback=function() return 'hello' end, flags={'no-cluster'}} + } + catch {R 1 fcall f1 0} e + assert_match {*Can not run script on cluster, 'no-cluster' flag is set*} $e +} + +test "Script no-cluster flag" { + catch { + R 1 eval {#!lua flags=no-cluster + return 1 + } 0 + } e + + assert_match {*Can not run script on cluster, 'no-cluster' flag is set*} $e +} + +test "CLUSTER RESET SOFT test" { + set last_epoch_node0 [get_info_field [R 0 cluster info] cluster_current_epoch] + R 0 FLUSHALL + R 0 CLUSTER RESET + assert {[get_info_field [R 0 cluster info] cluster_current_epoch] eq $last_epoch_node0} + + set last_epoch_node1 [get_info_field [R 1 cluster info] cluster_current_epoch] + R 1 FLUSHALL + R 1 CLUSTER RESET SOFT + assert {[get_info_field [R 1 cluster info] cluster_current_epoch] eq $last_epoch_node1} +} + diff --git a/tests/cluster/tests/01-faildet.tcl b/tests/cluster/tests/01-faildet.tcl new file mode 100644 index 0000000..8fe87c9 --- /dev/null +++ b/tests/cluster/tests/01-faildet.tcl @@ -0,0 +1,38 @@ +# Check the basic monitoring and failover capabilities. 
+ +source "../tests/includes/init-tests.tcl" + +test "Create a 5 nodes cluster" { + create_cluster 5 5 +} + +test "Cluster should start ok" { + assert_cluster_state ok +} + +test "Killing two slave nodes" { + kill_instance redis 5 + kill_instance redis 6 +} + +test "Cluster should be still up" { + assert_cluster_state ok +} + +test "Killing one master node" { + kill_instance redis 0 +} + +# Note: the only slave of instance 0 is already down so no +# failover is possible, that would change the state back to ok. +test "Cluster should be down now" { + assert_cluster_state fail +} + +test "Restarting master node" { + restart_instance redis 0 +} + +test "Cluster should be up again" { + assert_cluster_state ok +} diff --git a/tests/cluster/tests/02-failover.tcl b/tests/cluster/tests/02-failover.tcl new file mode 100644 index 0000000..6b2fd09 --- /dev/null +++ b/tests/cluster/tests/02-failover.tcl @@ -0,0 +1,65 @@ +# Check the basic monitoring and failover capabilities. + +source "../tests/includes/init-tests.tcl" + +test "Create a 5 nodes cluster" { + create_cluster 5 5 +} + +test "Cluster is up" { + assert_cluster_state ok +} + +test "Cluster is writable" { + cluster_write_test 0 +} + +test "Instance #5 is a slave" { + assert {[RI 5 role] eq {slave}} +} + +test "Instance #5 synced with the master" { + wait_for_condition 1000 50 { + [RI 5 master_link_status] eq {up} + } else { + fail "Instance #5 master link status is not up" + } +} + +set current_epoch [CI 1 cluster_current_epoch] + +test "Killing one master node" { + kill_instance redis 0 +} + +test "Wait for failover" { + wait_for_condition 1000 50 { + [CI 1 cluster_current_epoch] > $current_epoch + } else { + fail "No failover detected" + } +} + +test "Cluster should eventually be up again" { + assert_cluster_state ok +} + +test "Cluster is writable" { + cluster_write_test 1 +} + +test "Instance #5 is now a master" { + assert {[RI 5 role] eq {master}} +} + +test "Restarting the previously killed master node" { + restart_instance redis 0 +} + +test "Instance #0 gets converted into a slave" { + wait_for_condition 1000 50 { + [RI 0 role] eq {slave} + } else { + fail "Old master was not converted into slave" + } +} diff --git a/tests/cluster/tests/03-failover-loop.tcl b/tests/cluster/tests/03-failover-loop.tcl new file mode 100644 index 0000000..46c22a9 --- /dev/null +++ b/tests/cluster/tests/03-failover-loop.tcl @@ -0,0 +1,117 @@ +# Failover stress test. +# In this test a different node is killed in a loop for N +# iterations. The test checks that certain properties +# are preserved across iterations. + +source "../tests/includes/init-tests.tcl" + +test "Create a 5 nodes cluster" { + create_cluster 5 5 +} + +test "Cluster is up" { + assert_cluster_state ok +} + +set iterations 20 +set cluster [redis_cluster 127.0.0.1:[get_instance_attrib redis 0 port]] + +while {[incr iterations -1]} { + set tokill [randomInt 10] + set other [expr {($tokill+1)%10}] ; # Some other instance. 
+ set key [randstring 20 20 alpha] + set val [randstring 20 20 alpha] + set role [RI $tokill role] + if {$role eq {master}} { + set slave {} + set myid [dict get [get_myself $tokill] id] + foreach_redis_id id { + if {$id == $tokill} continue + if {[dict get [get_myself $id] slaveof] eq $myid} { + set slave $id + } + } + if {$slave eq {}} { + fail "Unable to retrieve slave's ID for master #$tokill" + } + } + + puts "--- Iteration $iterations ---" + + if {$role eq {master}} { + test "Wait for slave of #$tokill to sync" { + wait_for_condition 1000 50 { + [string match {*state=online*} [RI $tokill slave0]] + } else { + fail "Slave of node #$tokill is not ok" + } + } + set slave_config_epoch [CI $slave cluster_my_epoch] + } + + test "Cluster is writable before failover" { + for {set i 0} {$i < 100} {incr i} { + catch {$cluster set $key:$i $val:$i} err + assert {$err eq {OK}} + } + # Wait for the write to propagate to the slave if we + # are going to kill a master. + if {$role eq {master}} { + R $tokill wait 1 20000 + } + } + + test "Terminating node #$tokill" { + # Stop AOF so that an initial AOFRW won't prevent the instance from terminating + R $tokill config set appendonly no + kill_instance redis $tokill + } + + if {$role eq {master}} { + test "Wait failover by #$slave with old epoch $slave_config_epoch" { + wait_for_condition 1000 50 { + [CI $slave cluster_my_epoch] > $slave_config_epoch + } else { + fail "No failover detected, epoch is still [CI $slave cluster_my_epoch]" + } + } + } + + test "Cluster should eventually be up again" { + assert_cluster_state ok + } + + test "Cluster is writable again" { + for {set i 0} {$i < 100} {incr i} { + catch {$cluster set $key:$i:2 $val:$i:2} err + assert {$err eq {OK}} + } + } + + test "Restarting node #$tokill" { + restart_instance redis $tokill + } + + test "Instance #$tokill is now a slave" { + wait_for_condition 1000 50 { + [RI $tokill role] eq {slave} + } else { + fail "Restarted instance is not a slave" + } + } + + test "We can read back the value we set before" { + for {set i 0} {$i < 100} {incr i} { + catch {$cluster get $key:$i} err + assert {$err eq "$val:$i"} + catch {$cluster get $key:$i:2} err + assert {$err eq "$val:$i:2"} + } + } +} + +test "Post condition: current_epoch >= my_epoch everywhere" { + foreach_redis_id id { + assert {[CI $id cluster_current_epoch] >= [CI $id cluster_my_epoch]} + } +} diff --git a/tests/cluster/tests/04-resharding.tcl b/tests/cluster/tests/04-resharding.tcl new file mode 100644 index 0000000..18a26bd --- /dev/null +++ b/tests/cluster/tests/04-resharding.tcl @@ -0,0 +1,196 @@ +# Failover stress test. +# In this test a different node is killed in a loop for N +# iterations. The test checks that certain properties +# are preserved across iterations. + +source "../tests/includes/init-tests.tcl" +source "../../../tests/support/cli.tcl" + +test "Create a 5 nodes cluster" { + create_cluster 5 5 +} + +test "Cluster is up" { + assert_cluster_state ok +} + +test "Enable AOF in all the instances" { + foreach_redis_id id { + R $id config set appendonly yes + # We use "appendfsync no" because it's fast but also guarantees that + # write(2) is performed before replying to client. + R $id config set appendfsync no + } + + foreach_redis_id id { + wait_for_condition 1000 500 { + [RI $id aof_rewrite_in_progress] == 0 && + [RI $id aof_enabled] == 1 + } else { + fail "Failed to enable AOF on instance #$id" + } + } +} + +# Return non-zero if the specified PID is about a process still in execution, +# otherwise 0 is returned. 
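A short usage sketch of the helper defined next, with $somepid standing in for a previously spawned background process:

    # Sketch only: poll until the background process has exited.
    while {[process_is_running $somepid]} { after 100 }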
+proc process_is_running {pid} { + # PS should return with an error if PID is non existing, + # and catch will return non-zero. We want to return non-zero if + # the PID exists, so we invert the return value with expr not operator. + expr {![catch {exec ps -p $pid}]} +} + +# Our resharding test performs the following actions: +# +# - N commands are sent to the cluster in the course of the test. +# - Every command selects a random key from key:0 to key:MAX-1. +# - The operation RPUSH key is performed. +# - Tcl remembers into an array all the values pushed to each list. +# - After N/2 commands, the resharding process is started in background. +# - The test continues while the resharding is in progress. +# - At the end of the test, we wait for the resharding process to stop. +# - Finally the keys are checked to see if they contain the value they should. + +set numkeys 50000 +set numops 200000 +set start_node_port [get_instance_attrib redis 0 port] +set cluster [redis_cluster 127.0.0.1:$start_node_port] +if {$::tls} { + # setup a non-TLS cluster client to the TLS cluster + set plaintext_port [get_instance_attrib redis 0 plaintext-port] + set cluster_plaintext [redis_cluster 127.0.0.1:$plaintext_port 0] + puts "Testing TLS cluster on start node 127.0.0.1:$start_node_port, plaintext port $plaintext_port" +} else { + set cluster_plaintext $cluster + puts "Testing using non-TLS cluster" +} +catch {unset content} +array set content {} +set tribpid {} + +test "Cluster consistency during live resharding" { + set ele 0 + for {set j 0} {$j < $numops} {incr j} { + # Trigger the resharding once we execute half the ops. + if {$tribpid ne {} && + ($j % 10000) == 0 && + ![process_is_running $tribpid]} { + set tribpid {} + } + + if {$j >= $numops/2 && $tribpid eq {}} { + puts -nonewline "...Starting resharding..." + flush stdout + set target [dict get [get_myself [randomInt 5]] id] + set tribpid [lindex [exec \ + ../../../src/redis-cli --cluster reshard \ + 127.0.0.1:[get_instance_attrib redis 0 port] \ + --cluster-from all \ + --cluster-to $target \ + --cluster-slots 100 \ + --cluster-yes \ + {*}[rediscli_tls_config "../../../tests"] \ + | [info nameofexecutable] \ + ../tests/helpers/onlydots.tcl \ + &] 0] + } + + # Write random data to random list. + set listid [randomInt $numkeys] + set key "key:$listid" + incr ele + # We write both with Lua scripts and with plain commands. + # This way we are able to stress Lua -> Redis command invocation + # as well, that has tests to prevent Lua to write into wrong + # hash slots. + # We also use both TLS and plaintext connections. + if {$listid % 3 == 0} { + $cluster rpush $key $ele + } elseif {$listid % 3 == 1} { + $cluster_plaintext rpush $key $ele + } else { + $cluster eval {redis.call("rpush",KEYS[1],ARGV[1])} 1 $key $ele + } + lappend content($key) $ele + + if {($j % 1000) == 0} { + puts -nonewline W; flush stdout + } + } + + # Wait for the resharding process to end + wait_for_condition 1000 500 { + [process_is_running $tribpid] == 0 + } else { + fail "Resharding is not terminating after some time." + } + +} + +test "Verify $numkeys keys for consistency with logical content" { + # Check that the Redis Cluster content matches our logical content. 
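    # A sketch of the comparison being made here, with key name and values
    # purely illustrative: the Tcl-side array remembers every pushed element,
    #   set content(key:42) {1 87 305}
    # and the list read back from the cluster must match it exactly,
    #   $cluster lrange key:42 0 -1   ;# expected to return {1 87 305}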
+ foreach {key value} [array get content] { + if {[$cluster lrange $key 0 -1] ne $value} { + fail "Key $key expected to hold '$value' but actual content is [$cluster lrange $key 0 -1]" + } + } +} + +test "Terminate and restart all the instances" { + foreach_redis_id id { + # Stop AOF so that an initial AOFRW won't prevent the instance from terminating + R $id config set appendonly no + kill_instance redis $id + restart_instance redis $id + } +} + +test "Cluster should eventually be up again" { + assert_cluster_state ok +} + +test "Verify $numkeys keys after the restart" { + # Check that the Redis Cluster content matches our logical content. + foreach {key value} [array get content] { + if {[$cluster lrange $key 0 -1] ne $value} { + fail "Key $key expected to hold '$value' but actual content is [$cluster lrange $key 0 -1]" + } + } +} + +test "Disable AOF in all the instances" { + foreach_redis_id id { + R $id config set appendonly no + } +} + +test "Verify slaves consistency" { + set verified_masters 0 + foreach_redis_id id { + set role [R $id role] + lassign $role myrole myoffset slaves + if {$myrole eq {slave}} continue + set masterport [get_instance_attrib redis $id port] + set masterdigest [R $id debug digest] + foreach_redis_id sid { + set srole [R $sid role] + if {[lindex $srole 0] eq {master}} continue + if {[lindex $srole 2] != $masterport} continue + wait_for_condition 1000 500 { + [R $sid debug digest] eq $masterdigest + } else { + fail "Master and slave data digest are different" + } + incr verified_masters + } + } + assert {$verified_masters >= 5} +} + +test "Dump sanitization was skipped for migrations" { + set verified_masters 0 + foreach_redis_id id { + assert {[RI $id dump_payload_sanitizations] == 0} + } +} diff --git a/tests/cluster/tests/05-slave-selection.tcl b/tests/cluster/tests/05-slave-selection.tcl new file mode 100644 index 0000000..f0ce863 --- /dev/null +++ b/tests/cluster/tests/05-slave-selection.tcl @@ -0,0 +1,179 @@ +# Slave selection test +# Check the algorithm trying to pick the slave with the most complete history. + +source "../tests/includes/init-tests.tcl" + +# Create a cluster with 5 master and 10 slaves, so that we have 2 +# slaves for each master. +test "Create a 5 nodes cluster" { + create_cluster 5 10 +} + +test "Cluster is up" { + assert_cluster_state ok +} + +test "The first master has actually two slaves" { + wait_for_condition 1000 50 { + [llength [lindex [R 0 role] 2]] == 2 + } else { + fail "replicas didn't connect" + } +} + +test {Slaves of #0 are instance #5 and #10 as expected} { + set port0 [get_instance_attrib redis 0 port] + assert {[lindex [R 5 role] 2] == $port0} + assert {[lindex [R 10 role] 2] == $port0} +} + +test "Instance #5 and #10 synced with the master" { + wait_for_condition 1000 50 { + [RI 5 master_link_status] eq {up} && + [RI 10 master_link_status] eq {up} + } else { + fail "Instance #5 or #10 master link status is not up" + } +} + +set cluster [redis_cluster 127.0.0.1:[get_instance_attrib redis 0 port]] + +test "Slaves are both able to receive and acknowledge writes" { + for {set j 0} {$j < 100} {incr j} { + $cluster set $j $j + } + assert {[R 0 wait 2 60000] == 2} +} + +test "Write data while slave #10 is paused and can't receive it" { + # Stop the slave with a multi/exec transaction so that the master will + # be killed as soon as it can accept writes again. + R 10 multi + R 10 debug sleep 10 + R 10 client kill 127.0.0.1:$port0 + R 10 deferred 1 + R 10 exec + + # Write some data the slave can't receive. 
+ for {set j 0} {$j < 100} {incr j} { + $cluster set $j $j + } + + # Prevent the master from accepting new slaves. + # Use a large pause value since we'll kill it anyway. + R 0 CLIENT PAUSE 60000 + + # Wait for the slave to return available again + R 10 deferred 0 + assert {[R 10 read] eq {OK OK}} + + # Kill the master so that a reconnection will not be possible. + kill_instance redis 0 +} + +test "Wait for instance #5 (and not #10) to turn into a master" { + wait_for_condition 1000 50 { + [RI 5 role] eq {master} + } else { + fail "No failover detected" + } +} + +test "Wait for the node #10 to return alive before ending the test" { + R 10 ping +} + +test "Cluster should eventually be up again" { + assert_cluster_state ok +} + +test "Node #10 should eventually replicate node #5" { + set port5 [get_instance_attrib redis 5 port] + wait_for_condition 1000 50 { + ([lindex [R 10 role] 2] == $port5) && + ([lindex [R 10 role] 3] eq {connected}) + } else { + fail "#10 didn't became slave of #5" + } +} + +source "../tests/includes/init-tests.tcl" + +# Create a cluster with 3 master and 15 slaves, so that we have 5 +# slaves for eatch master. +test "Create a 3 nodes cluster" { + create_cluster 3 15 +} + +test "Cluster is up" { + assert_cluster_state ok +} + +test "The first master has actually 5 slaves" { + wait_for_condition 1000 50 { + [llength [lindex [R 0 role] 2]] == 5 + } else { + fail "replicas didn't connect" + } +} + +test {Slaves of #0 are instance #3, #6, #9, #12 and #15 as expected} { + set port0 [get_instance_attrib redis 0 port] + assert {[lindex [R 3 role] 2] == $port0} + assert {[lindex [R 6 role] 2] == $port0} + assert {[lindex [R 9 role] 2] == $port0} + assert {[lindex [R 12 role] 2] == $port0} + assert {[lindex [R 15 role] 2] == $port0} +} + +test {Instance #3, #6, #9, #12 and #15 synced with the master} { + wait_for_condition 1000 50 { + [RI 3 master_link_status] eq {up} && + [RI 6 master_link_status] eq {up} && + [RI 9 master_link_status] eq {up} && + [RI 12 master_link_status] eq {up} && + [RI 15 master_link_status] eq {up} + } else { + fail "Instance #3 or #6 or #9 or #12 or #15 master link status is not up" + } +} + +proc master_detected {instances} { + foreach instance [dict keys $instances] { + if {[RI $instance role] eq {master}} { + return true + } + } + + return false +} + +test "New Master down consecutively" { + set instances [dict create 0 1 3 1 6 1 9 1 12 1 15 1] + + set loops [expr {[dict size $instances]-1}] + for {set i 0} {$i < $loops} {incr i} { + set master_id -1 + foreach instance [dict keys $instances] { + if {[RI $instance role] eq {master}} { + set master_id $instance + break; + } + } + + if {$master_id eq -1} { + fail "no master detected, #loop $i" + } + + set instances [dict remove $instances $master_id] + + kill_instance redis $master_id + wait_for_condition 1000 50 { + [master_detected $instances] + } else { + fail "No failover detected when master $master_id fails" + } + + assert_cluster_state ok + } +} diff --git a/tests/cluster/tests/06-slave-stop-cond.tcl b/tests/cluster/tests/06-slave-stop-cond.tcl new file mode 100644 index 0000000..80a2d17 --- /dev/null +++ b/tests/cluster/tests/06-slave-stop-cond.tcl @@ -0,0 +1,77 @@ +# Slave stop condition test +# Check that if there is a disconnection time limit, the slave will not try +# to failover its master. + +source "../tests/includes/init-tests.tcl" + +# Create a cluster with 5 master and 5 slaves. 
+test "Create a 5 nodes cluster" { + create_cluster 5 5 +} + +test "Cluster is up" { + assert_cluster_state ok +} + +test "The first master has actually one slave" { + wait_for_condition 1000 50 { + [llength [lindex [R 0 role] 2]] == 1 + } else { + fail "replicas didn't connect" + } +} + +test {Slaves of #0 is instance #5 as expected} { + set port0 [get_instance_attrib redis 0 port] + assert {[lindex [R 5 role] 2] == $port0} +} + +test "Instance #5 synced with the master" { + wait_for_condition 1000 50 { + [RI 5 master_link_status] eq {up} + } else { + fail "Instance #5 master link status is not up" + } +} + +test "Lower the slave validity factor of #5 to the value of 2" { + assert {[R 5 config set cluster-slave-validity-factor 2] eq {OK}} +} + +test "Break master-slave link and prevent further reconnections" { + # Stop the slave with a multi/exec transaction so that the master will + # be killed as soon as it can accept writes again. + R 5 multi + R 5 client kill 127.0.0.1:$port0 + # here we should sleep 6 or more seconds (node_timeout * slave_validity) + # but the actual validity time is actually incremented by the + # repl-ping-slave-period value which is 10 seconds by default. So we + # need to wait more than 16 seconds. + R 5 debug sleep 20 + R 5 deferred 1 + R 5 exec + + # Prevent the master from accepting new slaves. + # Use a large pause value since we'll kill it anyway. + R 0 CLIENT PAUSE 60000 + + # Wait for the slave to return available again + R 5 deferred 0 + assert {[R 5 read] eq {OK OK}} + + # Kill the master so that a reconnection will not be possible. + kill_instance redis 0 +} + +test "Slave #5 is reachable and alive" { + assert {[R 5 ping] eq {PONG}} +} + +test "Slave #5 should not be able to failover" { + after 10000 + assert {[RI 5 role] eq {slave}} +} + +test "Cluster should be down" { + assert_cluster_state fail +} diff --git a/tests/cluster/tests/07-replica-migration.tcl b/tests/cluster/tests/07-replica-migration.tcl new file mode 100644 index 0000000..c4e9985 --- /dev/null +++ b/tests/cluster/tests/07-replica-migration.tcl @@ -0,0 +1,103 @@ +# Replica migration test. +# Check that orphaned masters are joined by replicas of masters having +# multiple replicas attached, according to the migration barrier settings. + +source "../tests/includes/init-tests.tcl" + +# Create a cluster with 5 master and 10 slaves, so that we have 2 +# slaves for each master. +test "Create a 5 nodes cluster" { + create_cluster 5 10 +} + +test "Cluster is up" { + assert_cluster_state ok +} + +test "Each master should have two replicas attached" { + foreach_redis_id id { + if {$id < 5} { + wait_for_condition 1000 50 { + [llength [lindex [R $id role] 2]] == 2 + } else { + fail "Master #$id does not have 2 slaves as expected" + } + } + } +} + +test "Killing all the slaves of master #0 and #1" { + kill_instance redis 5 + kill_instance redis 10 + kill_instance redis 6 + kill_instance redis 11 + after 4000 +} + +foreach_redis_id id { + if {$id < 5} { + test "Master #$id should have at least one replica" { + wait_for_condition 1000 50 { + [llength [lindex [R $id role] 2]] >= 1 + } else { + fail "Master #$id has no replicas" + } + } + } +} + +# Now test the migration to a master which used to be a slave, after +# a failver. + +source "../tests/includes/init-tests.tcl" + +# Create a cluster with 5 master and 10 slaves, so that we have 2 +# slaves for each master. 
+test "Create a 5 nodes cluster" { + create_cluster 5 10 +} + +test "Cluster is up" { + assert_cluster_state ok +} + +test "Kill slave #7 of master #2. Only slave left is #12 now" { + kill_instance redis 7 +} + +set current_epoch [CI 1 cluster_current_epoch] + +test "Killing master node #2, #12 should failover" { + kill_instance redis 2 +} + +test "Wait for failover" { + wait_for_condition 1000 50 { + [CI 1 cluster_current_epoch] > $current_epoch + } else { + fail "No failover detected" + } +} + +test "Cluster should eventually be up again" { + assert_cluster_state ok +} + +test "Cluster is writable" { + cluster_write_test 1 +} + +test "Instance 12 is now a master without slaves" { + assert {[RI 12 role] eq {master}} +} + +# The remaining instance is now without slaves. Some other slave +# should migrate to it. + +test "Master #12 should get at least one migrated replica" { + wait_for_condition 1000 50 { + [llength [lindex [R 12 role] 2]] >= 1 + } else { + fail "Master #12 has no replicas" + } +} diff --git a/tests/cluster/tests/08-update-msg.tcl b/tests/cluster/tests/08-update-msg.tcl new file mode 100644 index 0000000..9011f32 --- /dev/null +++ b/tests/cluster/tests/08-update-msg.tcl @@ -0,0 +1,90 @@ +# Test UPDATE messages sent by other nodes when the currently authorirative +# master is unavailable. The test is performed in the following steps: +# +# 1) Master goes down. +# 2) Slave failover and becomes new master. +# 3) New master is partitioned away. +# 4) Old master returns. +# 5) At this point we expect the old master to turn into a slave ASAP because +# of the UPDATE messages it will receive from the other nodes when its +# configuration will be found to be outdated. + +source "../tests/includes/init-tests.tcl" + +test "Create a 5 nodes cluster" { + create_cluster 5 5 +} + +test "Cluster is up" { + assert_cluster_state ok +} + +test "Cluster is writable" { + cluster_write_test 0 +} + +test "Instance #5 is a slave" { + assert {[RI 5 role] eq {slave}} +} + +test "Instance #5 synced with the master" { + wait_for_condition 1000 50 { + [RI 5 master_link_status] eq {up} + } else { + fail "Instance #5 master link status is not up" + } +} + +set current_epoch [CI 1 cluster_current_epoch] + +test "Killing one master node" { + kill_instance redis 0 +} + +test "Wait for failover" { + wait_for_condition 1000 50 { + [CI 1 cluster_current_epoch] > $current_epoch + } else { + fail "No failover detected" + } +} + +test "Cluster should eventually be up again" { + assert_cluster_state ok +} + +test "Cluster is writable" { + cluster_write_test 1 +} + +test "Instance #5 is now a master" { + assert {[RI 5 role] eq {master}} +} + +test "Killing the new master #5" { + kill_instance redis 5 +} + +test "Cluster should be down now" { + assert_cluster_state fail +} + +test "Restarting the old master node" { + restart_instance redis 0 +} + +test "Instance #0 gets converted into a slave" { + wait_for_condition 1000 50 { + [RI 0 role] eq {slave} + } else { + fail "Old master was not converted into slave" + } +} + +test "Restarting the new master node" { + restart_instance redis 5 +} + +test "Cluster is up again" { + assert_cluster_state ok +} diff --git a/tests/cluster/tests/09-pubsub.tcl b/tests/cluster/tests/09-pubsub.tcl new file mode 100644 index 0000000..e62b91c --- /dev/null +++ b/tests/cluster/tests/09-pubsub.tcl @@ -0,0 +1,40 @@ +# Test PUBLISH propagation across the cluster. 
+ +source "../tests/includes/init-tests.tcl" + +test "Create a 5 nodes cluster" { + create_cluster 5 5 +} + +proc test_cluster_publish {instance instances} { + # Subscribe all the instances but the one we use to send. + for {set j 0} {$j < $instances} {incr j} { + if {$j != $instance} { + R $j deferred 1 + R $j subscribe testchannel + R $j read; # Read the subscribe reply + } + } + + set data [randomValue] + R $instance PUBLISH testchannel $data + + # Read the message back from all the nodes. + for {set j 0} {$j < $instances} {incr j} { + if {$j != $instance} { + set msg [R $j read] + assert {$data eq [lindex $msg 2]} + R $j unsubscribe testchannel + R $j read; # Read the unsubscribe reply + R $j deferred 0 + } + } +} + +test "Test publishing to master" { + test_cluster_publish 0 10 +} + +test "Test publishing to slave" { + test_cluster_publish 5 10 +} diff --git a/tests/cluster/tests/10-manual-failover.tcl b/tests/cluster/tests/10-manual-failover.tcl new file mode 100644 index 0000000..5441b79 --- /dev/null +++ b/tests/cluster/tests/10-manual-failover.tcl @@ -0,0 +1,192 @@ +# Check the manual failover + +source "../tests/includes/init-tests.tcl" + +test "Create a 5 nodes cluster" { + create_cluster 5 5 +} + +test "Cluster is up" { + assert_cluster_state ok +} + +test "Cluster is writable" { + cluster_write_test 0 +} + +test "Instance #5 is a slave" { + assert {[RI 5 role] eq {slave}} +} + +test "Instance #5 synced with the master" { + wait_for_condition 1000 50 { + [RI 5 master_link_status] eq {up} + } else { + fail "Instance #5 master link status is not up" + } +} + +set current_epoch [CI 1 cluster_current_epoch] + +set numkeys 50000 +set numops 10000 +set cluster [redis_cluster 127.0.0.1:[get_instance_attrib redis 0 port]] +catch {unset content} +array set content {} + +test "Send CLUSTER FAILOVER to #5, during load" { + for {set j 0} {$j < $numops} {incr j} { + # Write random data to random list. + set listid [randomInt $numkeys] + set key "key:$listid" + set ele [randomValue] + # We write both with Lua scripts and with plain commands. + # This way we are able to stress Lua -> Redis command invocation + # as well, that has tests to prevent Lua to write into wrong + # hash slots. + if {$listid % 2} { + $cluster rpush $key $ele + } else { + $cluster eval {redis.call("rpush",KEYS[1],ARGV[1])} 1 $key $ele + } + lappend content($key) $ele + + if {($j % 1000) == 0} { + puts -nonewline W; flush stdout + } + + if {$j == $numops/2} {R 5 cluster failover} + } +} + +test "Wait for failover" { + wait_for_condition 1000 50 { + [CI 1 cluster_current_epoch] > $current_epoch + } else { + fail "No failover detected" + } +} + +test "Cluster should eventually be up again" { + assert_cluster_state ok +} + +test "Cluster is writable" { + cluster_write_test 1 +} + +test "Instance #5 is now a master" { + assert {[RI 5 role] eq {master}} +} + +test "Verify $numkeys keys for consistency with logical content" { + # Check that the Redis Cluster content matches our logical content. + foreach {key value} [array get content] { + assert {[$cluster lrange $key 0 -1] eq $value} + } +} + +test "Instance #0 gets converted into a slave" { + wait_for_condition 1000 50 { + [RI 0 role] eq {slave} + } else { + fail "Old master was not converted into slave" + } +} + +## Check that manual failover does not happen if we can't talk with the master. 
+ +source "../tests/includes/init-tests.tcl" + +test "Create a 5 nodes cluster" { + create_cluster 5 5 +} + +test "Cluster is up" { + assert_cluster_state ok +} + +test "Cluster is writable" { + cluster_write_test 0 +} + +test "Instance #5 is a slave" { + assert {[RI 5 role] eq {slave}} +} + +test "Instance #5 synced with the master" { + wait_for_condition 1000 50 { + [RI 5 master_link_status] eq {up} + } else { + fail "Instance #5 master link status is not up" + } +} + +test "Make instance #0 unreachable without killing it" { + R 0 deferred 1 + R 0 DEBUG SLEEP 10 +} + +test "Send CLUSTER FAILOVER to instance #5" { + R 5 cluster failover +} + +test "Instance #5 is still a slave after some time (no failover)" { + after 5000 + assert {[RI 5 role] eq {master}} +} + +test "Wait for instance #0 to return back alive" { + R 0 deferred 0 + assert {[R 0 read] eq {OK}} +} + +## Check with "force" failover happens anyway. + +source "../tests/includes/init-tests.tcl" + +test "Create a 5 nodes cluster" { + create_cluster 5 5 +} + +test "Cluster is up" { + assert_cluster_state ok +} + +test "Cluster is writable" { + cluster_write_test 0 +} + +test "Instance #5 is a slave" { + assert {[RI 5 role] eq {slave}} +} + +test "Instance #5 synced with the master" { + wait_for_condition 1000 50 { + [RI 5 master_link_status] eq {up} + } else { + fail "Instance #5 master link status is not up" + } +} + +test "Make instance #0 unreachable without killing it" { + R 0 deferred 1 + R 0 DEBUG SLEEP 10 +} + +test "Send CLUSTER FAILOVER to instance #5" { + R 5 cluster failover force +} + +test "Instance #5 is a master after some time" { + wait_for_condition 1000 50 { + [RI 5 role] eq {master} + } else { + fail "Instance #5 is not a master after some time regardless of FORCE" + } +} + +test "Wait for instance #0 to return back alive" { + R 0 deferred 0 + assert {[R 0 read] eq {OK}} +} diff --git a/tests/cluster/tests/11-manual-takeover.tcl b/tests/cluster/tests/11-manual-takeover.tcl new file mode 100644 index 0000000..78a0f85 --- /dev/null +++ b/tests/cluster/tests/11-manual-takeover.tcl @@ -0,0 +1,71 @@ +# Manual takeover test + +source "../tests/includes/init-tests.tcl" + +test "Create a 5 nodes cluster" { + create_cluster 5 5 +} + +test "Cluster is up" { + assert_cluster_state ok +} + +test "Cluster is writable" { + cluster_write_test 0 +} + +# For this test, disable replica failover until +# all of the primaries are confirmed killed. Otherwise +# there might be enough time to elect a replica. 
+set replica_ids { 5 6 7 } +foreach id $replica_ids { + R $id config set cluster-replica-no-failover yes +} + +test "Killing majority of master nodes" { + kill_instance redis 0 + kill_instance redis 1 + kill_instance redis 2 +} + +foreach id $replica_ids { + R $id config set cluster-replica-no-failover no +} + +test "Cluster should eventually be down" { + assert_cluster_state fail +} + +test "Use takeover to bring slaves back" { + foreach id $replica_ids { + R $id cluster failover takeover + } +} + +test "Cluster should eventually be up again" { + assert_cluster_state ok +} + +test "Cluster is writable" { + cluster_write_test 4 +} + +test "Instance #5, #6, #7 are now masters" { + foreach id $replica_ids { + assert {[RI $id role] eq {master}} + } +} + +test "Restarting the previously killed master nodes" { + restart_instance redis 0 + restart_instance redis 1 + restart_instance redis 2 +} + +test "Instance #0, #1, #2 gets converted into a slaves" { + wait_for_condition 1000 50 { + [RI 0 role] eq {slave} && [RI 1 role] eq {slave} && [RI 2 role] eq {slave} + } else { + fail "Old masters not converted into slaves" + } +} diff --git a/tests/cluster/tests/12-replica-migration-2.tcl b/tests/cluster/tests/12-replica-migration-2.tcl new file mode 100644 index 0000000..ed68006 --- /dev/null +++ b/tests/cluster/tests/12-replica-migration-2.tcl @@ -0,0 +1,75 @@ +# Replica migration test #2. +# +# Check that the status of master that can be targeted by replica migration +# is acquired again, after being getting slots again, in a cluster where the +# other masters have slaves. + +source "../tests/includes/init-tests.tcl" +source "../../../tests/support/cli.tcl" + +# Create a cluster with 5 master and 15 slaves, to make sure there are no +# empty masters and make rebalancing simpler to handle during the test. +test "Create a 5 nodes cluster" { + cluster_create_with_continuous_slots 5 15 +} + +test "Cluster is up" { + assert_cluster_state ok +} + +test "Each master should have at least two replicas attached" { + foreach_redis_id id { + if {$id < 5} { + wait_for_condition 1000 50 { + [llength [lindex [R $id role] 2]] >= 2 + } else { + fail "Master #$id does not have 2 slaves as expected" + } + } + } +} + +test "Set allow-replica-migration yes" { + foreach_redis_id id { + R $id CONFIG SET cluster-allow-replica-migration yes + } +} + +set master0_id [dict get [get_myself 0] id] +test "Resharding all the master #0 slots away from it" { + set output [exec \ + ../../../src/redis-cli --cluster rebalance \ + 127.0.0.1:[get_instance_attrib redis 0 port] \ + {*}[rediscli_tls_config "../../../tests"] \ + --cluster-weight ${master0_id}=0 >@ stdout ] + +} + +test "Master #0 who lost all slots should turn into a replica without replicas" { + wait_for_condition 1000 50 { + [RI 0 role] == "slave" && [RI 0 connected_slaves] == 0 + } else { + puts [R 0 info replication] + fail "Master #0 didn't turn itself into a replica" + } +} + +test "Resharding back some slot to master #0" { + # Wait for the cluster config to propagate before attempting a + # new resharding. 
+ after 10000 + set output [exec \ + ../../../src/redis-cli --cluster rebalance \ + 127.0.0.1:[get_instance_attrib redis 0 port] \ + {*}[rediscli_tls_config "../../../tests"] \ + --cluster-weight ${master0_id}=.01 \ + --cluster-use-empty-masters >@ stdout] +} + +test "Master #0 should re-acquire one or more replicas" { + wait_for_condition 1000 50 { + [llength [lindex [R 0 role] 2]] >= 1 + } else { + fail "Master #0 has no has replicas" + } +} diff --git a/tests/cluster/tests/12.1-replica-migration-3.tcl b/tests/cluster/tests/12.1-replica-migration-3.tcl new file mode 100644 index 0000000..790c732 --- /dev/null +++ b/tests/cluster/tests/12.1-replica-migration-3.tcl @@ -0,0 +1,65 @@ +# Replica migration test #2. +# +# Check that if 'cluster-allow-replica-migration' is set to 'no', slaves do not +# migrate when master becomes empty. + +source "../tests/includes/init-tests.tcl" +source "../tests/includes/utils.tcl" + +# Create a cluster with 5 master and 15 slaves, to make sure there are no +# empty masters and make rebalancing simpler to handle during the test. +test "Create a 5 nodes cluster" { + cluster_create_with_continuous_slots 5 15 +} + +test "Cluster is up" { + assert_cluster_state ok +} + +test "Each master should have at least two replicas attached" { + foreach_redis_id id { + if {$id < 5} { + wait_for_condition 1000 50 { + [llength [lindex [R $id role] 2]] >= 2 + } else { + fail "Master #$id does not have 2 slaves as expected" + } + } + } +} + +test "Set allow-replica-migration no" { + foreach_redis_id id { + R $id CONFIG SET cluster-allow-replica-migration no + } +} + +set master0_id [dict get [get_myself 0] id] +test "Resharding all the master #0 slots away from it" { + set output [exec \ + ../../../src/redis-cli --cluster rebalance \ + 127.0.0.1:[get_instance_attrib redis 0 port] \ + {*}[rediscli_tls_config "../../../tests"] \ + --cluster-weight ${master0_id}=0 >@ stdout ] +} + +test "Wait cluster to be stable" { + wait_cluster_stable +} + +test "Master #0 still should have its replicas" { + assert { [llength [lindex [R 0 role] 2]] >= 2 } +} + +test "Each master should have at least two replicas attached" { + foreach_redis_id id { + if {$id < 5} { + wait_for_condition 1000 50 { + [llength [lindex [R $id role] 2]] >= 2 + } else { + fail "Master #$id does not have 2 slaves as expected" + } + } + } +} + diff --git a/tests/cluster/tests/13-no-failover-option.tcl b/tests/cluster/tests/13-no-failover-option.tcl new file mode 100644 index 0000000..befa598 --- /dev/null +++ b/tests/cluster/tests/13-no-failover-option.tcl @@ -0,0 +1,61 @@ +# Check that the no-failover option works + +source "../tests/includes/init-tests.tcl" + +test "Create a 5 nodes cluster" { + create_cluster 5 5 +} + +test "Cluster is up" { + assert_cluster_state ok +} + +test "Cluster is writable" { + cluster_write_test 0 +} + +test "Instance #5 is a slave" { + assert {[RI 5 role] eq {slave}} + + # Configure it to never failover the master + R 5 CONFIG SET cluster-slave-no-failover yes +} + +test "Instance #5 synced with the master" { + wait_for_condition 1000 50 { + [RI 5 master_link_status] eq {up} + } else { + fail "Instance #5 master link status is not up" + } +} + +test "The nofailover flag is propagated" { + set slave5_id [dict get [get_myself 5] id] + + foreach_redis_id id { + wait_for_condition 1000 50 { + [has_flag [get_node_by_id $id $slave5_id] nofailover] + } else { + fail "Instance $id can't see the nofailover flag of slave" + } + } +} + +set current_epoch [CI 1 cluster_current_epoch] + +test "Killing 
one master node" { + kill_instance redis 0 +} + +test "Cluster should be still down after some time" { + after 10000 + assert_cluster_state fail +} + +test "Instance #5 is still a slave" { + assert {[RI 5 role] eq {slave}} +} + +test "Restarting the previously killed master node" { + restart_instance redis 0 +} diff --git a/tests/cluster/tests/14-consistency-check.tcl b/tests/cluster/tests/14-consistency-check.tcl new file mode 100644 index 0000000..e3b9a19 --- /dev/null +++ b/tests/cluster/tests/14-consistency-check.tcl @@ -0,0 +1,124 @@ +source "../tests/includes/init-tests.tcl" +source "../../../tests/support/cli.tcl" + +test "Create a 5 nodes cluster" { + create_cluster 5 5 +} + +test "Cluster should start ok" { + assert_cluster_state ok +} + +test "Cluster is writable" { + cluster_write_test 0 +} + +proc find_non_empty_master {} { + set master_id_no {} + foreach_redis_id id { + if {[RI $id role] eq {master} && [R $id dbsize] > 0} { + set master_id_no $id + break + } + } + return $master_id_no +} + +proc get_one_of_my_replica {id} { + wait_for_condition 1000 50 { + [llength [lindex [R $id role] 2]] > 0 + } else { + fail "replicas didn't connect" + } + set replica_port [lindex [lindex [lindex [R $id role] 2] 0] 1] + set replica_id_num [get_instance_id_by_port redis $replica_port] + return $replica_id_num +} + +proc cluster_write_keys_with_expire {id ttl} { + set prefix [randstring 20 20 alpha] + set port [get_instance_attrib redis $id port] + set cluster [redis_cluster 127.0.0.1:$port] + for {set j 100} {$j < 200} {incr j} { + $cluster setex key_expire.$j $ttl $prefix.$j + } + $cluster close +} + +# make sure that replica who restarts from persistence will load keys +# that have already expired, critical for correct execution of commands +# that arrive from the master +proc test_slave_load_expired_keys {aof} { + test "Slave expired keys is loaded when restarted: appendonly=$aof" { + set master_id [find_non_empty_master] + set replica_id [get_one_of_my_replica $master_id] + + set master_dbsize_0 [R $master_id dbsize] + set replica_dbsize_0 [R $replica_id dbsize] + assert_equal $master_dbsize_0 $replica_dbsize_0 + + # config the replica persistency and rewrite the config file to survive restart + # note that this needs to be done before populating the volatile keys since + # that triggers and AOFRW, and we rather the AOF file to have 'SET PXAT' commands + # rather than an RDB with volatile keys + R $replica_id config set appendonly $aof + R $replica_id config rewrite + + # fill with 100 keys with 3 second TTL + set data_ttl 3 + cluster_write_keys_with_expire $master_id $data_ttl + + # wait for replica to be in sync with master + wait_for_condition 500 10 { + [R $replica_id dbsize] eq [R $master_id dbsize] + } else { + fail "replica didn't sync" + } + + set replica_dbsize_1 [R $replica_id dbsize] + assert {$replica_dbsize_1 > $replica_dbsize_0} + + # make replica create persistence file + if {$aof == "yes"} { + # we need to wait for the initial AOFRW to be done, otherwise + # kill_instance (which now uses SIGTERM will fail ("Writing initial AOF, can't exit") + wait_for_condition 100 10 { + [RI $replica_id aof_rewrite_scheduled] eq 0 && + [RI $replica_id aof_rewrite_in_progress] eq 0 + } else { + fail "AOFRW didn't finish" + } + } else { + R $replica_id save + } + + # kill the replica (would stay down until re-started) + kill_instance redis $replica_id + + # Make sure the master doesn't do active expire (sending DELs to the replica) + R $master_id DEBUG SET-ACTIVE-EXPIRE 0 + + # wait for 
all the keys to get logically expired + after [expr $data_ttl*1000] + + # start the replica again (loading an RDB or AOF file) + restart_instance redis $replica_id + + # make sure the keys are still there + set replica_dbsize_3 [R $replica_id dbsize] + assert {$replica_dbsize_3 > $replica_dbsize_0} + + # restore settings + R $master_id DEBUG SET-ACTIVE-EXPIRE 1 + + # wait for the master to expire all keys and replica to get the DELs + wait_for_condition 500 10 { + [R $replica_id dbsize] eq $master_dbsize_0 + } else { + fail "keys didn't expire" + } + } +} + +test_slave_load_expired_keys no +test_slave_load_expired_keys yes diff --git a/tests/cluster/tests/15-cluster-slots.tcl b/tests/cluster/tests/15-cluster-slots.tcl new file mode 100644 index 0000000..892e904 --- /dev/null +++ b/tests/cluster/tests/15-cluster-slots.tcl @@ -0,0 +1,128 @@ +source "../tests/includes/init-tests.tcl" + +proc cluster_allocate_mixedSlots {n} { + set slot 16383 + while {$slot >= 0} { + set node [expr {$slot % $n}] + lappend slots_$node $slot + incr slot -1 + } + for {set j 0} {$j < $n} {incr j} { + R $j cluster addslots {*}[set slots_${j}] + } +} + +proc create_cluster_with_mixedSlot {masters slaves} { + cluster_allocate_mixedSlots $masters + if {$slaves} { + cluster_allocate_slaves $masters $slaves + } + assert_cluster_state ok +} + +test "Create a 5 nodes cluster" { + create_cluster_with_mixedSlot 5 15 +} + +test "Cluster is up" { + assert_cluster_state ok +} + +test "Cluster is writable" { + cluster_write_test 0 +} + +test "Instance #5 is a slave" { + assert {[RI 5 role] eq {slave}} +} + +test "client do not break when cluster slot" { + R 0 config set client-output-buffer-limit "normal 33554432 16777216 60" + if { [catch {R 0 cluster slots}] } { + fail "output overflow when cluster slots" + } +} + +test "client can handle keys with hash tag" { + set cluster [redis_cluster 127.0.0.1:[get_instance_attrib redis 0 port]] + $cluster set foo{tag} bar + $cluster close +} + +test "slot migration is valid from primary to another primary" { + set cluster [redis_cluster 127.0.0.1:[get_instance_attrib redis 0 port]] + set key order1 + set slot [$cluster cluster keyslot $key] + array set nodefrom [$cluster masternode_for_slot $slot] + array set nodeto [$cluster masternode_notfor_slot $slot] + + assert_equal {OK} [$nodefrom(link) cluster setslot $slot node $nodeto(id)] + assert_equal {OK} [$nodeto(link) cluster setslot $slot node $nodeto(id)] +} + +test "slot migration is invalid from primary to replica" { + set cluster [redis_cluster 127.0.0.1:[get_instance_attrib redis 0 port]] + set key order1 + set slot [$cluster cluster keyslot $key] + array set nodefrom [$cluster masternode_for_slot $slot] + + # Get replica node serving slot. 
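    # CLUSTER REPLICAS returns one CLUSTER NODES-style description line per
    # replica, roughly of the form (address and epoch illustrative):
    #   <replica-id> 127.0.0.1:30006@40006 slave <master-id> 0 0 7 connected
    # so the parsing below only needs the first field, the replica's node id.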
+ set replicanodeinfo [$cluster cluster replicas $nodefrom(id)] + puts $replicanodeinfo + set args [split $replicanodeinfo " "] + set replicaid [lindex [split [lindex $args 0] \{] 1] + puts $replicaid + + catch {[$nodefrom(link) cluster setslot $slot node $replicaid]} err + assert_match "*Target node is not a master" $err +} + +proc count_bound_slots {n} { + set slot_count 0 + foreach slot_range_mapping [$n cluster slots] { + set start_slot [lindex $slot_range_mapping 0] + set end_slot [lindex $slot_range_mapping 1] + incr slot_count [expr $end_slot - $start_slot + 1] + } + return $slot_count + } + + test "slot must be unbound on the owner when it is deleted" { + set node0 [Rn 0] + set node1 [Rn 1] + assert {[count_bound_slots $node0] eq 16384} + assert {[count_bound_slots $node1] eq 16384} + + set slot_to_delete 0 + # Delete + $node0 CLUSTER DELSLOTS $slot_to_delete + + # Verify + # The node that owns the slot must unbind the slot that was deleted + wait_for_condition 1000 50 { + [count_bound_slots $node0] == 16383 + } else { + fail "Cluster slot deletion was not recorded on the node that owns the slot" + } + + # We don't propagate slot deletion across all nodes in the cluster. + # This can lead to extra redirect before the clients find out that the slot is unbound. + wait_for_condition 1000 50 { + [count_bound_slots $node1] == 16384 + } else { + fail "Cluster slot deletion should not be propagated to all nodes in the cluster" + } + } + +if {$::tls} { + test {CLUSTER SLOTS from non-TLS client in TLS cluster} { + set slots_tls [R 0 cluster slots] + set host [get_instance_attrib redis 0 host] + set plaintext_port [get_instance_attrib redis 0 plaintext-port] + set client_plain [redis $host $plaintext_port 0 0] + set slots_plain [$client_plain cluster slots] + $client_plain close + # Compare the ports in the first row + assert_no_match [lindex $slots_tls 0 3 1] [lindex $slots_plain 0 3 1] + } +} \ No newline at end of file diff --git a/tests/cluster/tests/16-transactions-on-replica.tcl b/tests/cluster/tests/16-transactions-on-replica.tcl new file mode 100644 index 0000000..ec5699c --- /dev/null +++ b/tests/cluster/tests/16-transactions-on-replica.tcl @@ -0,0 +1,79 @@ +# Check basic transactions on a replica. 
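+
+# Illustrative sketch only (not part of the original suite and not invoked by
+# it): reads on a cluster replica are answered with -MOVED until the
+# connection opts in with READONLY. The hypothetical helper below assumes an
+# already-connected replica link, mirroring the flow exercised by the tests
+# in this file.
+proc replica_read_example {replica key} {
+    # A plain GET on a replica is redirected to the slot owner (the primary).
+    catch {$replica GET $key} err    ;# err is expected to start with "MOVED"
+    # After READONLY the replica serves the (possibly stale) value itself.
+    $replica READONLY
+    return [$replica GET $key]
+}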
+ +source "../tests/includes/init-tests.tcl" + +test "Create a primary with a replica" { + create_cluster 1 1 +} + +test "Cluster should start ok" { + assert_cluster_state ok +} + +set primary [Rn 0] +set replica [Rn 1] + +test "Can't read from replica without READONLY" { + $primary SET a 1 + wait_for_ofs_sync $primary $replica + catch {$replica GET a} err + assert {[string range $err 0 4] eq {MOVED}} +} + +test "Can read from replica after READONLY" { + $replica READONLY + assert {[$replica GET a] eq {1}} +} + +test "Can perform HSET primary and HGET from replica" { + $primary HSET h a 1 + $primary HSET h b 2 + $primary HSET h c 3 + wait_for_ofs_sync $primary $replica + assert {[$replica HGET h a] eq {1}} + assert {[$replica HGET h b] eq {2}} + assert {[$replica HGET h c] eq {3}} +} + +test "Can MULTI-EXEC transaction of HGET operations from replica" { + $replica MULTI + assert {[$replica HGET h a] eq {QUEUED}} + assert {[$replica HGET h b] eq {QUEUED}} + assert {[$replica HGET h c] eq {QUEUED}} + assert {[$replica EXEC] eq {1 2 3}} +} + +test "MULTI-EXEC with write operations is MOVED" { + $replica MULTI + catch {$replica HSET h b 4} err + assert {[string range $err 0 4] eq {MOVED}} + catch {$replica exec} err + assert {[string range $err 0 8] eq {EXECABORT}} +} + +test "read-only blocking operations from replica" { + set rd [redis_deferring_client redis 1] + $rd readonly + $rd read + $rd XREAD BLOCK 0 STREAMS k 0 + + wait_for_condition 1000 50 { + [RI 1 blocked_clients] eq {1} + } else { + fail "client wasn't blocked" + } + + $primary XADD k * foo bar + set res [$rd read] + set res [lindex [lindex [lindex [lindex $res 0] 1] 0] 1] + assert {$res eq {foo bar}} + $rd close +} + +test "reply MOVED when eval from replica for update" { + catch {[$replica eval {#!lua + return redis.call('del','a') + } 1 a + ]} err + assert {[string range $err 0 4] eq {MOVED}} +} \ No newline at end of file diff --git a/tests/cluster/tests/17-diskless-load-swapdb.tcl b/tests/cluster/tests/17-diskless-load-swapdb.tcl new file mode 100644 index 0000000..7a56ec7 --- /dev/null +++ b/tests/cluster/tests/17-diskless-load-swapdb.tcl @@ -0,0 +1,86 @@ +# Check that replica keys and keys to slots map are right after failing to diskless load using SWAPDB. 
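+
+# Illustrative sketch only (not part of the original suite and not invoked by
+# it): the combination of settings that makes the replica load a full sync
+# straight from the socket into a temporary DB ("swapdb"), mirroring what the
+# test below configures. The proc name and arguments are hypothetical helpers
+# added for readability.
+proc configure_diskless_swapdb_example {master replica} {
+    # Replica side: load the incoming RDB stream without touching the disk,
+    # keeping the old dataset in a swapped DB until the load succeeds or fails.
+    $replica config set repl-diskless-load swapdb
+    # Master side: stream the RDB over the socket instead of writing a file,
+    # and start streaming immediately.
+    $master config set repl-diskless-sync yes
+    $master config set repl-diskless-sync-delay 0
+    # Slow down key serialization so the transfer can be interrupted mid-load.
+    $master config set rdb-key-save-delay 10000
+}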
+ +source "../tests/includes/init-tests.tcl" + +test "Create a primary with a replica" { + create_cluster 1 1 +} + +test "Cluster should start ok" { + assert_cluster_state ok +} + +test "Cluster is writable" { + cluster_write_test 0 +} + +test "Main db not affected when fail to diskless load" { + set master [Rn 0] + set replica [Rn 1] + set master_id 0 + set replica_id 1 + + $replica READONLY + $replica config set repl-diskless-load swapdb + $replica config set appendonly no + $replica config set save "" + $replica config rewrite + $master config set repl-backlog-size 1024 + $master config set repl-diskless-sync yes + $master config set repl-diskless-sync-delay 0 + $master config set rdb-key-save-delay 10000 + $master config set rdbcompression no + $master config set appendonly no + $master config set save "" + + # Write a key that belongs to slot 0 + set slot0_key "06S" + $master set $slot0_key 1 + wait_for_ofs_sync $master $replica + assert_equal {1} [$replica get $slot0_key] + assert_equal $slot0_key [$replica CLUSTER GETKEYSINSLOT 0 1] + + # Save an RDB and kill the replica + $replica save + kill_instance redis $replica_id + + # Delete the key from master + $master del $slot0_key + + # Replica must full sync with master when start because replication + # backlog size is very small, and dumping rdb will cost several seconds. + set num 10000 + set value [string repeat A 1024] + set rd [redis_deferring_client redis $master_id] + for {set j 0} {$j < $num} {incr j} { + $rd set $j $value + } + for {set j 0} {$j < $num} {incr j} { + $rd read + } + + # Start the replica again + restart_instance redis $replica_id + $replica READONLY + + # Start full sync, wait till after db started loading in background + wait_for_condition 500 10 { + [s $replica_id async_loading] eq 1 + } else { + fail "Fail to full sync" + } + + # Kill master, abort full sync + kill_instance redis $master_id + + # Start full sync, wait till the replica detects the disconnection + wait_for_condition 500 10 { + [s $replica_id async_loading] eq 0 + } else { + fail "Fail to full sync" + } + + # Replica keys and keys to slots map still both are right + assert_equal {1} [$replica get $slot0_key] + assert_equal $slot0_key [$replica CLUSTER GETKEYSINSLOT 0 1] +} diff --git a/tests/cluster/tests/18-info.tcl b/tests/cluster/tests/18-info.tcl new file mode 100644 index 0000000..68c62d3 --- /dev/null +++ b/tests/cluster/tests/18-info.tcl @@ -0,0 +1,45 @@ +# Check cluster info stats + +source "../tests/includes/init-tests.tcl" + +test "Create a primary with a replica" { + create_cluster 2 0 +} + +test "Cluster should start ok" { + assert_cluster_state ok +} + +set primary1 [Rn 0] +set primary2 [Rn 1] + +proc cmdstat {instance cmd} { + return [cmdrstat $cmd $instance] +} + +proc errorstat {instance cmd} { + return [errorrstat $cmd $instance] +} + +test "errorstats: rejected call due to MOVED Redirection" { + $primary1 config resetstat + $primary2 config resetstat + assert_match {} [errorstat $primary1 MOVED] + assert_match {} [errorstat $primary2 MOVED] + # we know that one will have a MOVED reply and one will succeed + catch {$primary1 set key b} replyP1 + catch {$primary2 set key b} replyP2 + # sort servers so we know which one failed + if {$replyP1 eq {OK}} { + assert_match {MOVED*} $replyP2 + set pok $primary1 + set perr $primary2 + } else { + assert_match {MOVED*} $replyP1 + set pok $primary2 + set perr $primary1 + } + assert_match {} [errorstat $pok MOVED] + assert_match {*count=1*} [errorstat $perr MOVED] + assert_match 
{*calls=0,*,rejected_calls=1,failed_calls=0} [cmdstat $perr set] +} diff --git a/tests/cluster/tests/19-cluster-nodes-slots.tcl b/tests/cluster/tests/19-cluster-nodes-slots.tcl new file mode 100644 index 0000000..77faec9 --- /dev/null +++ b/tests/cluster/tests/19-cluster-nodes-slots.tcl @@ -0,0 +1,50 @@ +# Optimize CLUSTER NODES command by generating all nodes slot topology firstly + +source "../tests/includes/init-tests.tcl" + +test "Create a 2 nodes cluster" { + cluster_create_with_continuous_slots 2 2 +} + +test "Cluster should start ok" { + assert_cluster_state ok +} + +set master1 [Rn 0] +set master2 [Rn 1] + +test "Continuous slots distribution" { + assert_match "* 0-8191*" [$master1 CLUSTER NODES] + assert_match "* 8192-16383*" [$master2 CLUSTER NODES] + assert_match "*0 8191*" [$master1 CLUSTER SLOTS] + assert_match "*8192 16383*" [$master2 CLUSTER SLOTS] + + $master1 CLUSTER DELSLOTS 4096 + assert_match "* 0-4095 4097-8191*" [$master1 CLUSTER NODES] + assert_match "*0 4095*4097 8191*" [$master1 CLUSTER SLOTS] + + + $master2 CLUSTER DELSLOTS 12288 + assert_match "* 8192-12287 12289-16383*" [$master2 CLUSTER NODES] + assert_match "*8192 12287*12289 16383*" [$master2 CLUSTER SLOTS] +} + +test "Discontinuous slots distribution" { + # Remove middle slots + $master1 CLUSTER DELSLOTS 4092 4094 + assert_match "* 0-4091 4093 4095 4097-8191*" [$master1 CLUSTER NODES] + assert_match "*0 4091*4093 4093*4095 4095*4097 8191*" [$master1 CLUSTER SLOTS] + $master2 CLUSTER DELSLOTS 12284 12286 + assert_match "* 8192-12283 12285 12287 12289-16383*" [$master2 CLUSTER NODES] + assert_match "*8192 12283*12285 12285*12287 12287*12289 16383*" [$master2 CLUSTER SLOTS] + + # Remove head slots + $master1 CLUSTER DELSLOTS 0 2 + assert_match "* 1 3-4091 4093 4095 4097-8191*" [$master1 CLUSTER NODES] + assert_match "*1 1*3 4091*4093 4093*4095 4095*4097 8191*" [$master1 CLUSTER SLOTS] + + # Remove tail slots + $master2 CLUSTER DELSLOTS 16380 16382 16383 + assert_match "* 8192-12283 12285 12287 12289-16379 16381*" [$master2 CLUSTER NODES] + assert_match "*8192 12283*12285 12285*12287 12287*12289 16379*16381 16381*" [$master2 CLUSTER SLOTS] +} diff --git a/tests/cluster/tests/20-half-migrated-slot.tcl b/tests/cluster/tests/20-half-migrated-slot.tcl new file mode 100644 index 0000000..229b3a8 --- /dev/null +++ b/tests/cluster/tests/20-half-migrated-slot.tcl @@ -0,0 +1,98 @@ +# Tests for fixing migrating slot at all stages: +# 1. when migration is half inited on "migrating" node +# 2. when migration is half inited on "importing" node +# 3. migration inited, but not finished +# 4. migration is half finished on "migrating" node +# 5. migration is half finished on "importing" node + +# TODO: Test is currently disabled until it is stabilized (fixing the test +# itself or real issues in Redis). 
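+
+# For reference, a complete (non-interrupted) migration of a single slot uses
+# the sequence sketched below; the tests in this file deliberately stop at
+# intermediate points and then rely on "redis-cli --cluster fix" to repair the
+# cluster. Illustrative sketch only, not invoked by the suite; src/dst are
+# assumed to be node handles like $nodefrom(link)/$nodeto(link).
+proc migrate_slot_example {src dst src_id dst_id dst_host dst_port slot key} {
+    $src cluster setslot $slot migrating $dst_id    ;# 1. mark migrating on the source
+    $dst cluster setslot $slot importing $src_id    ;# 2. mark importing on the target
+    $src migrate $dst_host $dst_port $key 0 10000   ;# 3. move the key(s)
+    $src cluster setslot $slot node $dst_id         ;# 4. finalize on the source
+    $dst cluster setslot $slot node $dst_id         ;# 5. finalize on the target
+}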
+ +if {false} { +source "../tests/includes/init-tests.tcl" +source "../tests/includes/utils.tcl" + +test "Create a 2 nodes cluster" { + create_cluster 2 0 + config_set_all_nodes cluster-allow-replica-migration no +} + +test "Cluster is up" { + assert_cluster_state ok +} + +set cluster [redis_cluster 127.0.0.1:[get_instance_attrib redis 0 port]] +catch {unset nodefrom} +catch {unset nodeto} + +proc reset_cluster {} { + uplevel 1 { + $cluster refresh_nodes_map + array set nodefrom [$cluster masternode_for_slot 609] + array set nodeto [$cluster masternode_notfor_slot 609] + } +} + +reset_cluster + +$cluster set aga xyz + +test "Half init migration in 'migrating' is fixable" { + assert_equal {OK} [$nodefrom(link) cluster setslot 609 migrating $nodeto(id)] + fix_cluster $nodefrom(addr) + assert_equal "xyz" [$cluster get aga] +} + +test "Half init migration in 'importing' is fixable" { + assert_equal {OK} [$nodeto(link) cluster setslot 609 importing $nodefrom(id)] + fix_cluster $nodefrom(addr) + assert_equal "xyz" [$cluster get aga] +} + +test "Init migration and move key" { + assert_equal {OK} [$nodefrom(link) cluster setslot 609 migrating $nodeto(id)] + assert_equal {OK} [$nodeto(link) cluster setslot 609 importing $nodefrom(id)] + assert_equal {OK} [$nodefrom(link) migrate $nodeto(host) $nodeto(port) aga 0 10000] + wait_for_cluster_propagation + assert_equal "xyz" [$cluster get aga] + fix_cluster $nodefrom(addr) + assert_equal "xyz" [$cluster get aga] +} + +reset_cluster + +test "Move key again" { + wait_for_cluster_propagation + assert_equal {OK} [$nodefrom(link) cluster setslot 609 migrating $nodeto(id)] + assert_equal {OK} [$nodeto(link) cluster setslot 609 importing $nodefrom(id)] + assert_equal {OK} [$nodefrom(link) migrate $nodeto(host) $nodeto(port) aga 0 10000] + wait_for_cluster_propagation + assert_equal "xyz" [$cluster get aga] +} + +test "Half-finish migration" { + # half finish migration on 'migrating' node + assert_equal {OK} [$nodefrom(link) cluster setslot 609 node $nodeto(id)] + fix_cluster $nodefrom(addr) + assert_equal "xyz" [$cluster get aga] +} + +reset_cluster + +test "Move key back" { + # 'aga' key is in 609 slot + assert_equal {OK} [$nodefrom(link) cluster setslot 609 migrating $nodeto(id)] + assert_equal {OK} [$nodeto(link) cluster setslot 609 importing $nodefrom(id)] + assert_equal {OK} [$nodefrom(link) migrate $nodeto(host) $nodeto(port) aga 0 10000] + assert_equal "xyz" [$cluster get aga] +} + +test "Half-finish importing" { + # Now we half finish 'importing' node + assert_equal {OK} [$nodeto(link) cluster setslot 609 node $nodeto(id)] + fix_cluster $nodefrom(addr) + assert_equal "xyz" [$cluster get aga] +} + +config_set_all_nodes cluster-allow-replica-migration yes +} diff --git a/tests/cluster/tests/21-many-slot-migration.tcl b/tests/cluster/tests/21-many-slot-migration.tcl new file mode 100644 index 0000000..1ac73dc --- /dev/null +++ b/tests/cluster/tests/21-many-slot-migration.tcl @@ -0,0 +1,64 @@ +# Tests for many simultaneous migrations. + +# TODO: Test is currently disabled until it is stabilized (fixing the test +# itself or real issues in Redis). + +if {false} { + +source "../tests/includes/init-tests.tcl" +source "../tests/includes/utils.tcl" + +# TODO: This test currently runs without replicas, as failovers (which may +# happen on lower-end CI platforms) are still not handled properly by the +# cluster during slot migration (related to #6339). 
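+
+# Illustrative sketch only (not part of the original suite and not invoked by
+# it): every key hashes to one of 16384 slots, so a hypothetical helper like
+# the one below can show which of the migrated slots a given set of keys lives
+# in, using the same CLUSTER KEYSLOT command used elsewhere in these tests.
+proc group_keys_by_slot_example {cluster keys} {
+    set by_slot [dict create]
+    foreach k $keys {
+        # CLUSTER KEYSLOT returns the slot (0..16383) the key hashes to.
+        dict lappend by_slot [$cluster cluster keyslot $k] $k
+    }
+    return $by_slot
+}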
+ +test "Create a 10 nodes cluster" { + create_cluster 10 0 + config_set_all_nodes cluster-allow-replica-migration no +} + +test "Cluster is up" { + assert_cluster_state ok +} + +set cluster [redis_cluster 127.0.0.1:[get_instance_attrib redis 0 port]] +catch {unset nodefrom} +catch {unset nodeto} + +$cluster refresh_nodes_map + +test "Set many keys" { + for {set i 0} {$i < 40000} {incr i} { + $cluster set key:$i val:$i + } +} + +test "Keys are accessible" { + for {set i 0} {$i < 40000} {incr i} { + assert { [$cluster get key:$i] eq "val:$i" } + } +} + +test "Init migration of many slots" { + for {set slot 0} {$slot < 1000} {incr slot} { + array set nodefrom [$cluster masternode_for_slot $slot] + array set nodeto [$cluster masternode_notfor_slot $slot] + + $nodefrom(link) cluster setslot $slot migrating $nodeto(id) + $nodeto(link) cluster setslot $slot importing $nodefrom(id) + } +} + +test "Fix cluster" { + wait_for_cluster_propagation + fix_cluster $nodefrom(addr) +} + +test "Keys are accessible" { + for {set i 0} {$i < 40000} {incr i} { + assert { [$cluster get key:$i] eq "val:$i" } + } +} + +config_set_all_nodes cluster-allow-replica-migration yes +} diff --git a/tests/cluster/tests/22-replica-in-sync.tcl b/tests/cluster/tests/22-replica-in-sync.tcl new file mode 100644 index 0000000..b5645aa --- /dev/null +++ b/tests/cluster/tests/22-replica-in-sync.tcl @@ -0,0 +1,146 @@ +source "../tests/includes/init-tests.tcl" + +test "Create a 1 node cluster" { + create_cluster 1 0 +} + +test "Cluster is up" { + assert_cluster_state ok +} + +test "Cluster is writable" { + cluster_write_test 0 +} + +proc is_in_slots {master_id replica} { + set slots [R $master_id cluster slots] + set found_position [string first $replica $slots] + set result [expr {$found_position != -1}] + return $result +} + +proc is_replica_online {info_repl} { + set found_position [string first "state=online" $info_repl] + set result [expr {$found_position != -1}] + return $result +} + +proc get_last_pong_time {node_id target_cid} { + foreach item [split [R $node_id cluster nodes] \n] { + set args [split $item " "] + if {[lindex $args 0] eq $target_cid} { + return [lindex $args 5] + } + } + fail "Target node ID was not present" +} + +set master_id 0 + +test "Fill up primary with data" { + # Set 1 MB of data + R $master_id debug populate 1000 key 1000 +} + +test "Add new node as replica" { + set replica_id 1 + set replica [R $replica_id CLUSTER MYID] + R $replica_id cluster replicate [R $master_id CLUSTER MYID] +} + +test "Check digest and replica state" { + wait_for_condition 1000 50 { + [is_in_slots $master_id $replica] + } else { + fail "New replica didn't appear in the slots" + } + + wait_for_condition 100 50 { + [is_replica_online [R $master_id info replication]] + } else { + fail "Replica is down for too long" + } + set replica_digest [R $replica_id debug digest] + assert {$replica_digest ne 0} +} + +test "Replica in loading state is hidden" { + # Kill replica client for master and load new data to the primary + R $master_id config set repl-backlog-size 100 + + # Set the key load delay so that it will take at least + # 2 seconds to fully load the data. + R $replica_id config set key-load-delay 4000 + + # Trigger event loop processing every 1024 bytes, this trigger + # allows us to send and receive cluster messages, so we are setting + # it low so that the cluster messages are sent more frequently. 
+ R $replica_id config set loading-process-events-interval-bytes 1024 + + R $master_id multi + R $master_id client kill type replica + set num 100 + set value [string repeat A 1024] + for {set j 0} {$j < $num} {incr j} { + set key "{0}" + append key $j + R $master_id set $key $value + } + R $master_id exec + + # The master will be the last to know the replica + # is loading, so we will wait on that and assert + # the replica is loading afterwards. + wait_for_condition 100 50 { + ![is_in_slots $master_id $replica] + } else { + fail "Replica was always present in cluster slots" + } + assert_equal 1 [s $replica_id loading] + + # Wait for the replica to finish full-sync and become online + wait_for_condition 200 50 { + [s $replica_id master_link_status] eq "up" + } else { + fail "Replica didn't finish loading" + } + + # Return configs to default values + R $replica_id config set loading-process-events-interval-bytes 2097152 + R $replica_id config set key-load-delay 0 + + # Check replica is back in cluster slots + wait_for_condition 100 50 { + [is_in_slots $master_id $replica] + } else { + fail "Replica is not back to slots" + } + assert_equal 1 [is_in_slots $replica_id $replica] +} + +test "Check disconnected replica not hidden from slots" { + # We want to disconnect the replica, but keep it alive so it can still gossip + + # Make sure that the replica will not be able to re-connect to the master + R $master_id config set requirepass asdf + + # Disconnect replica from primary + R $master_id client kill type replica + + # Check master to have no replicas + assert {[s $master_id connected_slaves] == 0} + + set replica_cid [R $replica_id cluster myid] + set initial_pong [get_last_pong_time $master_id $replica_cid] + wait_for_condition 50 100 { + $initial_pong != [get_last_pong_time $master_id $replica_cid] + } else { + fail "Primary never received gossip from replica" + } + + # Check that replica is still in the cluster slots + assert {[is_in_slots $master_id $replica]} + + # undo config + R $master_id config set requirepass "" +} diff --git a/tests/cluster/tests/23-multiple-slot-operations.tcl b/tests/cluster/tests/23-multiple-slot-operations.tcl new file mode 100644 index 0000000..060ab57 --- /dev/null +++ b/tests/cluster/tests/23-multiple-slot-operations.tcl @@ -0,0 +1,115 @@ +# Check the multiple slot add and remove commands + +source "../tests/includes/init-tests.tcl" + +proc cluster_allocate_with_continuous_slots_local {n} { + R 0 cluster ADDSLOTSRANGE 0 3276 + R 1 cluster ADDSLOTSRANGE 3277 6552 + R 2 cluster ADDSLOTSRANGE 6553 9828 + R 3 cluster ADDSLOTSRANGE 9829 13104 + R 4 cluster ADDSLOTSRANGE 13105 16383 +} + +proc cluster_create_with_continuous_slots_local {masters slaves} { + cluster_allocate_with_continuous_slots_local $masters + if {$slaves} { + cluster_allocate_slaves $masters $slaves + } + assert_cluster_state ok +} + + +test "Create a 5 nodes cluster" { + cluster_create_with_continuous_slots_local 5 5 +} + +test "Cluster should start ok" { + assert_cluster_state ok +} + +set master1 [Rn 0] +set master2 [Rn 1] +set master3 [Rn 2] +set master4 [Rn 3] +set master5 [Rn 4] + +test "Continuous slots distribution" { + assert_match "* 0-3276*" [$master1 CLUSTER NODES] + assert_match "* 3277-6552*" [$master2 CLUSTER NODES] + assert_match "* 6553-9828*" [$master3 CLUSTER NODES] + assert_match "* 9829-13104*" [$master4 CLUSTER NODES] + assert_match "* 13105-16383*" [$master5 CLUSTER NODES] + assert_match "*0 3276*" [$master1 CLUSTER SLOTS] + assert_match "*3277 6552*" [$master2 CLUSTER 
SLOTS] + assert_match "*6553 9828*" [$master3 CLUSTER SLOTS] + assert_match "*9829 13104*" [$master4 CLUSTER SLOTS] + assert_match "*13105 16383*" [$master5 CLUSTER SLOTS] + + $master1 CLUSTER DELSLOTSRANGE 3001 3050 + assert_match "* 0-3000 3051-3276*" [$master1 CLUSTER NODES] + assert_match "*0 3000*3051 3276*" [$master1 CLUSTER SLOTS] + + $master2 CLUSTER DELSLOTSRANGE 5001 5500 + assert_match "* 3277-5000 5501-6552*" [$master2 CLUSTER NODES] + assert_match "*3277 5000*5501 6552*" [$master2 CLUSTER SLOTS] + + $master3 CLUSTER DELSLOTSRANGE 7001 7100 8001 8500 + assert_match "* 6553-7000 7101-8000 8501-9828*" [$master3 CLUSTER NODES] + assert_match "*6553 7000*7101 8000*8501 9828*" [$master3 CLUSTER SLOTS] + + $master4 CLUSTER DELSLOTSRANGE 11001 12000 12101 12200 + assert_match "* 9829-11000 12001-12100 12201-13104*" [$master4 CLUSTER NODES] + assert_match "*9829 11000*12001 12100*12201 13104*" [$master4 CLUSTER SLOTS] + + $master5 CLUSTER DELSLOTSRANGE 13501 14000 15001 16000 + assert_match "* 13105-13500 14001-15000 16001-16383*" [$master5 CLUSTER NODES] + assert_match "*13105 13500*14001 15000*16001 16383*" [$master5 CLUSTER SLOTS] +} + +test "ADDSLOTSRANGE command with several boundary conditions test suite" { + # Add multiple slots with incorrect argument number + assert_error "ERR wrong number of arguments for 'cluster|addslotsrange' command" {R 0 cluster ADDSLOTSRANGE 3001 3020 3030} + + # Add multiple slots with invalid input slot + assert_error "ERR Invalid or out of range slot" {R 0 cluster ADDSLOTSRANGE 3001 3020 3030 aaa} + assert_error "ERR Invalid or out of range slot" {R 0 cluster ADDSLOTSRANGE 3001 3020 3030 70000} + assert_error "ERR Invalid or out of range slot" {R 0 cluster ADDSLOTSRANGE 3001 3020 -1000 3030} + + # Add multiple slots when start slot number is greater than the end slot + assert_error "ERR start slot number 3030 is greater than end slot number 3025" {R 0 cluster ADDSLOTSRANGE 3001 3020 3030 3025} + + # Add multiple slots with busy slot + assert_error "ERR Slot 3200 is already busy" {R 0 cluster ADDSLOTSRANGE 3001 3020 3200 3250} + + # Add multiple slots with assigned multiple times + assert_error "ERR Slot 3001 specified multiple times" {R 0 cluster ADDSLOTSRANGE 3001 3020 3001 3020} +} + +test "DELSLOTSRANGE command with several boundary conditions test suite" { + # Delete multiple slots with incorrect argument number + assert_error "ERR wrong number of arguments for 'cluster|delslotsrange' command" {R 0 cluster DELSLOTSRANGE 1000 2000 2100} + assert_match "* 0-3000 3051-3276*" [$master1 CLUSTER NODES] + assert_match "*0 3000*3051 3276*" [$master1 CLUSTER SLOTS] + + # Delete multiple slots with invalid input slot + assert_error "ERR Invalid or out of range slot" {R 0 cluster DELSLOTSRANGE 1000 2000 2100 aaa} + assert_error "ERR Invalid or out of range slot" {R 0 cluster DELSLOTSRANGE 1000 2000 2100 70000} + assert_error "ERR Invalid or out of range slot" {R 0 cluster DELSLOTSRANGE 1000 2000 -2100 2200} + assert_match "* 0-3000 3051-3276*" [$master1 CLUSTER NODES] + assert_match "*0 3000*3051 3276*" [$master1 CLUSTER SLOTS] + + # Delete multiple slots when start slot number is greater than the end slot + assert_error "ERR start slot number 5800 is greater than end slot number 5750" {R 1 cluster DELSLOTSRANGE 5600 5700 5800 5750} + assert_match "* 3277-5000 5501-6552*" [$master2 CLUSTER NODES] + assert_match "*3277 5000*5501 6552*" [$master2 CLUSTER SLOTS] + + # Delete multiple slots with already unassigned + assert_error "ERR Slot 7001 is already 
unassigned" {R 2 cluster DELSLOTSRANGE 7001 7100 9000 9200} + assert_match "* 6553-7000 7101-8000 8501-9828*" [$master3 CLUSTER NODES] + assert_match "*6553 7000*7101 8000*8501 9828*" [$master3 CLUSTER SLOTS] + + # Delete multiple slots with assigned multiple times + assert_error "ERR Slot 12500 specified multiple times" {R 3 cluster DELSLOTSRANGE 12500 12600 12500 12600} + assert_match "* 9829-11000 12001-12100 12201-13104*" [$master4 CLUSTER NODES] + assert_match "*9829 11000*12001 12100*12201 13104*" [$master4 CLUSTER SLOTS] +} diff --git a/tests/cluster/tests/24-links.tcl b/tests/cluster/tests/24-links.tcl new file mode 100644 index 0000000..d0ddea2 --- /dev/null +++ b/tests/cluster/tests/24-links.tcl @@ -0,0 +1,114 @@ +source "../tests/includes/init-tests.tcl" + +test "Create a cluster with two single-node shards" { + create_cluster 2 0 +} + +test "Cluster should start ok" { + assert_cluster_state ok +} + +proc number_of_peers {id} { + expr [llength [get_cluster_nodes $id]] - 1 +} + +proc number_of_links {id} { + llength [get_cluster_links $id] +} + +test "Each node has two links with each peer" { + foreach_redis_id id { + # Assert that from point of view of each node, there are two links for + # each peer. It might take a while for cluster to stabilize so wait up + # to 5 seconds. + wait_for_condition 50 100 { + [number_of_peers $id]*2 == [number_of_links $id] + } else { + assert_equal [expr [number_of_peers $id]*2] [number_of_links $id] + } + + set nodes [get_cluster_nodes $id] + set links [get_cluster_links $id] + + # For each peer there should be exactly one + # link "to" it and one link "from" it. + foreach n $nodes { + if {[has_flag $n myself]} continue + set peer [dict get $n id] + set to 0 + set from 0 + foreach l $links { + if {[dict get $l node] eq $peer} { + if {[dict get $l dir] eq "to"} { + incr to + } elseif {[dict get $l dir] eq "from"} { + incr from + } + } + } + assert {$to eq 1} + assert {$from eq 1} + } + } +} + +set primary1_id 0 +set primary2_id 1 + +set primary1 [Rn $primary1_id] +set primary2 [Rn $primary2_id] + +test "Disconnect link when send buffer limit reached" { + # On primary1, set timeout to 1 hour so links won't get disconnected due to timeouts + set oldtimeout [lindex [$primary1 CONFIG get cluster-node-timeout] 1] + $primary1 CONFIG set cluster-node-timeout [expr 60*60*1000] + + # Get primary1's links with primary2 + set primary2_name [dict get [get_myself $primary2_id] id] + set orig_link_p1_to_p2 [get_link_to_peer $primary1_id $primary2_name] + set orig_link_p1_from_p2 [get_link_from_peer $primary1_id $primary2_name] + + # On primary1, set cluster link send buffer limit to 256KB, which is large enough to not be + # overflowed by regular gossip messages but also small enough that it doesn't take too much + # memory to overflow it. If it is set too high, Redis may get OOM killed by kernel before this + # limit is overflowed in some RAM-limited test environments. + set oldlimit [lindex [$primary1 CONFIG get cluster-link-sendbuf-limit] 1] + $primary1 CONFIG set cluster-link-sendbuf-limit [expr 256*1024] + assert {[get_info_field [$primary1 cluster info] total_cluster_links_buffer_limit_exceeded] eq 0} + + # To manufacture an ever-growing send buffer from primary1 to primary2, + # make primary2 unresponsive. 
+ set primary2_pid [get_instance_attrib redis $primary2_id pid] + exec kill -SIGSTOP $primary2_pid + + # On primary1, send 128KB Pubsub messages in a loop until the send buffer of the link from + # primary1 to primary2 exceeds buffer limit therefore be dropped. + # For the send buffer to grow, we need to first exhaust TCP send buffer of primary1 and TCP + # receive buffer of primary2 first. The sizes of these two buffers vary by OS, but 100 128KB + # messages should be sufficient. + set i 0 + wait_for_condition 100 0 { + [catch {incr i} e] == 0 && + [catch {$primary1 publish channel [prepare_value [expr 128*1024]]} e] == 0 && + [catch {after 500} e] == 0 && + [get_info_field [$primary1 cluster info] total_cluster_links_buffer_limit_exceeded] >= 1 + } else { + fail "Cluster link not freed as expected" + } + puts -nonewline "$i 128KB messages needed to overflow 256KB buffer limit. " + + # A new link to primary2 should have been recreated + set new_link_p1_to_p2 [get_link_to_peer $primary1_id $primary2_name] + assert {[dict get $new_link_p1_to_p2 create_time] > [dict get $orig_link_p1_to_p2 create_time]} + + # Link from primary2 should not be affected + set same_link_p1_from_p2 [get_link_from_peer $primary1_id $primary2_name] + assert {[dict get $same_link_p1_from_p2 create_time] eq [dict get $orig_link_p1_from_p2 create_time]} + + # Revive primary2 + exec kill -SIGCONT $primary2_pid + + # Reset configs on primary1 so config changes don't leak out to other tests + $primary1 CONFIG set cluster-node-timeout $oldtimeout + $primary1 CONFIG set cluster-link-sendbuf-limit $oldlimit +} diff --git a/tests/cluster/tests/25-pubsubshard-slot-migration.tcl b/tests/cluster/tests/25-pubsubshard-slot-migration.tcl new file mode 100644 index 0000000..0f59ffe --- /dev/null +++ b/tests/cluster/tests/25-pubsubshard-slot-migration.tcl @@ -0,0 +1,171 @@ +source "../tests/includes/init-tests.tcl" + +test "Create a 3 nodes cluster" { + cluster_create_with_continuous_slots 3 3 +} + +test "Cluster is up" { + assert_cluster_state ok +} + +set cluster [redis_cluster 127.0.0.1:[get_instance_attrib redis 0 port]] + +test "Migrate a slot, verify client receives sunsubscribe on primary serving the slot." { + + # Setup the to and from node + set channelname mychannel + set slot [$cluster cluster keyslot $channelname] + array set nodefrom [$cluster masternode_for_slot $slot] + array set nodeto [$cluster masternode_notfor_slot $slot] + + set subscribeclient [redis_deferring_client_by_addr $nodefrom(host) $nodefrom(port)] + + $subscribeclient deferred 1 + $subscribeclient ssubscribe $channelname + $subscribeclient read + + assert_equal {OK} [$nodefrom(link) cluster setslot $slot migrating $nodeto(id)] + assert_equal {OK} [$nodeto(link) cluster setslot $slot importing $nodefrom(id)] + + # Verify subscribe is still valid, able to receive messages. + $nodefrom(link) spublish $channelname hello + assert_equal {smessage mychannel hello} [$subscribeclient read] + + assert_equal {OK} [$nodefrom(link) cluster setslot $slot node $nodeto(id)] + + set msg [$subscribeclient read] + assert {"sunsubscribe" eq [lindex $msg 0]} + assert {$channelname eq [lindex $msg 1]} + assert {"0" eq [lindex $msg 2]} + + assert_equal {OK} [$nodeto(link) cluster setslot $slot node $nodeto(id)] + + $subscribeclient close +} + +test "Client subscribes to multiple channels, migrate a slot, verify client receives sunsubscribe on primary serving the slot." 
{ + + # Setup the to and from node + set channelname ch3 + set anotherchannelname ch7 + set slot [$cluster cluster keyslot $channelname] + array set nodefrom [$cluster masternode_for_slot $slot] + array set nodeto [$cluster masternode_notfor_slot $slot] + + set subscribeclient [redis_deferring_client_by_addr $nodefrom(host) $nodefrom(port)] + + $subscribeclient deferred 1 + $subscribeclient ssubscribe $channelname + $subscribeclient read + + $subscribeclient ssubscribe $anotherchannelname + $subscribeclient read + + assert_equal {OK} [$nodefrom(link) cluster setslot $slot migrating $nodeto(id)] + assert_equal {OK} [$nodeto(link) cluster setslot $slot importing $nodefrom(id)] + + # Verify subscribe is still valid, able to receive messages. + $nodefrom(link) spublish $channelname hello + assert_equal {smessage ch3 hello} [$subscribeclient read] + + assert_equal {OK} [$nodefrom(link) cluster setslot $slot node $nodeto(id)] + + # Verify the client receives sunsubscribe message for the channel(slot) which got migrated. + set msg [$subscribeclient read] + assert {"sunsubscribe" eq [lindex $msg 0]} + assert {$channelname eq [lindex $msg 1]} + assert {"1" eq [lindex $msg 2]} + + assert_equal {OK} [$nodeto(link) cluster setslot $slot node $nodeto(id)] + + $nodefrom(link) spublish $anotherchannelname hello + + # Verify the client is still connected and receives message from the other channel. + set msg [$subscribeclient read] + assert {"smessage" eq [lindex $msg 0]} + assert {$anotherchannelname eq [lindex $msg 1]} + assert {"hello" eq [lindex $msg 2]} + + $subscribeclient close +} + +test "Migrate a slot, verify client receives sunsubscribe on replica serving the slot." { + + # Setup the to and from node + set channelname mychannel1 + set slot [$cluster cluster keyslot $channelname] + array set nodefrom [$cluster masternode_for_slot $slot] + array set nodeto [$cluster masternode_notfor_slot $slot] + + # Get replica node serving slot (mychannel) to connect a client. + set replicanodeinfo [$cluster cluster replicas $nodefrom(id)] + set args [split $replicanodeinfo " "] + set addr [lindex [split [lindex $args 1] @] 0] + set replicahost [lindex [split $addr :] 0] + set replicaport [lindex [split $addr :] 1] + set subscribeclient [redis_deferring_client_by_addr $replicahost $replicaport] + + $subscribeclient deferred 1 + $subscribeclient ssubscribe $channelname + $subscribeclient read + + assert_equal {OK} [$nodefrom(link) cluster setslot $slot migrating $nodeto(id)] + assert_equal {OK} [$nodeto(link) cluster setslot $slot importing $nodefrom(id)] + + # Verify subscribe is still valid, able to receive messages. 
+ $nodefrom(link) spublish $channelname hello + assert_equal {smessage mychannel1 hello} [$subscribeclient read] + + assert_equal {OK} [$nodefrom(link) cluster setslot $slot node $nodeto(id)] + assert_equal {OK} [$nodeto(link) cluster setslot $slot node $nodeto(id)] + + set msg [$subscribeclient read] + assert {"sunsubscribe" eq [lindex $msg 0]} + assert {$channelname eq [lindex $msg 1]} + assert {"0" eq [lindex $msg 2]} + + $subscribeclient close +} + +test "Delete a slot, verify sunsubscribe message" { + set channelname ch2 + set slot [$cluster cluster keyslot $channelname] + + array set primary_client [$cluster masternode_for_slot $slot] + + set subscribeclient [redis_deferring_client_by_addr $primary_client(host) $primary_client(port)] + $subscribeclient deferred 1 + $subscribeclient ssubscribe $channelname + $subscribeclient read + + $primary_client(link) cluster DELSLOTS $slot + + set msg [$subscribeclient read] + assert {"sunsubscribe" eq [lindex $msg 0]} + assert {$channelname eq [lindex $msg 1]} + assert {"0" eq [lindex $msg 2]} + + $subscribeclient close +} + +test "Reset cluster, verify sunsubscribe message" { + set channelname ch4 + set slot [$cluster cluster keyslot $channelname] + + array set primary_client [$cluster masternode_for_slot $slot] + + set subscribeclient [redis_deferring_client_by_addr $primary_client(host) $primary_client(port)] + $subscribeclient deferred 1 + $subscribeclient ssubscribe $channelname + $subscribeclient read + + $cluster cluster reset HARD + + set msg [$subscribeclient read] + assert {"sunsubscribe" eq [lindex $msg 0]} + assert {$channelname eq [lindex $msg 1]} + assert {"0" eq [lindex $msg 2]} + + $cluster close + $subscribeclient close +} \ No newline at end of file diff --git a/tests/cluster/tests/26-pubsubshard.tcl b/tests/cluster/tests/26-pubsubshard.tcl new file mode 100644 index 0000000..2619eda --- /dev/null +++ b/tests/cluster/tests/26-pubsubshard.tcl @@ -0,0 +1,94 @@ +# Test PUBSUB shard propagation in a cluster slot. 
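+
+# Illustrative sketch only (not part of the original suite and not invoked by
+# it): shard channels hash to a slot exactly like keys do, so SSUBSCRIBE and
+# SPUBLISH must be issued against the shard that owns the channel's slot;
+# other nodes reply with -MOVED. The hypothetical helper below mirrors the
+# flow the tests in this file exercise with deferring clients.
+proc spublish_example {cluster payload} {
+    set slot [$cluster cluster keyslot "channel.0"]
+    array set owner [$cluster masternode_for_slot $slot]
+    set publisher [redis_client_by_addr $owner(host) $owner(port)]
+    # SPUBLISH returns how many shard subscribers received the message.
+    set receivers [$publisher spublish channel.0 $payload]
+    $publisher close
+    return $receivers
+}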
+ +source "../tests/includes/init-tests.tcl" + +test "Create a 3 nodes cluster" { + cluster_create_with_continuous_slots 3 3 +} + +set cluster [redis_cluster 127.0.0.1:[get_instance_attrib redis 0 port]] +test "Pub/Sub shard basics" { + + set slot [$cluster cluster keyslot "channel.0"] + array set publishnode [$cluster masternode_for_slot $slot] + array set notshardnode [$cluster masternode_notfor_slot $slot] + + set publishclient [redis_client_by_addr $publishnode(host) $publishnode(port)] + set subscribeclient [redis_deferring_client_by_addr $publishnode(host) $publishnode(port)] + set subscribeclient2 [redis_deferring_client_by_addr $publishnode(host) $publishnode(port)] + set anotherclient [redis_deferring_client_by_addr $notshardnode(host) $notshardnode(port)] + + $subscribeclient ssubscribe channel.0 + $subscribeclient read + + $subscribeclient2 ssubscribe channel.0 + $subscribeclient2 read + + $anotherclient ssubscribe channel.0 + catch {$anotherclient read} err + assert_match {MOVED *} $err + + set data [randomValue] + $publishclient spublish channel.0 $data + + set msg [$subscribeclient read] + assert_equal $data [lindex $msg 2] + + set msg [$subscribeclient2 read] + assert_equal $data [lindex $msg 2] + + $publishclient close + $subscribeclient close + $subscribeclient2 close + $anotherclient close +} + +test "client can't subscribe to multiple shard channels across different slots in same call" { + catch {$cluster ssubscribe channel.0 channel.1} err + assert_match {CROSSSLOT Keys*} $err +} + +test "client can subscribe to multiple shard channels across different slots in separate call" { + $cluster ssubscribe ch3 + $cluster ssubscribe ch7 + + $cluster sunsubscribe ch3 + $cluster sunsubscribe ch7 +} + + +test "Verify Pub/Sub and Pub/Sub shard no overlap" { + set slot [$cluster cluster keyslot "channel.0"] + array set publishnode [$cluster masternode_for_slot $slot] + array set notshardnode [$cluster masternode_notfor_slot $slot] + + set publishshardclient [redis_client_by_addr $publishnode(host) $publishnode(port)] + set publishclient [redis_deferring_client_by_addr $publishnode(host) $publishnode(port)] + set subscribeshardclient [redis_deferring_client_by_addr $publishnode(host) $publishnode(port)] + set subscribeclient [redis_deferring_client_by_addr $publishnode(host) $publishnode(port)] + + $subscribeshardclient deferred 1 + $subscribeshardclient ssubscribe channel.0 + $subscribeshardclient read + + $subscribeclient deferred 1 + $subscribeclient subscribe channel.0 + $subscribeclient read + + set sharddata "testingpubsubdata" + $publishshardclient spublish channel.0 $sharddata + + set data "somemoredata" + $publishclient publish channel.0 $data + + set msg [$subscribeshardclient read] + assert_equal $sharddata [lindex $msg 2] + + set msg [$subscribeclient read] + assert_equal $data [lindex $msg 2] + + $cluster close + $publishclient close + $subscribeclient close + $subscribeshardclient close +} \ No newline at end of file diff --git a/tests/cluster/tests/27-endpoints.tcl b/tests/cluster/tests/27-endpoints.tcl new file mode 100644 index 0000000..32e3e79 --- /dev/null +++ b/tests/cluster/tests/27-endpoints.tcl @@ -0,0 +1,219 @@ +source "../tests/includes/init-tests.tcl" + +# Isolate a node from the cluster and give it a new nodeid +proc isolate_node {id} { + set node_id [R $id CLUSTER MYID] + R 6 CLUSTER RESET HARD + for {set j 0} {$j < 20} {incr j} { + if { $j eq $id } { + continue + } + R $j CLUSTER FORGET $node_id + } +} + +proc get_slot_field {slot_output shard_id node_id 
attrib_id} { + return [lindex [lindex [lindex $slot_output $shard_id] $node_id] $attrib_id] +} + +test "Create a 6 nodes cluster" { + cluster_create_with_continuous_slots 3 3 +} + +test "Cluster should start ok" { + assert_cluster_state ok + wait_for_cluster_propagation +} + +test "Set cluster hostnames and verify they are propagated" { + for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} { + R $j config set cluster-announce-hostname "host-$j.com" + } + + wait_for_condition 50 100 { + [are_hostnames_propagated "host-*.com"] eq 1 + } else { + fail "cluster hostnames were not propagated" + } + + # Now that everything is propagated, assert everyone agrees + wait_for_cluster_propagation +} + +test "Update hostnames and make sure they are all eventually propagated" { + for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} { + R $j config set cluster-announce-hostname "host-updated-$j.com" + } + + wait_for_condition 50 100 { + [are_hostnames_propagated "host-updated-*.com"] eq 1 + } else { + fail "cluster hostnames were not propagated" + } + + # Now that everything is propagated, assert everyone agrees + wait_for_cluster_propagation +} + +test "Remove hostnames and make sure they are all eventually propagated" { + for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} { + R $j config set cluster-announce-hostname "" + } + + wait_for_condition 50 100 { + [are_hostnames_propagated ""] eq 1 + } else { + fail "cluster hostnames were not propagated" + } + + # Now that everything is propagated, assert everyone agrees + wait_for_cluster_propagation +} + +test "Verify cluster-preferred-endpoint-type behavior for redirects and info" { + R 0 config set cluster-announce-hostname "me.com" + R 1 config set cluster-announce-hostname "" + R 2 config set cluster-announce-hostname "them.com" + + wait_for_cluster_propagation + + # Verify default behavior + set slot_result [R 0 cluster slots] + assert_equal "" [lindex [get_slot_field $slot_result 0 2 0] 1] + assert_equal "" [lindex [get_slot_field $slot_result 2 2 0] 1] + assert_equal "hostname" [lindex [get_slot_field $slot_result 0 2 3] 0] + assert_equal "me.com" [lindex [get_slot_field $slot_result 0 2 3] 1] + assert_equal "hostname" [lindex [get_slot_field $slot_result 2 2 3] 0] + assert_equal "them.com" [lindex [get_slot_field $slot_result 2 2 3] 1] + + # Redirect will use the IP address + catch {R 0 set foo foo} redir_err + assert_match "MOVED * 127.0.0.1:*" $redir_err + + # Verify prefer hostname behavior + R 0 config set cluster-preferred-endpoint-type hostname + + set slot_result [R 0 cluster slots] + assert_equal "me.com" [get_slot_field $slot_result 0 2 0] + assert_equal "them.com" [get_slot_field $slot_result 2 2 0] + + # Redirect should use hostname + catch {R 0 set foo foo} redir_err + assert_match "MOVED * them.com:*" $redir_err + + # Redirect to an unknown hostname returns ? 
+ catch {R 0 set barfoo bar} redir_err + assert_match "MOVED * ?:*" $redir_err + + # Verify unknown hostname behavior + R 0 config set cluster-preferred-endpoint-type unknown-endpoint + + # Verify default behavior + set slot_result [R 0 cluster slots] + assert_equal "ip" [lindex [get_slot_field $slot_result 0 2 3] 0] + assert_equal "127.0.0.1" [lindex [get_slot_field $slot_result 0 2 3] 1] + assert_equal "ip" [lindex [get_slot_field $slot_result 2 2 3] 0] + assert_equal "127.0.0.1" [lindex [get_slot_field $slot_result 2 2 3] 1] + assert_equal "ip" [lindex [get_slot_field $slot_result 1 2 3] 0] + assert_equal "127.0.0.1" [lindex [get_slot_field $slot_result 1 2 3] 1] + # Not required by the protocol, but IP comes before hostname + assert_equal "hostname" [lindex [get_slot_field $slot_result 0 2 3] 2] + assert_equal "me.com" [lindex [get_slot_field $slot_result 0 2 3] 3] + assert_equal "hostname" [lindex [get_slot_field $slot_result 2 2 3] 2] + assert_equal "them.com" [lindex [get_slot_field $slot_result 2 2 3] 3] + + # This node doesn't have a hostname + assert_equal 2 [llength [get_slot_field $slot_result 1 2 3]] + + # Redirect should use empty string + catch {R 0 set foo foo} redir_err + assert_match "MOVED * :*" $redir_err + + R 0 config set cluster-preferred-endpoint-type ip +} + +test "Verify the nodes configured with prefer hostname only show hostname for new nodes" { + # Have everyone forget node 6 and isolate it from the cluster. + isolate_node 6 + + # Set hostnames for the primaries, now that the node is isolated + R 0 config set cluster-announce-hostname "shard-1.com" + R 1 config set cluster-announce-hostname "shard-2.com" + R 2 config set cluster-announce-hostname "shard-3.com" + + # Prevent Node 0 and Node 6 from properly meeting, + # they'll hang in the handshake phase. This allows us to + # test the case where we "know" about it but haven't + # successfully retrieved information about it yet. + R 0 DEBUG DROP-CLUSTER-PACKET-FILTER 0 + R 6 DEBUG DROP-CLUSTER-PACKET-FILTER 0 + + # Have a replica meet the isolated node + R 3 cluster meet 127.0.0.1 [get_instance_attrib redis 6 port] + + # Wait for the isolated node to learn about the rest of the cluster, + # which correspond to a single entry in cluster nodes. Note this + # doesn't mean the isolated node has successfully contacted each + # node. + wait_for_condition 50 100 { + [llength [split [R 6 CLUSTER NODES] "\n"]] eq 21 + } else { + fail "Isolated node didn't learn about the rest of the cluster *" + } + + # Now, we wait until the two nodes that aren't filtering packets + # to accept our isolated nodes connections. At this point they will + # start showing up in cluster slots. + wait_for_condition 50 100 { + [llength [R 6 CLUSTER SLOTS]] eq 2 + } else { + fail "Node did not learn about the 2 shards it can talk to" + } + set slot_result [R 6 CLUSTER SLOTS] + assert_equal [lindex [get_slot_field $slot_result 0 2 3] 1] "shard-2.com" + assert_equal [lindex [get_slot_field $slot_result 1 2 3] 1] "shard-3.com" + + # Also make sure we know about the isolated primary, we + # just can't reach it. 
+ set primary_id [R 0 CLUSTER MYID] + assert_match "*$primary_id*" [R 6 CLUSTER NODES] + + # Stop dropping cluster packets, and make sure everything + # stabilizes + R 0 DEBUG DROP-CLUSTER-PACKET-FILTER -1 + R 6 DEBUG DROP-CLUSTER-PACKET-FILTER -1 + + wait_for_condition 50 100 { + [llength [R 6 CLUSTER SLOTS]] eq 3 + } else { + fail "Node did not learn about the 2 shards it can talk to" + } + set slot_result [R 6 CLUSTER SLOTS] + assert_equal [lindex [get_slot_field $slot_result 0 2 3] 1] "shard-1.com" + assert_equal [lindex [get_slot_field $slot_result 1 2 3] 1] "shard-2.com" + assert_equal [lindex [get_slot_field $slot_result 2 2 3] 1] "shard-3.com" +} + +test "Test restart will keep hostname information" { + # Set a new hostname, reboot and make sure it sticks + R 0 config set cluster-announce-hostname "restart-1.com" + # Store the hostname in the config + R 0 config rewrite + kill_instance redis 0 + restart_instance redis 0 + set slot_result [R 0 CLUSTER SLOTS] + assert_equal [lindex [get_slot_field $slot_result 0 2 3] 1] "restart-1.com" + + # As a sanity check, make sure everyone eventually agrees + wait_for_cluster_propagation +} + +test "Test hostname validation" { + catch {R 0 config set cluster-announce-hostname [string repeat x 256]} err + assert_match "*Hostnames must be less than 256 characters*" $err + catch {R 0 config set cluster-announce-hostname "?.com"} err + assert_match "*Hostnames may only contain alphanumeric characters, hyphens or dots*" $err + + # Note this isn't a valid hostname, but it passes our internal validation + R 0 config set cluster-announce-hostname "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-." +} \ No newline at end of file diff --git a/tests/cluster/tests/28-cluster-shards.tcl b/tests/cluster/tests/28-cluster-shards.tcl new file mode 100644 index 0000000..8d218cb --- /dev/null +++ b/tests/cluster/tests/28-cluster-shards.tcl @@ -0,0 +1,202 @@ +source "../tests/includes/init-tests.tcl" + +# Initial slot distribution. +set ::slot0 [list 0 1000 1002 5459 5461 5461 10926 10926] +set ::slot1 [list 5460 5460 5462 10922 10925 10925] +set ::slot2 [list 10923 10924 10927 16383] +set ::slot3 [list 1001 1001] + +proc cluster_create_with_split_slots {masters replicas} { + for {set j 0} {$j < $masters} {incr j} { + R $j cluster ADDSLOTSRANGE {*}[set ::slot${j}] + } + if {$replicas} { + cluster_allocate_slaves $masters $replicas + } + set ::cluster_master_nodes $masters + set ::cluster_replica_nodes $replicas +} + +# Get the node info with the specific node_id from the +# given reference node. 
Valid type options are "node" and "shard" +proc get_node_info_from_shard {id reference {type node}} { + set shards_response [R $reference CLUSTER SHARDS] + foreach shard_response $shards_response { + set nodes [dict get $shard_response nodes] + foreach node $nodes { + if {[dict get $node id] eq $id} { + if {$type eq "node"} { + return $node + } elseif {$type eq "shard"} { + return $shard_response + } else { + return {} + } + } + } + } + # No shard found, return nothing + return {} +} + +test "Create a 8 nodes cluster with 4 shards" { + cluster_create_with_split_slots 4 4 +} + +test "Cluster should start ok" { + assert_cluster_state ok +} + +test "Set cluster hostnames and verify they are propagated" { + for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} { + R $j config set cluster-announce-hostname "host-$j.com" + } + + # Wait for everyone to agree about the state + wait_for_cluster_propagation +} + +test "Verify information about the shards" { + set ids {} + for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} { + lappend ids [R $j CLUSTER MYID] + } + set slots [list $::slot0 $::slot1 $::slot2 $::slot3 $::slot0 $::slot1 $::slot2 $::slot3] + + # Verify on each node (primary/replica), the response of the `CLUSTER SLOTS` command is consistent. + for {set ref 0} {$ref < $::cluster_master_nodes + $::cluster_replica_nodes} {incr ref} { + for {set i 0} {$i < $::cluster_master_nodes + $::cluster_replica_nodes} {incr i} { + assert_equal [lindex $slots $i] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "shard"] slots] + assert_equal "host-$i.com" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] hostname] + assert_equal "127.0.0.1" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] ip] + # Default value of 'cluster-preferred-endpoint-type' is ip. + assert_equal "127.0.0.1" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] endpoint] + + if {$::tls} { + assert_equal [get_instance_attrib redis $i plaintext-port] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] port] + assert_equal [get_instance_attrib redis $i port] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] tls-port] + } else { + assert_equal [get_instance_attrib redis $i port] [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] port] + } + + if {$i < 4} { + assert_equal "master" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] role] + assert_equal "online" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] health] + } else { + assert_equal "replica" [dict get [get_node_info_from_shard [lindex $ids $i] $ref "node"] role] + # Replica could be in online or loading + } + } + } +} + +test "Verify no slot shard" { + # Node 8 has no slots assigned + set node_8_id [R 8 CLUSTER MYID] + assert_equal {} [dict get [get_node_info_from_shard $node_8_id 8 "shard"] slots] + assert_equal {} [dict get [get_node_info_from_shard $node_8_id 0 "shard"] slots] +} + +set node_0_id [R 0 CLUSTER MYID] + +test "Kill a node and tell the replica to immediately takeover" { + kill_instance redis 0 + R 4 cluster failover force +} + +# Primary 0 node should report as fail, wait until the new primary acknowledges it. 
+test "Verify health as fail for killed node" { + wait_for_condition 50 100 { + "fail" eq [dict get [get_node_info_from_shard $node_0_id 4 "node"] "health"] + } else { + fail "New primary never detected the node failed" + } +} + +set primary_id 4 +set replica_id 0 + +test "Restarting primary node" { + restart_instance redis $replica_id +} + +test "Instance #0 gets converted into a replica" { + wait_for_condition 1000 50 { + [RI $replica_id role] eq {slave} + } else { + fail "Old primary was not converted into replica" + } +} + +test "Test the replica reports a loading state while it's loading" { + # Test the command is good for verifying everything moves to a happy state + set replica_cluster_id [R $replica_id CLUSTER MYID] + wait_for_condition 50 1000 { + [dict get [get_node_info_from_shard $replica_cluster_id $primary_id "node"] health] eq "online" + } else { + fail "Replica never transitioned to online" + } + + # Set 1 MB of data, so there is something to load on full sync + R $primary_id debug populate 1000 key 1000 + + # Kill replica client for primary and load new data to the primary + R $primary_id config set repl-backlog-size 100 + + # Set the key load delay so that it will take at least + # 2 seconds to fully load the data. + R $replica_id config set key-load-delay 4000 + + # Trigger event loop processing every 1024 bytes, this trigger + # allows us to send and receive cluster messages, so we are setting + # it low so that the cluster messages are sent more frequently. + R $replica_id config set loading-process-events-interval-bytes 1024 + + R $primary_id multi + R $primary_id client kill type replica + # populate the correct data + set num 100 + set value [string repeat A 1024] + for {set j 0} {$j < $num} {incr j} { + # Use hashtag valid for shard #0 + set key "{ch3}$j" + R $primary_id set $key $value + } + R $primary_id exec + + # The replica should reconnect and start a full sync, it will gossip about it's health to the primary. + wait_for_condition 50 1000 { + "loading" eq [dict get [get_node_info_from_shard $replica_cluster_id $primary_id "node"] health] + } else { + fail "Replica never transitioned to loading" + } + + # Speed up the key loading and verify everything resumes + R $replica_id config set key-load-delay 0 + + wait_for_condition 50 1000 { + "online" eq [dict get [get_node_info_from_shard $replica_cluster_id $primary_id "node"] health] + } else { + fail "Replica never transitioned to online" + } + + # Final sanity, the replica agrees it is online. + assert_equal "online" [dict get [get_node_info_from_shard $replica_cluster_id $replica_id "node"] health] +} + +test "Regression test for a crash when calling SHARDS during handshake" { + # Reset forget a node, so we can use it to establish handshaking connections + set id [R 19 CLUSTER MYID] + R 19 CLUSTER RESET HARD + for {set i 0} {$i < 19} {incr i} { + R $i CLUSTER FORGET $id + } + R 19 cluster meet 127.0.0.1 [get_instance_attrib redis 0 port] + # This should line would previously crash, since all the outbound + # connections were in handshake state. + R 19 CLUSTER SHARDS +} + +test "Cluster is up" { + assert_cluster_state ok +} diff --git a/tests/cluster/tests/29-slot-migration-response.tcl b/tests/cluster/tests/29-slot-migration-response.tcl new file mode 100644 index 0000000..060cc8d --- /dev/null +++ b/tests/cluster/tests/29-slot-migration-response.tcl @@ -0,0 +1,50 @@ +# Tests for the response of slot migrations. 
+ +source "../tests/includes/init-tests.tcl" +source "../tests/includes/utils.tcl" + +test "Create a 2 nodes cluster" { + create_cluster 2 0 + config_set_all_nodes cluster-allow-replica-migration no +} + +test "Cluster is up" { + assert_cluster_state ok +} + +set cluster [redis_cluster 127.0.0.1:[get_instance_attrib redis 0 port]] +catch {unset nodefrom} +catch {unset nodeto} + +$cluster refresh_nodes_map + +test "Set many keys in the cluster" { + for {set i 0} {$i < 5000} {incr i} { + $cluster set $i $i + assert { [$cluster get $i] eq $i } + } +} + +test "Test cluster responses during migration of slot x" { + + set slot 10 + array set nodefrom [$cluster masternode_for_slot $slot] + array set nodeto [$cluster masternode_notfor_slot $slot] + + $nodeto(link) cluster setslot $slot importing $nodefrom(id) + $nodefrom(link) cluster setslot $slot migrating $nodeto(id) + + # Get a key from that slot + set key [$nodefrom(link) cluster GETKEYSINSLOT $slot "1"] + + # MOVED REPLY + assert_error "*MOVED*" {$nodeto(link) set $key "newVal"} + + # ASK REPLY + assert_error "*ASK*" {$nodefrom(link) set "abc{$key}" "newVal"} + + # UNSTABLE REPLY + assert_error "*TRYAGAIN*" {$nodefrom(link) mset "a{$key}" "newVal" $key "newVal2"} +} + +config_set_all_nodes cluster-allow-replica-migration yes diff --git a/tests/cluster/tests/helpers/onlydots.tcl b/tests/cluster/tests/helpers/onlydots.tcl new file mode 100644 index 0000000..4a6d1ae --- /dev/null +++ b/tests/cluster/tests/helpers/onlydots.tcl @@ -0,0 +1,16 @@ +# Read the standard input and only shows dots in the output, filtering out +# all the other characters. Designed to avoid bufferization so that when +# we get the output of redis-trib and want to show just the dots, we'll see +# the dots as soon as redis-trib will output them. + +fconfigure stdin -buffering none + +while 1 { + set c [read stdin 1] + if {$c eq {}} { + exit 0; # EOF + } elseif {$c eq {.}} { + puts -nonewline . + flush stdout + } +} diff --git a/tests/cluster/tests/includes/init-tests.tcl b/tests/cluster/tests/includes/init-tests.tcl new file mode 100644 index 0000000..fc5897a --- /dev/null +++ b/tests/cluster/tests/includes/init-tests.tcl @@ -0,0 +1,75 @@ +# Initialization tests -- most units will start including this. + +test "(init) Restart killed instances" { + foreach type {redis} { + foreach_${type}_id id { + if {[get_instance_attrib $type $id pid] == -1} { + puts -nonewline "$type/$id " + flush stdout + restart_instance $type $id + } + } + } +} + +test "Cluster nodes are reachable" { + foreach_redis_id id { + # Every node should be reachable. + wait_for_condition 1000 50 { + ([catch {R $id ping} ping_reply] == 0) && + ($ping_reply eq {PONG}) + } else { + catch {R $id ping} err + fail "Node #$id keeps replying '$err' to PING." + } + } +} + +test "Cluster nodes hard reset" { + foreach_redis_id id { + if {$::valgrind} { + set node_timeout 10000 + } else { + set node_timeout 3000 + } + catch {R $id flushall} ; # May fail for readonly slaves. 
+ R $id MULTI + R $id cluster reset hard + R $id cluster set-config-epoch [expr {$id+1}] + R $id EXEC + R $id config set cluster-node-timeout $node_timeout + R $id config set cluster-slave-validity-factor 10 + R $id config set loading-process-events-interval-bytes 2097152 + R $id config set key-load-delay 0 + R $id config set repl-diskless-load disabled + R $id config set cluster-announce-hostname "" + R $id DEBUG DROP-CLUSTER-PACKET-FILTER -1 + R $id config rewrite + } +} + +test "Cluster Join and auto-discovery test" { + # Join node 0 with 1, 1 with 2, ... and so forth. + # If auto-discovery works all nodes will know every other node + # eventually. + set ids {} + foreach_redis_id id {lappend ids $id} + for {set j 0} {$j < [expr [llength $ids]-1]} {incr j} { + set a [lindex $ids $j] + set b [lindex $ids [expr $j+1]] + set b_port [get_instance_attrib redis $b port] + R $a cluster meet 127.0.0.1 $b_port + } + + foreach_redis_id id { + wait_for_condition 1000 50 { + [llength [get_cluster_nodes $id]] == [llength $ids] + } else { + fail "Cluster failed to join into a full mesh." + } + } +} + +test "Before slots allocation, all nodes report cluster failure" { + assert_cluster_state fail +} diff --git a/tests/cluster/tests/includes/utils.tcl b/tests/cluster/tests/includes/utils.tcl new file mode 100644 index 0000000..c1b0fe6 --- /dev/null +++ b/tests/cluster/tests/includes/utils.tcl @@ -0,0 +1,36 @@ +source "../../../tests/support/cli.tcl" + +proc config_set_all_nodes {keyword value} { + foreach_redis_id id { + R $id config set $keyword $value + } +} + +proc fix_cluster {addr} { + set code [catch { + exec ../../../src/redis-cli {*}[rediscli_tls_config "../../../tests"] --cluster fix $addr << yes + } result] + if {$code != 0} { + puts "redis-cli --cluster fix returns non-zero exit code, output below:\n$result" + } + # Note: redis-cli --cluster fix may return a non-zero exit code if nodes don't agree, + # but we can ignore that and rely on the check below. + assert_cluster_state ok + wait_for_condition 100 100 { + [catch {exec ../../../src/redis-cli {*}[rediscli_tls_config "../../../tests"] --cluster check $addr} result] == 0 + } else { + puts "redis-cli --cluster check returns non-zero exit code, output below:\n$result" + fail "Cluster could not settle with configuration" + } +} + +proc wait_cluster_stable {} { + wait_for_condition 1000 50 { + [catch {exec ../../../src/redis-cli --cluster \ + check 127.0.0.1:[get_instance_attrib redis 0 port] \ + {*}[rediscli_tls_config "../../../tests"] \ + }] == 0 + } else { + fail "Cluster doesn't stabilize" + } +} \ No newline at end of file diff --git a/tests/cluster/tmp/.gitignore b/tests/cluster/tmp/.gitignore new file mode 100644 index 0000000..f581f73 --- /dev/null +++ b/tests/cluster/tmp/.gitignore @@ -0,0 +1,2 @@ +redis_* +sentinel_* -- cgit v1.2.3