diff options
Diffstat (limited to 'tests/instances.tcl')
-rw-r--r-- | tests/instances.tcl | 702 |
1 files changed, 702 insertions, 0 deletions
diff --git a/tests/instances.tcl b/tests/instances.tcl new file mode 100644 index 0000000..a92c5e5 --- /dev/null +++ b/tests/instances.tcl @@ -0,0 +1,702 @@ +# Multi-instance test framework. +# This is used in order to test Sentinel and Redis Cluster, and provides +# basic capabilities for spawning and handling N parallel Redis / Sentinel +# instances. +# +# Copyright (C) 2014 Salvatore Sanfilippo antirez@gmail.com +# This software is released under the BSD License. See the COPYING file for +# more information. + +package require Tcl 8.5 + +set tcl_precision 17 +source ../support/redis.tcl +source ../support/util.tcl +source ../support/aofmanifest.tcl +source ../support/server.tcl +source ../support/test.tcl + +set ::verbose 0 +set ::valgrind 0 +set ::tls 0 +set ::pause_on_error 0 +set ::dont_clean 0 +set ::simulate_error 0 +set ::failed 0 +set ::sentinel_instances {} +set ::redis_instances {} +set ::global_config {} +set ::sentinel_base_port 20000 +set ::redis_base_port 30000 +set ::redis_port_count 1024 +set ::host "127.0.0.1" +set ::leaked_fds_file [file normalize "tmp/leaked_fds.txt"] +set ::pids {} ; # We kill everything at exit +set ::dirs {} ; # We remove all the temp dirs at exit +set ::run_matching {} ; # If non empty, only tests matching pattern are run. + +if {[catch {cd tmp}]} { + puts "tmp directory not found." + puts "Please run this test from the Redis source root." + exit 1 +} + +# Execute the specified instance of the server specified by 'type', using +# the provided configuration file. Returns the PID of the process. +proc exec_instance {type dirname cfgfile} { + if {$type eq "redis"} { + set prgname redis-server + } elseif {$type eq "sentinel"} { + set prgname redis-sentinel + } else { + error "Unknown instance type." + } + + set errfile [file join $dirname err.txt] + if {$::valgrind} { + set pid [exec valgrind --track-origins=yes --suppressions=../../../src/valgrind.sup --show-reachable=no --show-possibly-lost=no --leak-check=full ../../../src/${prgname} $cfgfile 2>> $errfile &] + } else { + set pid [exec ../../../src/${prgname} $cfgfile 2>> $errfile &] + } + return $pid +} + +# Spawn a redis or sentinel instance, depending on 'type'. +proc spawn_instance {type base_port count {conf {}} {base_conf_file ""}} { + for {set j 0} {$j < $count} {incr j} { + set port [find_available_port $base_port $::redis_port_count] + # plaintext port (only used for TLS cluster) + set pport 0 + # Create a directory for this instance. + set dirname "${type}_${j}" + lappend ::dirs $dirname + catch {exec rm -rf $dirname} + file mkdir $dirname + + # Write the instance config file. + set cfgfile [file join $dirname $type.conf] + if {$base_conf_file ne ""} { + file copy -- $base_conf_file $cfgfile + set cfg [open $cfgfile a+] + } else { + set cfg [open $cfgfile w] + } + + if {$::tls} { + puts $cfg "tls-port $port" + puts $cfg "tls-replication yes" + puts $cfg "tls-cluster yes" + # plaintext port, only used by plaintext clients in a TLS cluster + set pport [find_available_port $base_port $::redis_port_count] + puts $cfg "port $pport" + puts $cfg [format "tls-cert-file %s/../../tls/server.crt" [pwd]] + puts $cfg [format "tls-key-file %s/../../tls/server.key" [pwd]] + puts $cfg [format "tls-client-cert-file %s/../../tls/client.crt" [pwd]] + puts $cfg [format "tls-client-key-file %s/../../tls/client.key" [pwd]] + puts $cfg [format "tls-dh-params-file %s/../../tls/redis.dh" [pwd]] + puts $cfg [format "tls-ca-cert-file %s/../../tls/ca.crt" [pwd]] + puts $cfg "loglevel debug" + } else { + puts $cfg "port $port" + } + puts $cfg "repl-diskless-sync-delay 0" + puts $cfg "dir ./$dirname" + puts $cfg "logfile log.txt" + # Add additional config files + foreach directive $conf { + puts $cfg $directive + } + dict for {name val} $::global_config { + puts $cfg "$name $val" + } + close $cfg + + # Finally exec it and remember the pid for later cleanup. + set retry 100 + while {$retry} { + set pid [exec_instance $type $dirname $cfgfile] + + # Check availability + if {[server_is_up 127.0.0.1 $port 100] == 0} { + puts "Starting $type #$j at port $port failed, try another" + incr retry -1 + set port [find_available_port $base_port $::redis_port_count] + set cfg [open $cfgfile a+] + if {$::tls} { + puts $cfg "tls-port $port" + set pport [find_available_port $base_port $::redis_port_count] + puts $cfg "port $pport" + } else { + puts $cfg "port $port" + } + close $cfg + } else { + puts "Starting $type #$j at port $port" + lappend ::pids $pid + break + } + } + + # Check availability finally + if {[server_is_up $::host $port 100] == 0} { + set logfile [file join $dirname log.txt] + puts [exec tail $logfile] + abort_sentinel_test "Problems starting $type #$j: ping timeout, maybe server start failed, check $logfile" + } + + # Push the instance into the right list + set link [redis $::host $port 0 $::tls] + $link reconnect 1 + lappend ::${type}_instances [list \ + pid $pid \ + host $::host \ + port $port \ + plaintext-port $pport \ + link $link \ + ] + } +} + +proc log_crashes {} { + set start_pattern {*REDIS BUG REPORT START*} + set logs [glob */log.txt] + foreach log $logs { + set fd [open $log] + set found 0 + while {[gets $fd line] >= 0} { + if {[string match $start_pattern $line]} { + puts "\n*** Crash report found in $log ***" + set found 1 + } + if {$found} { + puts $line + incr ::failed + } + } + } + + set logs [glob */err.txt] + foreach log $logs { + set res [find_valgrind_errors $log true] + if {$res != ""} { + puts $res + incr ::failed + } + } + + set logs [glob */err.txt] + foreach log $logs { + set res [sanitizer_errors_from_file $log] + if {$res != ""} { + puts $res + incr ::failed + } + } +} + +proc is_alive pid { + if {[catch {exec ps -p $pid} err]} { + return 0 + } else { + return 1 + } +} + +proc stop_instance pid { + catch {exec kill $pid} + # Node might have been stopped in the test + catch {exec kill -SIGCONT $pid} + if {$::valgrind} { + set max_wait 120000 + } else { + set max_wait 10000 + } + while {[is_alive $pid]} { + incr wait 10 + + if {$wait == $max_wait} { + puts [colorstr red "Forcing process $pid to crash..."] + catch {exec kill -SEGV $pid} + } elseif {$wait >= $max_wait * 2} { + puts [colorstr red "Forcing process $pid to exit..."] + catch {exec kill -KILL $pid} + } elseif {$wait % 1000 == 0} { + puts "Waiting for process $pid to exit..." + } + after 10 + } +} + +proc cleanup {} { + puts "Cleaning up..." + foreach pid $::pids { + puts "killing stale instance $pid" + stop_instance $pid + } + log_crashes + if {$::dont_clean} { + return + } + foreach dir $::dirs { + catch {exec rm -rf $dir} + } +} + +proc abort_sentinel_test msg { + incr ::failed + puts "WARNING: Aborting the test." + puts ">>>>>>>> $msg" + if {$::pause_on_error} pause_on_error + cleanup + exit 1 +} + +proc parse_options {} { + for {set j 0} {$j < [llength $::argv]} {incr j} { + set opt [lindex $::argv $j] + set val [lindex $::argv [expr $j+1]] + if {$opt eq "--single"} { + incr j + lappend ::run_matching "*${val}*" + } elseif {$opt eq "--pause-on-error"} { + set ::pause_on_error 1 + } elseif {$opt eq {--dont-clean}} { + set ::dont_clean 1 + } elseif {$opt eq "--fail"} { + set ::simulate_error 1 + } elseif {$opt eq {--valgrind}} { + set ::valgrind 1 + } elseif {$opt eq {--host}} { + incr j + set ::host ${val} + } elseif {$opt eq {--tls}} { + package require tls 1.6 + ::tls::init \ + -cafile "$::tlsdir/ca.crt" \ + -certfile "$::tlsdir/client.crt" \ + -keyfile "$::tlsdir/client.key" + set ::tls 1 + } elseif {$opt eq {--config}} { + set val2 [lindex $::argv [expr $j+2]] + dict set ::global_config $val $val2 + incr j 2 + } elseif {$opt eq "--help"} { + puts "--single <pattern> Only runs tests specified by pattern." + puts "--dont-clean Keep log files on exit." + puts "--pause-on-error Pause for manual inspection on error." + puts "--fail Simulate a test failure." + puts "--valgrind Run with valgrind." + puts "--tls Run tests in TLS mode." + puts "--host <host> Use hostname instead of 127.0.0.1." + puts "--config <k> <v> Extra config argument(s)." + puts "--help Shows this help." + exit 0 + } else { + puts "Unknown option $opt" + exit 1 + } + } +} + +# If --pause-on-error option was passed at startup this function is called +# on error in order to give the developer a chance to understand more about +# the error condition while the instances are still running. +proc pause_on_error {} { + puts "" + puts [colorstr yellow "*** Please inspect the error now ***"] + puts "\nType \"continue\" to resume the test, \"help\" for help screen.\n" + while 1 { + puts -nonewline "> " + flush stdout + set line [gets stdin] + set argv [split $line " "] + set cmd [lindex $argv 0] + if {$cmd eq {continue}} { + break + } elseif {$cmd eq {show-redis-logs}} { + set count 10 + if {[lindex $argv 1] ne {}} {set count [lindex $argv 1]} + foreach_redis_id id { + puts "=== REDIS $id ====" + puts [exec tail -$count redis_$id/log.txt] + puts "---------------------\n" + } + } elseif {$cmd eq {show-sentinel-logs}} { + set count 10 + if {[lindex $argv 1] ne {}} {set count [lindex $argv 1]} + foreach_sentinel_id id { + puts "=== SENTINEL $id ====" + puts [exec tail -$count sentinel_$id/log.txt] + puts "---------------------\n" + } + } elseif {$cmd eq {ls}} { + foreach_redis_id id { + puts -nonewline "Redis $id" + set errcode [catch { + set str {} + append str "@[RI $id tcp_port]: " + append str "[RI $id role] " + if {[RI $id role] eq {slave}} { + append str "[RI $id master_host]:[RI $id master_port]" + } + set str + } retval] + if {$errcode} { + puts " -- $retval" + } else { + puts $retval + } + } + foreach_sentinel_id id { + puts -nonewline "Sentinel $id" + set errcode [catch { + set str {} + append str "@[SI $id tcp_port]: " + append str "[join [S $id sentinel get-master-addr-by-name mymaster]]" + set str + } retval] + if {$errcode} { + puts " -- $retval" + } else { + puts $retval + } + } + } elseif {$cmd eq {help}} { + puts "ls List Sentinel and Redis instances." + puts "show-sentinel-logs \[N\] Show latest N lines of logs." + puts "show-redis-logs \[N\] Show latest N lines of logs." + puts "S <id> cmd ... arg Call command in Sentinel <id>." + puts "R <id> cmd ... arg Call command in Redis <id>." + puts "SI <id> <field> Show Sentinel <id> INFO <field>." + puts "RI <id> <field> Show Redis <id> INFO <field>." + puts "continue Resume test." + } else { + set errcode [catch {eval $line} retval] + if {$retval ne {}} {puts "$retval"} + } + } +} + +# We redefine 'test' as for Sentinel we don't use the server-client +# architecture for the test, everything is sequential. +proc test {descr code} { + set ts [clock format [clock seconds] -format %H:%M:%S] + puts -nonewline "$ts> $descr: " + flush stdout + + if {[catch {set retval [uplevel 1 $code]} error]} { + incr ::failed + if {[string match "assertion:*" $error]} { + set msg "FAILED: [string range $error 10 end]" + puts [colorstr red $msg] + if {$::pause_on_error} pause_on_error + puts [colorstr red "(Jumping to next unit after error)"] + return -code continue + } else { + # Re-raise, let handler up the stack take care of this. + error $error $::errorInfo + } + } else { + puts [colorstr green OK] + } +} + +# Check memory leaks when running on OSX using the "leaks" utility. +proc check_leaks instance_types { + if {[string match {*Darwin*} [exec uname -a]]} { + puts -nonewline "Testing for memory leaks..."; flush stdout + foreach type $instance_types { + foreach_instance_id [set ::${type}_instances] id { + if {[instance_is_killed $type $id]} continue + set pid [get_instance_attrib $type $id pid] + set output {0 leaks} + catch {exec leaks $pid} output + if {[string match {*process does not exist*} $output] || + [string match {*cannot examine*} $output]} { + # In a few tests we kill the server process. + set output "0 leaks" + } else { + puts -nonewline "$type/$pid " + flush stdout + } + if {![string match {*0 leaks*} $output]} { + puts [colorstr red "=== MEMORY LEAK DETECTED ==="] + puts "Instance type $type, ID $id:" + puts $output + puts "===" + incr ::failed + } + } + } + puts "" + } +} + +# Execute all the units inside the 'tests' directory. +proc run_tests {} { + set tests [lsort [glob ../tests/*]] + foreach test $tests { + # Remove leaked_fds file before starting + if {$::leaked_fds_file != "" && [file exists $::leaked_fds_file]} { + file delete $::leaked_fds_file + } + + if {[llength $::run_matching] != 0 && ![search_pattern_list $test $::run_matching true]} { + continue + } + if {[file isdirectory $test]} continue + puts [colorstr yellow "Testing unit: [lindex [file split $test] end]"] + if {[catch { source $test } err]} { + puts "FAILED: caught an error in the test $err" + puts $::errorInfo + incr ::failed + # letting the tests resume, so we'll eventually reach the cleanup and report crashes + } + check_leaks {redis sentinel} + + # Check if a leaked fds file was created and abort the test. + if {$::leaked_fds_file != "" && [file exists $::leaked_fds_file]} { + puts [colorstr red "ERROR: Sentinel has leaked fds to scripts:"] + puts [exec cat $::leaked_fds_file] + puts "----" + incr ::failed + } + } +} + +# Print a message and exists with 0 / 1 according to zero or more failures. +proc end_tests {} { + if {$::failed == 0 } { + puts [colorstr green "GOOD! No errors."] + exit 0 + } else { + puts [colorstr red "WARNING $::failed test(s) failed."] + exit 1 + } +} + +# The "S" command is used to interact with the N-th Sentinel. +# The general form is: +# +# S <sentinel-id> command arg arg arg ... +# +# Example to ping the Sentinel 0 (first instance): S 0 PING +proc S {n args} { + set s [lindex $::sentinel_instances $n] + [dict get $s link] {*}$args +} + +# Returns a Redis instance by index. +# Example: +# [Rn 0] info +proc Rn {n} { + return [dict get [lindex $::redis_instances $n] link] +} + +# Like R but to chat with Redis instances. +proc R {n args} { + [Rn $n] {*}$args +} + +proc get_info_field {info field} { + set fl [string length $field] + append field : + foreach line [split $info "\n"] { + set line [string trim $line "\r\n "] + if {[string range $line 0 $fl] eq $field} { + return [string range $line [expr {$fl+1}] end] + } + } + return {} +} + +proc SI {n field} { + get_info_field [S $n info] $field +} + +proc RI {n field} { + get_info_field [R $n info] $field +} + +proc RPort {n} { + if {$::tls} { + return [lindex [R $n config get tls-port] 1] + } else { + return [lindex [R $n config get port] 1] + } +} + +# Iterate over IDs of sentinel or redis instances. +proc foreach_instance_id {instances idvar code} { + upvar 1 $idvar id + for {set id 0} {$id < [llength $instances]} {incr id} { + set errcode [catch {uplevel 1 $code} result] + if {$errcode == 1} { + error $result $::errorInfo $::errorCode + } elseif {$errcode == 4} { + continue + } elseif {$errcode == 3} { + break + } elseif {$errcode != 0} { + return -code $errcode $result + } + } +} + +proc foreach_sentinel_id {idvar code} { + set errcode [catch {uplevel 1 [list foreach_instance_id $::sentinel_instances $idvar $code]} result] + return -code $errcode $result +} + +proc foreach_redis_id {idvar code} { + set errcode [catch {uplevel 1 [list foreach_instance_id $::redis_instances $idvar $code]} result] + return -code $errcode $result +} + +# Get the specific attribute of the specified instance type, id. +proc get_instance_attrib {type id attrib} { + dict get [lindex [set ::${type}_instances] $id] $attrib +} + +# Set the specific attribute of the specified instance type, id. +proc set_instance_attrib {type id attrib newval} { + set d [lindex [set ::${type}_instances] $id] + dict set d $attrib $newval + lset ::${type}_instances $id $d +} + +# Create a master-slave cluster of the given number of total instances. +# The first instance "0" is the master, all others are configured as +# slaves. +proc create_redis_master_slave_cluster n { + foreach_redis_id id { + if {$id == 0} { + # Our master. + R $id slaveof no one + R $id flushall + } elseif {$id < $n} { + R $id slaveof [get_instance_attrib redis 0 host] \ + [get_instance_attrib redis 0 port] + } else { + # Instances not part of the cluster. + R $id slaveof no one + } + } + # Wait for all the slaves to sync. + wait_for_condition 1000 50 { + [RI 0 connected_slaves] == ($n-1) + } else { + fail "Unable to create a master-slaves cluster." + } +} + +proc get_instance_id_by_port {type port} { + foreach_${type}_id id { + if {[get_instance_attrib $type $id port] == $port} { + return $id + } + } + fail "Instance $type port $port not found." +} + +# Kill an instance of the specified type/id with SIGKILL. +# This function will mark the instance PID as -1 to remember that this instance +# is no longer running and will remove its PID from the list of pids that +# we kill at cleanup. +# +# The instance can be restarted with restart-instance. +proc kill_instance {type id} { + set pid [get_instance_attrib $type $id pid] + set port [get_instance_attrib $type $id port] + + if {$pid == -1} { + error "You tried to kill $type $id twice." + } + + stop_instance $pid + set_instance_attrib $type $id pid -1 + set_instance_attrib $type $id link you_tried_to_talk_with_killed_instance + + # Remove the PID from the list of pids to kill at exit. + set ::pids [lsearch -all -inline -not -exact $::pids $pid] + + # Wait for the port it was using to be available again, so that's not + # an issue to start a new server ASAP with the same port. + set retry 100 + while {[incr retry -1]} { + set port_is_free [catch {set s [socket 127.0.0.1 $port]}] + if {$port_is_free} break + catch {close $s} + after 100 + } + if {$retry == 0} { + error "Port $port does not return available after killing instance." + } +} + +# Return true of the instance of the specified type/id is killed. +proc instance_is_killed {type id} { + set pid [get_instance_attrib $type $id pid] + expr {$pid == -1} +} + +# Restart an instance previously killed by kill_instance +proc restart_instance {type id} { + set dirname "${type}_${id}" + set cfgfile [file join $dirname $type.conf] + set port [get_instance_attrib $type $id port] + + # Execute the instance with its old setup and append the new pid + # file for cleanup. + set pid [exec_instance $type $dirname $cfgfile] + set_instance_attrib $type $id pid $pid + lappend ::pids $pid + + # Check that the instance is running + if {[server_is_up 127.0.0.1 $port 100] == 0} { + set logfile [file join $dirname log.txt] + puts [exec tail $logfile] + abort_sentinel_test "Problems starting $type #$id: ping timeout, maybe server start failed, check $logfile" + } + + # Connect with it with a fresh link + set link [redis 127.0.0.1 $port 0 $::tls] + $link reconnect 1 + set_instance_attrib $type $id link $link + + # Make sure the instance is not loading the dataset when this + # function returns. + while 1 { + catch {[$link ping]} retval + if {[string match {*LOADING*} $retval]} { + after 100 + continue + } else { + break + } + } +} + +proc redis_deferring_client {type id} { + set port [get_instance_attrib $type $id port] + set host [get_instance_attrib $type $id host] + set client [redis $host $port 1 $::tls] + return $client +} + +proc redis_deferring_client_by_addr {host port} { + set client [redis $host $port 1 $::tls] + return $client +} + +proc redis_client {type id} { + set port [get_instance_attrib $type $id port] + set host [get_instance_attrib $type $id host] + set client [redis $host $port 0 $::tls] + return $client +} + +proc redis_client_by_addr {host port} { + set client [redis $host $port 0 $::tls] + return $client +} |