# tests/integration/psync2-master-restart.tcl

start_server {tags {"psync2 external:skip"}} {
start_server {} {
start_server {} {
    # Client handles and addresses for the three servers. The commands below
    # chain them as: master (srv 0) -> replica (srv -1) -> sub_replica (srv -2).
    set master [srv 0 client]
    set master_host [srv 0 host]
    set master_port [srv 0 port]

    set replica [srv -1 client]
    set replica_host [srv -1 host]
    set replica_port [srv -1 port]

    set sub_replica [srv -2 client]

    # The master must write an RDB when it shuts down, so give it a save point.
    $master config set save "3600 1"

    # The partial-resync tests below check the sync_partial_ok stat. A
    # replication timeout would drop a master-replica link, and the extra
    # reconnections would break those checks, so use a very long timeout on
    # every node.
    foreach node [list $master $replica $sub_replica] {
        $node config set repl-timeout 3600
    }

    # Avoid periodic PINGs from the master (presumably because they would
    # advance the replication offset the tests compare exactly -- confirm).
    $master config set repl-ping-replica-period 3600
    # Write the settings to the master's config file (presumably so they are
    # still in effect after the restarts performed by the tests -- confirm).
    $master config rewrite

    # Build the replication chain.
    $replica replicaof $master_host $master_port
    $sub_replica replicaof $replica_host $replica_port

    # Both links must be up before any test starts.
    wait_for_condition 50 100 {
        [status $replica master_link_status] eq "up" &&
        [status $sub_replica master_link_status] eq "up"
    } else {
        fail "Replication not started."
    }

    test "PSYNC2: Partial resync after Master restart using RDB aux fields when offset is 0" {
        # Precondition: the master's replication offset is still 0.
        assert {[status $master master_repl_offset] == 0}

        # Remember the pre-restart replication ID and reset the replica's
        # stats so the sync_partial_ok check below starts from zero.
        set replid [status $master master_replid]
        $replica config resetstat

        # Restart the master (the trailing "now" selects SHUTDOWN NOW) and
        # refresh the client handle. Any error raised inside is deliberately
        # swallowed by catch; the wait below detects a failed restart.
        catch {
            restart_server 0 true false true now
            set master [srv 0 client]
        }
        wait_for_condition 50 1000 {
            [status $replica master_link_status] eq {up} &&
            [status $sub_replica master_link_status] eq {up}
        } else {
            fail "Replicas didn't sync after master restart"
        }

        # Make sure the master restored its replication info correctly: a new
        # replid, with the old one kept as replid2. Replication IDs are
        # strings, so compare them with eq/ne; == and != would compare
        # numerically whenever both values happen to parse as numbers.
        assert {[status $master master_replid] ne $replid}
        assert {[status $master master_repl_offset] == 0}
        assert {[status $master master_replid2] eq $replid}
        assert {[status $master second_repl_offset] == 1}

        # Make sure the master set up the replication backlog correctly.
        assert {[status $master repl_backlog_active] == 1}
        assert {[status $master repl_backlog_first_byte_offset] == 1}
        assert {[status $master repl_backlog_histlen] == 0}

        # Exactly one partial resync is expected on the master and on the
        # replica after the master restart.
        assert {[status $master sync_partial_ok] == 1}
        assert {[status $replica sync_partial_ok] == 1}
    }

    # Generate some data
    createComplexDataset $master 1000

    test "PSYNC2: Partial resync after Master restart using RDB aux fields with data" {
        # Wait until the whole chain reports the same replication offset.
        wait_for_condition 500 100 {
            [status $master master_repl_offset] == [status $replica master_repl_offset] &&
            [status $master master_repl_offset] == [status $sub_replica master_repl_offset]
        } else {
            fail "Replicas and master offsets were unable to match *exactly*."
        }

        # Snapshot the pre-restart replication ID and offset, and reset the
        # replica's stats so the sync_partial_ok check below starts from zero.
        set replid [status $master master_replid]
        set offset [status $master master_repl_offset]
        # Braced so the expression is substituted only once.
        set next_offset [expr {$offset + 1}]
        $replica config resetstat

        # Errors raised during the restart are deliberately swallowed by
        # catch; the wait below detects a failed restart.
        catch {
            # SHUTDOWN NOW ensures master doesn't send GETACK to replicas before
            # shutting down which would affect the replication offset.
            restart_server 0 true false true now
            set master [srv 0 client]
        }
        wait_for_condition 50 1000 {
            [status $replica master_link_status] eq {up} &&
            [status $sub_replica master_link_status] eq {up}
        } else {
            fail "Replicas didn't sync after master restart"
        }

        # Make sure the master restored its replication info correctly: a new
        # replid, the old one kept as replid2, and the offset preserved.
        # Replication IDs are strings, so compare them with eq/ne; == and !=
        # would compare numerically whenever both values parse as numbers.
        assert {[status $master master_replid] ne $replid}
        assert {[status $master master_repl_offset] == $offset}
        assert {[status $master master_replid2] eq $replid}
        assert {[status $master second_repl_offset] == $next_offset}

        # Make sure the master set up the replication backlog correctly.
        assert {[status $master repl_backlog_active] == 1}
        assert {[status $master repl_backlog_first_byte_offset] == $next_offset}
        assert {[status $master repl_backlog_histlen] == 0}

        # Exactly one partial resync is expected on the master and on the
        # replica after the master restart.
        assert {[status $master sync_partial_ok] == 1}
        assert {[status $replica sync_partial_ok] == 1}
    }

    test "PSYNC2: Partial resync after Master restart using RDB aux fields with expire" {
        # With active expiry switched off on the master, write 1024 keys that
        # carry a 10 ms TTL, spread over databases 0..15.
        $master debug set-active-expire 0
        for {set i 0} {$i < 1024} {incr i} {
            $master select [expr {$i % 16}]
            $master set $i somevalue px 10
        }

        # Sleep longer than the 10 ms TTL given to the keys above.
        after 20

        # The master must have seen an ACK from its replica for the current
        # offset before it goes down: if it believes a replica is lagging at
        # shutdown it sends GETACK to the replicas, which changes the
        # replication offset.
        set offset [status $master master_repl_offset]
        wait_for_condition 500 100 {
            [string match "*slave0:*,offset=$offset,*" [$master info replication]] &&
            $offset == [status $replica master_repl_offset] &&
            $offset == [status $sub_replica master_repl_offset]
        } else {
            # NOTE(review): show_cluster_status is not defined in this file --
            # confirm it is available in this test's scope.
            show_cluster_status
            fail "Replicas and master offsets were unable to match *exactly*."
        }

        # Re-read the offset right before the restart and reset the replica's
        # stats so the sync_partial_ok check below starts from zero.
        set offset [status $master master_repl_offset]
        $replica config resetstat

        catch {
            # This restart goes through SIGTERM rather than SHUTDOWN NOW. The
            # two behave differently when replicas are lagging; none are
            # lagging here, so this only widens coverage by giving each test
            # its own shutdown approach.
            restart_server 0 true false
            set master [srv 0 client]
        }
        wait_for_condition 50 1000 {
            [status $replica master_link_status] eq "up" &&
            [status $sub_replica master_link_status] eq "up"
        } else {
            fail "Replicas didn't sync after master restart"
        }

        # All 1024 stale keys must have been expired while loading the RDB,
        # and master_repl_offset must have grown by exactly the backlog
        # history length.
        set backlog_len [status $master repl_backlog_histlen]
        assert {[status $master rdb_last_load_keys_expired] == 1024}
        assert {[status $master master_repl_offset] == [expr {$offset + $backlog_len}]}

        # Exactly one partial resync is expected on the master and on the
        # replica after the master restart.
        assert {[status $master sync_partial_ok] == 1}
        assert {[status $replica sync_partial_ok] == 1}

        # The dataset must be identical along the whole chain.
        set master_digest [$master debug digest]
        assert {$master_digest eq [$replica debug digest]}
        assert {$master_digest eq [$sub_replica debug digest]}
    }

    test "PSYNC2: Full resync after Master restart when too many key expired" {
        # Shrink the master's replication backlog to 16384 bytes and write the
        # setting to its config file.
        $master config set repl-backlog-size 16384
        $master config rewrite

        $master debug set-active-expire 0
        # Write 2048 keys with a 10 ms TTL across databases 0..15, to make
        # sure the replication backlog is full and will be trimmed.
        for {set i 0} {$i < 2048} {incr i} {
            $master select [expr {$i % 16}]
            $master set $i somevalue px 10
        }

        # Sleep longer than the 10 ms TTL given to the keys above.
        after 20

        # Wait until the whole chain reports the same replication offset.
        wait_for_condition 500 100 {
            [status $master master_repl_offset] == [status $replica master_repl_offset] &&
            [status $master master_repl_offset] == [status $sub_replica master_repl_offset]
        } else {
            fail "Replicas and master offsets were unable to match *exactly*."
        }

        # Reset the replica's stats so the sync_full check below starts from
        # zero.
        $replica config resetstat

        catch {
            # This restart goes through SIGTERM as well, purely to widen
            # coverage by varying the shutdown approach between tests.
            restart_server 0 true false
            set master [srv 0 client]
        }
        wait_for_condition 50 1000 {
            [status $replica master_link_status] eq "up" &&
            [status $sub_replica master_link_status] eq "up"
        } else {
            fail "Replicas didn't sync after master restart"
        }

        # The backlog is full: its first byte lies beyond second_repl_offset.
        # The test therefore expects one full resync, and no partial resync,
        # on the master, plus one full resync on the replica. All 2048 stale
        # keys must have been expired while loading the RDB.
        assert {[status $master repl_backlog_first_byte_offset] > [status $master second_repl_offset]}
        assert {[status $master sync_partial_ok] == 0}
        assert {[status $master sync_full] == 1}
        assert {[status $master rdb_last_load_keys_expired] == 2048}
        assert {[status $replica sync_full] == 1}

        # The dataset must be identical along the whole chain.
        set master_digest [$master debug digest]
        assert {$master_digest eq [$replica debug digest]}
        assert {$master_digest eq [$sub_replica debug digest]}
    }
}}}