#!/usr/bin/env bash

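# exit on error (-e), make functions and subshells inherit the ERR trap (-E), and trace every command (-x)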
set -eEx

: "${CEPH_DEV_FOLDER:=${PWD}}"
CLUSTER_SPEC=${CEPH_DEV_FOLDER}/src/pybind/mgr/rook/ci/cluster-specs/cluster-on-pvc-minikube.yaml
DEFAULT_NS="rook-ceph"
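# run kubectl through minikube so the client matches the cluster version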
KUBECTL="minikube kubectl --"
export ROOK_CLUSTER_NS="${ROOK_CLUSTER_NS:=$DEFAULT_NS}" ## CephCluster namespace

# We build a local Ceph image that contains the latest code
# plus the changes from the PR. This image is used by the Docker
# daemon running inside minikube to start the different Ceph pods.
LOCAL_CEPH_IMG="local/ceph"

on_error() {
    echo "Error (exit code $1) at line $2; cleaning up minikube"
    minikube delete
}

setup_minikube_env() {

    # If minikube is already running, tear it down so we start from a clean slate
    if minikube status > /dev/null 2>&1; then
        echo "Minikube is running"
        minikube stop
        minikube delete
    else
        echo "Minikube is not running"
    fi

    rm -rf ~/.minikube
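    # the 4 extra disks provide the raw block devices consumed by the Ceph OSDs (see the cluster-on-pvc spec)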
    minikube start --memory="6144" --disk-size=20g --extra-disks=4 --driver kvm2
    # point the Docker environment at the Docker daemon running inside minikube
    eval "$(minikube docker-env -p minikube)"
}

build_ceph_image() {

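    # extract the ceph container image name referenced in the cluster spec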
    CURR_CEPH_IMG=$(grep -E '^\s*image:\s+' "$CLUSTER_SPEC" | sed 's/.*image: *\([^ ]*\)/\1/')

    cd "${CEPH_DEV_FOLDER}/src/pybind/mgr/rook/ci"
    mkdir -p tmp_build/rook
    mkdir -p tmp_build/orchestrator
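    # stage the rook and orchestrator mgr module sources plus python-common so the docker build below can include them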
    cp ../../orchestrator/*.py tmp_build/orchestrator
    cp ../*.py tmp_build/rook
    cp -r ../../../../../src/python-common/ceph/ tmp_build/

    # tag the local image with the name referenced in the cluster
    # spec so the Docker daemon inside minikube uses our build
    # instead of pulling the image from the registry
    docker build --tag ${LOCAL_CEPH_IMG} .
    docker tag ${LOCAL_CEPH_IMG} ${CURR_CEPH_IMG}

    # cleanup
    rm -rf tmp_build
    cd "${CEPH_DEV_FOLDER}"
}

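# Deploy the upstream Rook CRDs, common resources and operator, then our CephCluster spec and the toolbox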
create_rook_cluster() {
    $KUBECTL create -f https://raw.githubusercontent.com/rook/rook/master/deploy/examples/crds.yaml
    $KUBECTL create -f https://raw.githubusercontent.com/rook/rook/master/deploy/examples/common.yaml
    $KUBECTL create -f https://raw.githubusercontent.com/rook/rook/master/deploy/examples/operator.yaml
    $KUBECTL create -f "$CLUSTER_SPEC"
    $KUBECTL create -f https://raw.githubusercontent.com/rook/rook/master/deploy/examples/toolbox.yaml
}

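# The operator sets the CephCluster phase to "Ready" once it has finished reconciling the cluster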
is_operator_ready() {
    local phase
    phase=$($KUBECTL get cephclusters.ceph.rook.io -n "$ROOK_CLUSTER_NS" -o jsonpath='{.items[?(@.kind == "CephCluster")].status.phase}')
    echo "PHASE: $phase"
    [[ "$phase" == "Ready" ]]
}

wait_for_rook_operator() {
    local max_attempts=10
    local sleep_interval=20
    local attempts=0

    $KUBECTL rollout status deployment rook-ceph-operator -n "$ROOK_CLUSTER_NS" --timeout=180s

    while ! is_operator_ready; do
        echo "Waiting for rook operator to be ready..."
        sleep $sleep_interval

        # log the current cluster phase and pod status for debugging
        echo "PHASE: $($KUBECTL get cephclusters.ceph.rook.io -n "$ROOK_CLUSTER_NS" -o jsonpath='{.items[?(@.kind == "CephCluster")].status.phase}')"
        $KUBECTL -n "$ROOK_CLUSTER_NS" get pods

        attempts=$((attempts + 1))
        if [ $attempts -ge $max_attempts ]; then
            echo "Maximum number of attempts ($max_attempts) reached. Exiting..."
            $KUBECTL -n "$ROOK_CLUSTER_NS" get pods | grep operator | awk '{print $1}' | xargs $KUBECTL -n "$ROOK_CLUSTER_NS" logs
            return 1
        fi
    done
}

wait_for_ceph_cluster() {
    local max_attempts=10
    local sleep_interval=20
    local attempts=0
    $KUBECTL rollout status deployment rook-ceph-tools -n "$ROOK_CLUSTER_NS" --timeout=90s
    while ! $KUBECTL get cephclusters.ceph.rook.io -n "$ROOK_CLUSTER_NS" -o jsonpath='{.items[?(@.kind == "CephCluster")].status.ceph.health}' | grep -q "HEALTH_OK"; do
        echo "Waiting for Ceph cluster to enter HEALTH_OK state"
        sleep $sleep_interval
        attempts=$((attempts+1))
        if [ $attempts -ge $max_attempts ]; then
            echo "Maximum number of attempts ($max_attempts) reached. Exiting..."
            return 1
        fi
    done
    echo "Ceph cluster installed and running"

    # add an additional wait to cover any subtle change in the state
    sleep 20
}

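# Add the current user to the libvirt group and start libvirtd so minikube's kvm2 driver can manage VMs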
configure_libvirt(){
    if sudo usermod -aG libvirt "$(id -un)"; then
        echo "User added to libvirt group successfully."
        sudo systemctl enable --now libvirtd
        sudo systemctl restart libvirtd
        sleep 30 # wait some time for the libvirtd service to restart
        newgrp libvirt
    else
        echo "Error adding user to libvirt group."
        return 1
    fi
}

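# Recreate the libvirt "default" NAT network that the kvm2 driver attaches VMs to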
recreate_default_network(){

    # destroy any existing kvm default network
    if sudo virsh net-destroy default; then
        sudo virsh net-undefine default
    fi

    # let's create a new kvm default network
    sudo virsh net-define /usr/share/libvirt/networks/default.xml
    if sudo virsh net-start default; then
        echo "Network 'default' started successfully."
    else
        echo "Failed to start network 'default', but continuing..."
    fi

    # restart libvirtd service and wait a little bit for the service
    sudo systemctl restart libvirtd
    sleep 30

    # print some debugging information: available networks and group membership
    virsh net-list --all
    groups
}

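# Enable the rook mgr module and select it as the orchestrator backend, using the toolbox pod to run ceph commands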
enable_rook_orchestrator() {
    echo "Enabling rook orchestrator"
    $KUBECTL rollout status deployment rook-ceph-tools -n "$ROOK_CLUSTER_NS" --timeout=90s
    $KUBECTL -n "$ROOK_CLUSTER_NS" exec -it deploy/rook-ceph-tools -- ceph mgr module enable rook
    $KUBECTL -n "$ROOK_CLUSTER_NS" exec -it deploy/rook-ceph-tools -- ceph orch set backend rook
    $KUBECTL -n "$ROOK_CLUSTER_NS" exec -it deploy/rook-ceph-tools -- ceph orch status
}

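# Deploy the prometheus operator and the Rook monitoring manifests (RBAC, service monitors and a Prometheus instance)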
enable_monitoring() {
    echo "Enabling monitoring"
    $KUBECTL apply -f https://raw.githubusercontent.com/coreos/prometheus-operator/v0.40.0/bundle.yaml
    $KUBECTL wait --for=condition=ready pod -l app.kubernetes.io/name=prometheus-operator --timeout=90s
    $KUBECTL apply -f https://raw.githubusercontent.com/rook/rook/master/deploy/examples/monitoring/rbac.yaml
    $KUBECTL apply -f https://raw.githubusercontent.com/rook/rook/master/deploy/examples/monitoring/service-monitor.yaml
    $KUBECTL apply -f https://raw.githubusercontent.com/rook/rook/master/deploy/examples/monitoring/exporter-service-monitor.yaml
    $KUBECTL apply -f https://raw.githubusercontent.com/rook/rook/master/deploy/examples/monitoring/prometheus.yaml
    $KUBECTL apply -f https://raw.githubusercontent.com/rook/rook/master/deploy/examples/monitoring/prometheus-service.yaml
}

####################################################################
####################################################################

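# main: configure libvirt and minikube, build the Ceph image, deploy Rook and wait for a
# healthy cluster before enabling the orchestrator and monitoring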
trap 'on_error $? $LINENO' ERR

configure_libvirt
recreate_default_network
setup_minikube_env
build_ceph_image
create_rook_cluster
wait_for_rook_operator
wait_for_ceph_cluster
enable_rook_orchestrator
enable_monitoring
sleep 30 # wait for the metrics cache to warm up

####################################################################
####################################################################