modules/md/mod_md_drive.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345

/* Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 
#include <assert.h>
#include <apr_optional.h>
#include <apr_hash.h>
#include <apr_strings.h>
#include <apr_date.h>

#include <httpd.h>
#include <http_core.h>
#include <http_protocol.h>
#include <http_request.h>
#include <http_log.h>

#include "mod_watchdog.h"

#include "md.h"
#include "md_curl.h"
#include "md_crypt.h"
#include "md_event.h"
#include "md_http.h"
#include "md_json.h"
#include "md_status.h"
#include "md_store.h"
#include "md_store_fs.h"
#include "md_log.h"
#include "md_result.h"
#include "md_reg.h"
#include "md_util.h"
#include "md_version.h"
#include "md_acme.h"
#include "md_acme_authz.h"

#include "mod_md.h"
#include "mod_md_private.h"
#include "mod_md_config.h"
#include "mod_md_status.h"
#include "mod_md_drive.h"

/**************************************************************************************************/
/* watchdog based impl. */

#define MD_RENEW_WATCHDOG_NAME   "_md_renew_"

static APR_OPTIONAL_FN_TYPE(ap_watchdog_get_instance) *wd_get_instance;
static APR_OPTIONAL_FN_TYPE(ap_watchdog_register_callback) *wd_register_callback;
static APR_OPTIONAL_FN_TYPE(ap_watchdog_set_callback_interval) *wd_set_interval;

struct md_renew_ctx_t {
    apr_pool_t *p;
    server_rec *s;
    md_mod_conf_t *mc;
    ap_watchdog_t *watchdog;
    
    apr_array_header_t *jobs;
};

static void process_drive_job(md_renew_ctx_t *dctx, md_job_t *job, apr_pool_t *ptemp)
{
    const md_t *md;
    md_result_t *result = NULL;
    apr_status_t rv;
    
    md_job_load(job);
    /* Evaluate again on loaded value. Values will change when watchdog switches child process */
    if (apr_time_now() < job->next_run) return;
    
    job->next_run = 0;
    if (job->finished && job->notified_renewed) {
        /* finished and notification handled, nothing to do. */
        goto leave;
    }
    
    md = md_get_by_name(dctx->mc->mds, job->mdomain);
    AP_DEBUG_ASSERT(md);

    result = md_result_md_make(ptemp, md->name);
    if (job->last_result) md_result_assign(result, job->last_result);
    
    if (md->state == MD_S_MISSING_INFORMATION) {
        /* Missing information, this will not change until configuration
         * is changed and server reloaded. */
        job->fatal_error = 1;
        job->next_run = 0;
        goto leave;
    }
    
    if (md_will_renew_cert(md)) {
        /* Renew the MDs credentials in a STAGING area. Might be invoked repeatedly 
         * without discarding previous/intermediate results.
         * Only returns SUCCESS when the renewal is complete, e.g. STAGING has a
         * complete set of new credentials.
         */
        ap_log_error( APLOG_MARK, APLOG_DEBUG, 0, dctx->s, APLOGNO(10052) 
                     "md(%s): state=%d, driving", job->mdomain, md->state);

        if (!md_reg_should_renew(dctx->mc->reg, md, dctx->p)) {
            ap_log_error( APLOG_MARK, APLOG_DEBUG, 0, dctx->s, APLOGNO(10053) 
                         "md(%s): no need to renew", job->mdomain);
            goto expiry;
        }
    
        /* The (possibly configured) event handler may veto renewals. This
         * is used in cluster installtations, see #233. */
        rv = md_event_raise("renewing", md->name, job, result, ptemp);
        if (APR_SUCCESS != rv) {
                ap_log_error(APLOG_MARK, APLOG_INFO, 0, dctx->s, APLOGNO(10060)
                             "%s: event-handler for 'renewing' returned %d, preventing renewal to proceed.",
                             job->mdomain, rv);
                goto leave;
        }

        md_job_start_run(job, result, md_reg_store_get(dctx->mc->reg));
        md_reg_renew(dctx->mc->reg, md, dctx->mc->env, 0, job->error_runs, result, ptemp);
        md_job_end_run(job, result);
        
        if (APR_SUCCESS == result->status) {
            /* Finished jobs might take a while before the results become valid.
             * If that is in the future, request to run then */
            if (apr_time_now() < result->ready_at) {
                md_job_retry_at(job, result->ready_at);
                goto leave;
            }
            
            if (!job->notified_renewed) {
                md_job_save(job, result, ptemp);
                md_job_notify(job, "renewed", result);
            }
        }
        else {
            ap_log_error( APLOG_MARK, APLOG_ERR, result->status, dctx->s, APLOGNO(10056) 
                         "processing %s: %s", job->mdomain, result->detail);
            md_job_log_append(job, "renewal-error", result->problem, result->detail);
            md_event_holler("errored", job->mdomain, job, result, ptemp);
            ap_log_error(APLOG_MARK, APLOG_INFO, 0, dctx->s, APLOGNO(10057) 
                         "%s: encountered error for the %d. time, next run in %s",
                         job->mdomain, job->error_runs, 
                         md_duration_print(ptemp, job->next_run - apr_time_now()));
        }
    }

expiry:
    if (!job->finished && md_reg_should_warn(dctx->mc->reg, md, dctx->p)) {
        ap_log_error( APLOG_MARK, APLOG_TRACE1, 0, dctx->s,
                     "md(%s): warn about expiration", md->name);
        md_job_start_run(job, result, md_reg_store_get(dctx->mc->reg));
        md_job_notify(job, "expiring", result);
        md_job_end_run(job, result);
    }

leave:
    if (job->dirty && result) {
        rv = md_job_save(job, result, ptemp);
        ap_log_error(APLOG_MARK, APLOG_TRACE1, rv, dctx->s, "%s: saving job props", job->mdomain);
    }
}

int md_will_renew_cert(const md_t *md)
{
    if (md->renew_mode == MD_RENEW_MANUAL) {
        return 0;
    }
    else if (md->renew_mode == MD_RENEW_AUTO && md->cert_files && md->cert_files->nelts) {
        return 0;
    } 
    return 1;
}

static apr_time_t next_run_default(void)
{
    /* we'd like to run at least twice a day by default */
    return apr_time_now() + apr_time_from_sec(MD_SECS_PER_DAY / 2);
}

static apr_status_t run_watchdog(int state, void *baton, apr_pool_t *ptemp)
{
    md_renew_ctx_t *dctx = baton;
    md_job_t *job;
    apr_time_t next_run, wait_time;
    int i;
    
    /* mod_watchdog invoked us as a single thread inside the whole server (on this machine).
     * This might be a repeated run inside the same child (mod_watchdog keeps affinity as
     * long as the child lives) or another/new child.
     */
    switch (state) {
        case AP_WATCHDOG_STATE_STARTING:
            ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, dctx->s, APLOGNO(10054)
                         "md watchdog start, auto drive %d mds", dctx->jobs->nelts);
            break;
            
        case AP_WATCHDOG_STATE_RUNNING:
            ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, dctx->s, APLOGNO(10055)
                         "md watchdog run, auto drive %d mds", dctx->jobs->nelts);
                         
            /* Process all drive jobs. They will update their next_run property
             * and we schedule ourself at the earliest of all. A job may specify 0
             * as next_run to indicate that it wants to participate in the normal
             * regular runs. */
            next_run = next_run_default();
            for (i = 0; i < dctx->jobs->nelts; ++i) {
                job = APR_ARRAY_IDX(dctx->jobs, i, md_job_t *);
                
                if (apr_time_now() >= job->next_run) {
                    process_drive_job(dctx, job, ptemp);
                }
                
                if (job->next_run && job->next_run < next_run) {
                    next_run = job->next_run;
                }
            }

            wait_time = next_run - apr_time_now();
            if (APLOGdebug(dctx->s)) {
                ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, dctx->s, APLOGNO(10107)
                             "next run in %s", md_duration_print(ptemp, wait_time));
            }
            wd_set_interval(dctx->watchdog, wait_time, dctx, run_watchdog);
            break;
            
        case AP_WATCHDOG_STATE_STOPPING:
            ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, dctx->s, APLOGNO(10058)
                         "md watchdog stopping");
            break;
    }
    
    return APR_SUCCESS;
}

apr_status_t md_renew_start_watching(md_mod_conf_t *mc, server_rec *s, apr_pool_t *p)
{
    apr_allocator_t *allocator;
    md_renew_ctx_t *dctx;
    apr_pool_t *dctxp;
    apr_status_t rv;
    md_t *md;
    md_job_t *job;
    int i;
    
    /* We use mod_watchdog to run a single thread in one of the child processes
     * to monitor the MDs marked as watched, using the const data in the list
     * mc->mds of our MD structures.
     *
     * The data in mc cannot be changed, as we may spawn copies in new child processes
     * of the original data at any time. The child which hosts the watchdog thread
     * may also die or be recycled, which causes a new watchdog thread to run
     * in another process with the original data.
     * 
     * Instead, we use our store to persist changes in group STAGING. This is
     * kept writable to child processes, but the data stored there is not live.
     * However, mod_watchdog makes sure that we only ever have a single thread in
     * our server (on this machine) that writes there. Other processes, e.g. informing
     * the user about progress, only read from there.
     *
     * All changes during driving an MD are stored as files in MG_SG_STAGING/<MD.name>.
     * All will have "md.json" and "job.json". There may be a range of other files used
     * by the protocol obtaining the certificate/keys.
     * 
     * 
     */
    wd_get_instance = APR_RETRIEVE_OPTIONAL_FN(ap_watchdog_get_instance);
    wd_register_callback = APR_RETRIEVE_OPTIONAL_FN(ap_watchdog_register_callback);
    wd_set_interval = APR_RETRIEVE_OPTIONAL_FN(ap_watchdog_set_callback_interval);
    
    if (!wd_get_instance || !wd_register_callback || !wd_set_interval) {
        ap_log_error(APLOG_MARK, APLOG_CRIT, 0, s, APLOGNO(10061) "mod_watchdog is required");
        return !OK;
    }
    
    /* We want our own pool with own allocator to keep data across watchdog invocations.
     * Since we'll run in a single watchdog thread, using our own allocator will prevent 
     * any confusion in the parent pool. */
    apr_allocator_create(&allocator);
    apr_allocator_max_free_set(allocator, 1);
    rv = apr_pool_create_ex(&dctxp, p, NULL, allocator);
    if (rv != APR_SUCCESS) {
        ap_log_error(APLOG_MARK, APLOG_ERR, rv, s, APLOGNO(10062) "md_renew_watchdog: create pool");
        return rv;
    }
    apr_allocator_owner_set(allocator, dctxp);
    apr_pool_tag(dctxp, "md_renew_watchdog");

    dctx = apr_pcalloc(dctxp, sizeof(*dctx));
    dctx->p = dctxp;
    dctx->s = s;
    dctx->mc = mc;
    
    dctx->jobs = apr_array_make(dctx->p, mc->mds->nelts, sizeof(md_job_t *));
    for (i = 0; i < mc->mds->nelts; ++i) {
        md = APR_ARRAY_IDX(mc->mds, i, md_t*);
        if (!md || !md->watched) continue;
        
        job = md_reg_job_make(mc->reg, md->name, p);
        APR_ARRAY_PUSH(dctx->jobs, md_job_t*) = job;
        ap_log_error( APLOG_MARK, APLOG_TRACE1, 0, dctx->s,  
                     "md(%s): state=%d, created drive job", md->name, md->state);
        
        md_job_load(job);
        if (job->error_runs) {
            /* Server has just restarted. If we encounter an MD job with errors
             * on a previous driving, we purge its STAGING area.
             * This will reset the driving for the MD. It may run into the same
             * error again, or in case of race/confusion/our error/CA error, it
             * might allow the MD to succeed by a fresh start.
             */
            ap_log_error( APLOG_MARK, APLOG_NOTICE, 0, dctx->s, APLOGNO(10064) 
                         "md(%s): previous drive job showed %d errors, purging STAGING "
                         "area to reset.", md->name, job->error_runs);
            md_store_purge(md_reg_store_get(dctx->mc->reg), p, MD_SG_STAGING, md->name);
            md_store_purge(md_reg_store_get(dctx->mc->reg), p, MD_SG_CHALLENGES, md->name);
            job->error_runs = 0;
        }
    }

    if (!dctx->jobs->nelts) {
        ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, s, APLOGNO(10065)
                     "no managed domain to drive, no watchdog needed.");
        apr_pool_destroy(dctx->p);
        return APR_SUCCESS;
    }
    
    if (APR_SUCCESS != (rv = wd_get_instance(&dctx->watchdog, MD_RENEW_WATCHDOG_NAME, 0, 1, dctx->p))) {
        ap_log_error(APLOG_MARK, APLOG_CRIT, rv, s, APLOGNO(10066) 
                     "create md renew watchdog(%s)", MD_RENEW_WATCHDOG_NAME);
        return rv;
    }
    rv = wd_register_callback(dctx->watchdog, 0, dctx, run_watchdog);
    ap_log_error(APLOG_MARK, rv? APLOG_CRIT : APLOG_DEBUG, rv, s, APLOGNO(10067) 
                 "register md renew watchdog(%s)", MD_RENEW_WATCHDOG_NAME);
    return rv;
}