1 files changed, 868 insertions, 0 deletions
diff --git a/server/core_filters.c b/server/core_filters.c
new file mode 100644
index 0000000..d8a661f
--- /dev/null
+++ b/server/core_filters.c
@@ -0,0 +1,868 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file  core_filters.c
+ * @brief Core input/output network filters.
+ */
+
+#include "apr.h"
+#include "apr_strings.h"
+#include "apr_lib.h"
+#include "apr_fnmatch.h"
+#include "apr_hash.h"
+#include "apr_thread_proc.h"    /* for RLIMIT stuff */
+#include "apr_version.h"
+
+#define APR_WANT_IOVEC
+#define APR_WANT_STRFUNC
+#define APR_WANT_MEMFUNC
+#include "apr_want.h"
+
+#include "ap_config.h"
+#include "httpd.h"
+#include "http_config.h"
+#include "http_core.h"
+#include "http_protocol.h" /* For index_of_response().  Grump. */
+#include "http_request.h"
+#include "http_vhost.h"
+#include "http_main.h"     /* For the default_handler below... */
+#include "http_log.h"
+#include "util_md5.h"
+#include "http_connection.h"
+#include "apr_buckets.h"
+#include "util_filter.h"
+#include "util_ebcdic.h"
+#include "mpm_common.h"
+#include "scoreboard.h"
+#include "mod_core.h"
+#include "ap_listen.h"
+
+#include "mod_so.h" /* for ap_find_loaded_module_symbol */
+
+#define AP_MIN_SENDFILE_BYTES           (256)
+
+/**
+ * Remove all zero length buckets from the brigade.
+ */
+#define BRIGADE_NORMALIZE(b) \
+do { \
+    apr_bucket *e = APR_BRIGADE_FIRST(b); \
+    do {  \
+        if (e->length == 0 && !APR_BUCKET_IS_METADATA(e)) { \
+            apr_bucket *d; \
+            d = APR_BUCKET_NEXT(e); \
+            apr_bucket_delete(e); \
+            e = d; \
+        } \
+        else { \
+            e = APR_BUCKET_NEXT(e); \
+        } \
+    } while (!APR_BRIGADE_EMPTY(b) && (e != APR_BRIGADE_SENTINEL(b))); \
+} while (0)
+
+/* we know core's module_index is 0 */
+#undef APLOG_MODULE_INDEX
+#define APLOG_MODULE_INDEX AP_CORE_MODULE_INDEX
+
+struct core_output_filter_ctx {
+    apr_bucket_brigade *buffered_bb;
+    apr_pool_t *deferred_write_pool;
+    apr_size_t bytes_written;
+    struct iovec *vec;
+    apr_size_t nvec;
+};
+
+
+apr_status_t ap_core_input_filter(ap_filter_t *f, apr_bucket_brigade *b,
+                                  ap_input_mode_t mode, apr_read_type_e block,
+                                  apr_off_t readbytes)
+{
+    apr_status_t rv;
+    core_net_rec *net = f->ctx;
+    core_ctx_t *ctx = net->in_ctx;
+    const char *str;
+    apr_size_t len;
+
+    if (mode == AP_MODE_INIT) {
+        /*
+         * this mode is for filters that might need to 'initialize'
+         * a connection before reading request data from a client.
+         * NNTP over SSL for example needs to handshake before the
+         * server sends the welcome message.
+         * such filters would have changed the mode before this point
+         * is reached.  however, protocol modules such as NNTP should
+         * not need to know anything about SSL.  given the example, if
+         * SSL is not in the filter chain, AP_MODE_INIT is a noop.
+         */
+        return APR_SUCCESS;
+    }
+
+    if (!ctx)
+    {
+        net->in_ctx = ctx = apr_palloc(f->c->pool, sizeof(*ctx));
+        ctx->b = apr_brigade_create(f->c->pool, f->c->bucket_alloc);
+        ctx->tmpbb = apr_brigade_create(f->c->pool, f->c->bucket_alloc);
+        /* seed the brigade with the client socket. */
+        rv = ap_run_insert_network_bucket(f->c, ctx->b, net->client_socket);
+        if (rv != APR_SUCCESS)
+            return rv;
+    }
+    else if (APR_BRIGADE_EMPTY(ctx->b)) {
+        return APR_EOF;
+    }
+
+    /* ### This is bad. */
+    BRIGADE_NORMALIZE(ctx->b);
+
+    /* check for empty brigade again *AFTER* BRIGADE_NORMALIZE()
+     * If we have lost our socket bucket (see above), we are EOF.
+     *
+     * Ideally, this should be returning SUCCESS with EOS bucket, but
+     * some higher-up APIs (spec. read_request_line via ap_rgetline)
+     * want an error code. */
+    if (APR_BRIGADE_EMPTY(ctx->b)) {
+        return APR_EOF;
+    }
+
+    if (mode == AP_MODE_GETLINE) {
+        /* we are reading a single LF line, e.g. the HTTP headers */
+        rv = apr_brigade_split_line(b, ctx->b, block, HUGE_STRING_LEN);
+        /* We should treat EAGAIN here the same as we do for EOF (brigade is
+         * empty).  We do this by returning whatever we have read.  This may
+         * or may not be bogus, but is consistent (for now) with EOF logic.
+         */
+        if (APR_STATUS_IS_EAGAIN(rv) && block == APR_NONBLOCK_READ) {
+            rv = APR_SUCCESS;
+        }
+        return rv;
+    }
+
+    /* ### AP_MODE_PEEK is a horrific name for this mode because we also
+     * eat any CRLFs that we see.  That's not the obvious intention of
+     * this mode.  Determine whether anyone actually uses this or not. */
+    if (mode == AP_MODE_EATCRLF) {
+        apr_bucket *e;
+        const char *c;
+
+        /* The purpose of this loop is to ignore any CRLF (or LF) at the end
+         * of a request.  Many browsers send extra lines at the end of POST
+         * requests.  We use the PEEK method to determine if there is more
+         * data on the socket, so that we know if we should delay sending the
+         * end of one request until we have served the second request in a
+         * pipelined situation.  We don't want to actually delay sending a
+         * response if the server finds a CRLF (or LF), becuause that doesn't
+         * mean that there is another request, just a blank line.
+         */
+        while (1) {
+            if (APR_BRIGADE_EMPTY(ctx->b))
+                return APR_EOF;
+
+            e = APR_BRIGADE_FIRST(ctx->b);
+
+            rv = apr_bucket_read(e, &str, &len, APR_NONBLOCK_READ);
+
+            if (rv != APR_SUCCESS)
+                return rv;
+
+            c = str;
+            while (c < str + len) {
+                if (*c == APR_ASCII_LF)
+                    c++;
+                else if (*c == APR_ASCII_CR && *(c + 1) == APR_ASCII_LF)
+                    c += 2;
+                else
+                    return APR_SUCCESS;
+            }
+
+            /* If we reach here, we were a bucket just full of CRLFs, so
+             * just toss the bucket. */
+            /* FIXME: Is this the right thing to do in the core? */
+            apr_bucket_delete(e);
+        }
+        return APR_SUCCESS;
+    }
+
+    /* If mode is EXHAUSTIVE, we want to just read everything until the end
+     * of the brigade, which in this case means the end of the socket.
+     * To do this, we attach the brigade that has currently been setaside to
+     * the brigade that was passed down, and send that brigade back.
+     *
+     * NOTE:  This is VERY dangerous to use, and should only be done with
+     * extreme caution.  FWLIW, this would be needed by an MPM like Perchild;
+     * such an MPM can easily request the socket and all data that has been
+     * read, which means that it can pass it to the correct child process.
+     */
+    if (mode == AP_MODE_EXHAUSTIVE) {
+        apr_bucket *e;
+
+        /* Tack on any buckets that were set aside. */
+        APR_BRIGADE_CONCAT(b, ctx->b);
+
+        /* Since we've just added all potential buckets (which will most
+         * likely simply be the socket bucket) we know this is the end,
+         * so tack on an EOS too. */
+        /* We have read until the brigade was empty, so we know that we
+         * must be EOS. */
+        e = apr_bucket_eos_create(f->c->bucket_alloc);
+        APR_BRIGADE_INSERT_TAIL(b, e);
+        return APR_SUCCESS;
+    }
+
+    /* read up to the amount they specified. */
+    if (mode == AP_MODE_READBYTES || mode == AP_MODE_SPECULATIVE) {
+        apr_bucket *e;
+
+        AP_DEBUG_ASSERT(readbytes > 0);
+
+        e = APR_BRIGADE_FIRST(ctx->b);
+        rv = apr_bucket_read(e, &str, &len, block);
+
+        if (APR_STATUS_IS_EAGAIN(rv) && block == APR_NONBLOCK_READ) {
+            /* getting EAGAIN for a blocking read is an error; for a
+             * non-blocking read, return an empty brigade. */
+            return APR_SUCCESS;
+        }
+        else if (rv != APR_SUCCESS) {
+            return rv;
+        }
+        else if (block == APR_BLOCK_READ && len == 0) {
+            /* We wanted to read some bytes in blocking mode.  We read
+             * 0 bytes.  Hence, we now assume we are EOS.
+             *
+             * When we are in normal mode, return an EOS bucket to the
+             * caller.
+             * When we are in speculative mode, leave ctx->b empty, so
+             * that the next call returns an EOS bucket.
+             */
+            apr_bucket_delete(e);
+
+            if (mode == AP_MODE_READBYTES) {
+                e = apr_bucket_eos_create(f->c->bucket_alloc);
+                APR_BRIGADE_INSERT_TAIL(b, e);
+            }
+            return APR_SUCCESS;
+        }
+
+        /* Have we read as much data as we wanted (be greedy)? */
+        if (len < readbytes) {
+            apr_size_t bucket_len;
+
+            rv = APR_SUCCESS;
+            /* We already registered the data in e in len */
+            e = APR_BUCKET_NEXT(e);
+            while ((len < readbytes) && (rv == APR_SUCCESS)
+                   && (e != APR_BRIGADE_SENTINEL(ctx->b))) {
+                /* Check for the availability of buckets with known length */
+                if (e->length != (apr_size_t)-1) {
+                    len += e->length;
+                    e = APR_BUCKET_NEXT(e);
+                }
+                else {
+                    /*
+                     * Read from bucket, but non blocking. If there isn't any
+                     * more data, well than this is fine as well, we will
+                     * not wait for more since we already got some and we are
+                     * only checking if there isn't more.
+                     */
+                    rv = apr_bucket_read(e, &str, &bucket_len,
+                                         APR_NONBLOCK_READ);
+                    if (rv == APR_SUCCESS) {
+                        len += bucket_len;
+                        e = APR_BUCKET_NEXT(e);
+                    }
+                }
+            }
+        }
+
+        /* We can only return at most what we read. */
+        if (len < readbytes) {
+            readbytes = len;
+        }
+
+        rv = apr_brigade_partition(ctx->b, readbytes, &e);
+        if (rv != APR_SUCCESS) {
+            return rv;
+        }
+
+        /* Must do move before CONCAT */
+        ctx->tmpbb = apr_brigade_split_ex(ctx->b, e, ctx->tmpbb);
+
+        if (mode == AP_MODE_READBYTES) {
+            APR_BRIGADE_CONCAT(b, ctx->b);
+        }
+        else if (mode == AP_MODE_SPECULATIVE) {
+            apr_bucket *copy_bucket;
+
+            for (e = APR_BRIGADE_FIRST(ctx->b);
+                 e != APR_BRIGADE_SENTINEL(ctx->b);
+                 e = APR_BUCKET_NEXT(e))
+            {
+                rv = apr_bucket_copy(e, &copy_bucket);
+                if (rv != APR_SUCCESS) {
+                    return rv;
+                }
+                APR_BRIGADE_INSERT_TAIL(b, copy_bucket);
+            }
+        }
+
+        /* Take what was originally there and place it back on ctx->b */
+        APR_BRIGADE_CONCAT(ctx->b, ctx->tmpbb);
+    }
+    return APR_SUCCESS;
+}
+
+static void setaside_remaining_output(ap_filter_t *f,
+                                      core_output_filter_ctx_t *ctx,
+                                      apr_bucket_brigade *bb,
+                                      conn_rec *c);
+
+static apr_status_t send_brigade_nonblocking(apr_socket_t *s,
+                                             apr_bucket_brigade *bb,
+                                             core_output_filter_ctx_t *ctx,
+                                             conn_rec *c);
+
+static apr_status_t writev_nonblocking(apr_socket_t *s,
+                                       apr_bucket_brigade *bb,
+                                       core_output_filter_ctx_t *ctx,
+                                       apr_size_t bytes_to_write,
+                                       apr_size_t nvec,
+                                       conn_rec *c);
+
+#if APR_HAS_SENDFILE
+static apr_status_t sendfile_nonblocking(apr_socket_t *s,
+                                         apr_bucket *bucket,
+                                         core_output_filter_ctx_t *ctx,
+                                         conn_rec *c);
+#endif
+
+/* XXX: Should these be configurable parameters? */
+#define THRESHOLD_MIN_WRITE 4096
+
+/* Optional function coming from mod_logio, used for logging of output
+ * traffic
+ */
+extern APR_OPTIONAL_FN_TYPE(ap_logio_add_bytes_out) *ap__logio_add_bytes_out;
+
+static int should_send_brigade(apr_bucket_brigade *bb, conn_rec *c, int *flush)
+{
+    core_server_config *conf =
+        ap_get_core_module_config(c->base_server->module_config);
+    apr_size_t total_bytes = 0, non_file_bytes = 0;
+    apr_uint32_t eor_buckets = 0;
+    apr_bucket *bucket;
+    int need_flush = 0;
+
+    /* Scan through the brigade and decide whether we need to flush it,
+     * based on the following rules:
+     *
+     *  a) The brigade contains a flush bucket: Do a blocking write
+     *     of everything up that point.
+     *
+     *  b) The request is in CONN_STATE_HANDLER state, and the brigade
+     *     contains at least flush_max_threshold bytes in non-file
+     *     buckets: Do blocking writes until the amount of data in the
+     *     buffer is less than flush_max_threshold.  (The point of this
+     *     rule is to provide flow control, in case a handler is
+     *     streaming out lots of data faster than the data can be
+     *     sent to the client.)
+     *
+     *  c) The request is in CONN_STATE_HANDLER state, and the brigade
+     *     contains at least flush_max_pipelined EOR buckets:
+     *     Do blocking writes until less than flush_max_pipelined EOR
+     *     buckets are left. (The point of this rule is to prevent too many
+     *     FDs being kept open by pipelined requests, possibly allowing a
+     *     DoS).
+     *
+     *  d) The brigade contains a morphing bucket: otherwise ap_save_brigade()
+     *     could read the whole bucket into memory.
+     */
+    for (bucket = APR_BRIGADE_FIRST(bb);
+         bucket != APR_BRIGADE_SENTINEL(bb);
+         bucket = APR_BUCKET_NEXT(bucket)) {
+
+        if (!APR_BUCKET_IS_METADATA(bucket)) {
+            if (bucket->length == (apr_size_t)-1) {
+                if (flush) {
+                    ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, c,
+                                  "core_output_filter: flushing because "
+                                  "of morphing bucket");
+                }
+                need_flush = 1;
+                break;
+            }
+
+            total_bytes += bucket->length;
+            if (!APR_BUCKET_IS_FILE(bucket)) {
+                non_file_bytes += bucket->length;
+                if (non_file_bytes > conf->flush_max_threshold) {
+                    if (flush) {
+                        ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, c,
+                                      "core_output_filter: flushing because "
+                                      "of max threshold");
+                    }
+                    need_flush = 1;
+                    break;
+                }
+            }
+        }
+        else if (APR_BUCKET_IS_FLUSH(bucket)) {
+            if (flush) {
+                ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, c,
+                              "core_output_filter: flushing because "
+                              "of FLUSH bucket");
+            }
+            need_flush = 1;
+            break;
+        }
+        else if (AP_BUCKET_IS_EOR(bucket)
+                 && conf->flush_max_pipelined >= 0
+                 && ++eor_buckets > conf->flush_max_pipelined) {
+            if (flush) {
+                ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, c,
+                              "core_output_filter: flushing because "
+                              "of max pipelined");
+            }
+            need_flush = 1;
+            break;
+        }
+    }
+    if (flush) {
+        *flush = need_flush;
+    }
+
+    /* Also send if above flush_min_threshold, or if there are FILE buckets */
+    return (need_flush
+            || total_bytes >= THRESHOLD_MIN_WRITE
+            || total_bytes > non_file_bytes);
+}
+
+apr_status_t ap_core_output_filter(ap_filter_t *f, apr_bucket_brigade *new_bb)
+{
+    conn_rec *c = f->c;
+    core_net_rec *net = f->ctx;
+    core_output_filter_ctx_t *ctx = net->out_ctx;
+    apr_bucket_brigade *bb = NULL;
+    apr_status_t rv = APR_SUCCESS;
+
+    /* Fail quickly if the connection has already been aborted. */
+    if (c->aborted) {
+        if (new_bb != NULL) {
+            apr_brigade_cleanup(new_bb);
+        }
+        return APR_ECONNABORTED;
+    }
+
+    if (ctx == NULL) {
+        ctx = apr_pcalloc(c->pool, sizeof(*ctx));
+        net->out_ctx = (core_output_filter_ctx_t *)ctx;
+        /*
+         * Need to create buffered_bb brigade with correct lifetime. Passing
+         * NULL to ap_save_brigade() would result in a brigade
+         * allocated from bb->pool which might be wrong.
+         */
+        ctx->buffered_bb = apr_brigade_create(c->pool, c->bucket_alloc);
+    }
+
+    if (new_bb != NULL)
+        bb = new_bb;
+
+    if ((ctx->buffered_bb != NULL) &&
+        !APR_BRIGADE_EMPTY(ctx->buffered_bb)) {
+        if (new_bb != NULL) {
+            APR_BRIGADE_PREPEND(bb, ctx->buffered_bb);
+        }
+        else {
+            bb = ctx->buffered_bb;
+        }
+    }
+    else if (new_bb == NULL) {
+        c->data_in_output_filters = 0;
+        return APR_SUCCESS;
+    }
+
+    if (!new_bb || should_send_brigade(bb, c, NULL)) {
+        apr_socket_t *sock = net->client_socket;
+        apr_interval_time_t sock_timeout = 0;
+
+        /* Non-blocking writes on the socket in any case. */
+        apr_socket_timeout_get(sock, &sock_timeout);
+        apr_socket_timeout_set(sock, 0);
+
+        do {
+            rv = send_brigade_nonblocking(sock, bb, ctx, c);
+            if (new_bb && APR_STATUS_IS_EAGAIN(rv)) {
+                /* Scan through the brigade and decide whether we must absolutely
+                 * flush the remaining data, based on should_send_brigade() &flush
+                 * rules. If so, wait for writability and retry, otherwise we did
+                 * our best already and can wait for the next call.
+                 */
+                int flush;
+                (void)should_send_brigade(bb, c, &flush);
+                if (flush) {
+                    apr_int32_t nfd;
+                    apr_pollfd_t pfd;
+                    memset(&pfd, 0, sizeof(pfd));
+                    pfd.reqevents = APR_POLLOUT;
+                    pfd.desc_type = APR_POLL_SOCKET;
+                    pfd.desc.s = sock;
+                    pfd.p = c->pool;
+                    do {
+                        rv = apr_poll(&pfd, 1, &nfd, sock_timeout);
+                    } while (APR_STATUS_IS_EINTR(rv));
+                }
+            }
+        } while (rv == APR_SUCCESS && !APR_BRIGADE_EMPTY(bb));
+
+        /* Restore original socket timeout before leaving. */
+        apr_socket_timeout_set(sock, sock_timeout);
+    }
+
+    if (rv != APR_SUCCESS && !APR_STATUS_IS_EAGAIN(rv)) {
+        /* The client has aborted the connection */
+        ap_log_cerror(
+                APLOG_MARK, APLOG_TRACE1, rv, c,
+                "core_output_filter: writing data to the network");
+        /*
+         * Set c->aborted before apr_brigade_cleanup to have the correct status
+         * when logging the request as apr_brigade_cleanup triggers the logging
+         * of the request if it contains an EOR bucket.
+         */
+        c->aborted = 1;
+        apr_brigade_cleanup(bb);
+        return rv;
+    }
+
+    setaside_remaining_output(f, ctx, bb, c);
+    return APR_SUCCESS;
+}
+
+/*
+ * This function assumes that either ctx->buffered_bb == NULL, or
+ * ctx->buffered_bb is empty, or ctx->buffered_bb == bb
+ */
+static void setaside_remaining_output(ap_filter_t *f,
+                                      core_output_filter_ctx_t *ctx,
+                                      apr_bucket_brigade *bb,
+                                      conn_rec *c)
+{
+    apr_bucket *bucket;
+
+    /* Don't set aside leading empty buckets, all previous data have been
+     * consumed so it's safe to delete them now.
+     */
+    while (((bucket = APR_BRIGADE_FIRST(bb)) != APR_BRIGADE_SENTINEL(bb)) &&
+           (APR_BUCKET_IS_METADATA(bucket) || (bucket->length == 0))) {
+        apr_bucket_delete(bucket);
+    }
+
+    c->data_in_output_filters = 0;
+    if (!APR_BRIGADE_EMPTY(bb)) {
+        c->data_in_output_filters = 1;
+        if (bb != ctx->buffered_bb) {
+            if (!ctx->deferred_write_pool) {
+                apr_pool_create(&ctx->deferred_write_pool, c->pool);
+                apr_pool_tag(ctx->deferred_write_pool, "deferred_write");
+            }
+            ap_save_brigade(f, &(ctx->buffered_bb), &bb,
+                            ctx->deferred_write_pool);
+        }
+    }
+    else if (ctx->deferred_write_pool) {
+        /*
+         * There are no more requests in the pipeline. We can just clear the
+         * pool.
+         */
+        apr_pool_clear(ctx->deferred_write_pool);
+    }
+}
+
+#ifndef APR_MAX_IOVEC_SIZE
+#define NVEC_MIN 16
+#define NVEC_MAX NVEC_MIN
+#else
+#if APR_MAX_IOVEC_SIZE > 16
+#define NVEC_MIN 16
+#else
+#define NVEC_MIN APR_MAX_IOVEC_SIZE
+#endif
+#define NVEC_MAX APR_MAX_IOVEC_SIZE
+#endif
+
+static APR_INLINE int is_in_memory_bucket(apr_bucket *b)
+{
+    /* These buckets' data are already in memory. */
+    return APR_BUCKET_IS_HEAP(b)
+           || APR_BUCKET_IS_POOL(b)
+           || APR_BUCKET_IS_TRANSIENT(b)
+           || APR_BUCKET_IS_IMMORTAL(b);
+}
+
+#if APR_HAS_SENDFILE
+static APR_INLINE int can_sendfile_bucket(apr_bucket *b)
+{
+    /* Use sendfile to send the bucket unless:
+     *   - the bucket is not a file bucket, or
+     *   - the file is too small for sendfile to be useful, or
+     *   - sendfile is disabled in the httpd config via "EnableSendfile off".
+     */
+    if (APR_BUCKET_IS_FILE(b) && b->length >= AP_MIN_SENDFILE_BYTES) {
+        apr_file_t *file = ((apr_bucket_file *)b->data)->fd;
+        return apr_file_flags_get(file) & APR_SENDFILE_ENABLED;
+    }
+    else {
+        return 0;
+    }
+}
+#endif
+
+#if defined(WIN32) && (APR_MAJOR_VERSION == 1 && APR_MINOR_VERSION <= 7)
+#undef APR_TCP_NOPUSH_FLAG
+#define APR_TCP_NOPUSH_FLAG 0
+#endif
+
+static APR_INLINE void sock_nopush(apr_socket_t *s, int to)
+{
+    /* Disable TCP_NOPUSH handling on OSX since unsetting it won't push
+     * retained data, which might introduce delays if further data don't
+     * come soon enough or cause the last chunk to be sent only when the
+     * connection is shutdown (e.g. after KeepAliveTimeout).
+     */
+#if APR_TCP_NOPUSH_FLAG && !defined(__APPLE__)
+    (void)apr_socket_opt_set(s, APR_TCP_NOPUSH, to);
+#endif
+}
+
+static apr_status_t send_brigade_nonblocking(apr_socket_t *s,
+                                             apr_bucket_brigade *bb,
+                                             core_output_filter_ctx_t *ctx,
+                                             conn_rec *c)
+{
+    apr_status_t rv = APR_SUCCESS;
+    core_server_config *conf =
+        ap_get_core_module_config(c->base_server->module_config);
+    apr_size_t nvec = 0, nbytes = 0;
+    apr_bucket *bucket, *next;
+    const char *data;
+    apr_size_t length;
+
+    for (bucket = APR_BRIGADE_FIRST(bb);
+         bucket != APR_BRIGADE_SENTINEL(bb);
+         bucket = next) {
+        next = APR_BUCKET_NEXT(bucket);
+
+#if APR_HAS_SENDFILE
+        if (can_sendfile_bucket(bucket)) {
+            if (nvec > 0) {
+                sock_nopush(s, 1);
+                rv = writev_nonblocking(s, bb, ctx, nbytes, nvec, c);
+                if (rv != APR_SUCCESS) {
+                    goto cleanup;
+                }
+                nbytes = 0;
+                nvec = 0;
+            }
+            rv = sendfile_nonblocking(s, bucket, ctx, c);
+            if (rv != APR_SUCCESS) {
+                goto cleanup;
+            }
+            continue;
+        }
+#endif /* APR_HAS_SENDFILE */
+
+        if (bucket->length) {
+            /* Non-blocking read first, in case this is a morphing
+             * bucket type. */
+            rv = apr_bucket_read(bucket, &data, &length, APR_NONBLOCK_READ);
+            if (APR_STATUS_IS_EAGAIN(rv)) {
+                /* Read would block; flush any pending data and retry. */
+                if (nvec) {
+                    rv = writev_nonblocking(s, bb, ctx, nbytes, nvec, c);
+                    if (rv != APR_SUCCESS) {
+                        goto cleanup;
+                    }
+                    nbytes = 0;
+                    nvec = 0;
+                }
+                sock_nopush(s, 0);
+
+                rv = apr_bucket_read(bucket, &data, &length, APR_BLOCK_READ);
+            }
+            if (rv != APR_SUCCESS) {
+                goto cleanup;
+            }
+
+            /* reading may have split the bucket, so recompute next: */
+            next = APR_BUCKET_NEXT(bucket);
+        }
+
+        if (!bucket->length) {
+            /* Don't delete empty buckets until all the previous ones have been
+             * sent (nvec == 0); this must happen in sequence since metabuckets
+             * like EOR could free the data still pointed to by the iovec. So
+             * unless the latter is empty, let writev_nonblocking() cleanup the
+             * brigade in order.
+             */
+            if (!nvec) {
+                apr_bucket_delete(bucket);
+            }
+            continue;
+        }
+
+        /* Make sure that these new data fit in our iovec. */
+        if (nvec == ctx->nvec) {
+            if (nvec == NVEC_MAX) {
+                sock_nopush(s, 1);
+                rv = writev_nonblocking(s, bb, ctx, nbytes, nvec, c);
+                if (rv != APR_SUCCESS) {
+                    goto cleanup;
+                }
+                nbytes = 0;
+                nvec = 0;
+            }
+            else {
+                struct iovec *newvec;
+                apr_size_t newn = nvec * 2;
+                if (newn < NVEC_MIN) {
+                    newn = NVEC_MIN;
+                }
+                else if (newn > NVEC_MAX) {
+                    newn = NVEC_MAX;
+                }
+                newvec = apr_palloc(c->pool, newn * sizeof(struct iovec));
+                if (nvec) {
+                    memcpy(newvec, ctx->vec, nvec * sizeof(struct iovec));
+                }
+                ctx->vec = newvec;
+                ctx->nvec = newn;
+            }
+        }
+        nbytes += length;
+        ctx->vec[nvec].iov_base = (void *)data;
+        ctx->vec[nvec].iov_len = length;
+        nvec++;
+
+        /* Flush above max threshold, unless the brigade still contains in
+         * memory buckets which we want to try writing in the same pass (if
+         * we are at the end of the brigade, the write will happen outside
+         * the loop anyway).
+         */
+        if (nbytes > conf->flush_max_threshold
+                && next != APR_BRIGADE_SENTINEL(bb)
+                && !is_in_memory_bucket(next)) {
+            sock_nopush(s, 1);
+            rv = writev_nonblocking(s, bb, ctx, nbytes, nvec, c);
+            if (rv != APR_SUCCESS) {
+                goto cleanup;
+            }
+            nbytes = 0;
+            nvec = 0;
+        }
+    }
+    if (nvec > 0) {
+        rv = writev_nonblocking(s, bb, ctx, nbytes, nvec, c);
+    }
+
+cleanup:
+    sock_nopush(s, 0);
+    return rv;
+}
+
+static apr_status_t writev_nonblocking(apr_socket_t *s,
+                                       apr_bucket_brigade *bb,
+                                       core_output_filter_ctx_t *ctx,
+                                       apr_size_t bytes_to_write,
+                                       apr_size_t nvec,
+                                       conn_rec *c)
+{
+    apr_status_t rv;
+    struct iovec *vec = ctx->vec;
+    apr_size_t bytes_written = 0;
+    apr_size_t i, offset = 0;
+
+    do {
+        apr_size_t n = 0;
+        rv = apr_socket_sendv(s, vec + offset, nvec - offset, &n);
+        bytes_written += n;
+
+        for (i = offset; i < nvec; ) {
+            apr_bucket *bucket = APR_BRIGADE_FIRST(bb);
+            if (!bucket->length) {
+                apr_bucket_delete(bucket);
+            }
+            else if (n >= vec[i].iov_len) {
+                apr_bucket_delete(bucket);
+                n -= vec[i++].iov_len;
+                offset++;
+            }
+            else {
+                if (n) {
+                    apr_bucket_split(bucket, n);
+                    apr_bucket_delete(bucket);
+                    vec[i].iov_len -= n;
+                    vec[i].iov_base = (char *) vec[i].iov_base + n;
+                }
+                break;
+            }
+        }
+    } while (rv == APR_SUCCESS && bytes_written < bytes_to_write);
+
+    if ((ap__logio_add_bytes_out != NULL) && (bytes_written > 0)) {
+        ap__logio_add_bytes_out(c, bytes_written);
+    }
+    ctx->bytes_written += bytes_written;
+
+    ap_log_cerror(APLOG_MARK, APLOG_TRACE6, rv, c,
+                  "writev_nonblocking: %"APR_SIZE_T_FMT"/%"APR_SIZE_T_FMT,
+                  bytes_written, bytes_to_write);
+    return rv;
+}
+
+#if APR_HAS_SENDFILE
+
+static apr_status_t sendfile_nonblocking(apr_socket_t *s,
+                                         apr_bucket *bucket,
+                                         core_output_filter_ctx_t *ctx,
+                                         conn_rec *c)
+{
+    apr_status_t rv;
+    apr_file_t *file = ((apr_bucket_file *)bucket->data)->fd;
+    apr_size_t bytes_written = bucket->length; /* bytes_to_write for now */
+    apr_off_t file_offset = bucket->start;
+
+    rv = apr_socket_sendfile(s, file, NULL, &file_offset, &bytes_written, 0);
+    if ((ap__logio_add_bytes_out != NULL) && (bytes_written > 0)) {
+        ap__logio_add_bytes_out(c, bytes_written);
+    }
+    ctx->bytes_written += bytes_written;
+
+    ap_log_cerror(APLOG_MARK, APLOG_TRACE6, rv, c,
+                  "sendfile_nonblocking: %" APR_SIZE_T_FMT "/%" APR_SIZE_T_FMT,
+                  bytes_written, bucket->length);
+    if (bytes_written >= bucket->length) {
+        apr_bucket_delete(bucket);
+    }
+    else if (bytes_written > 0) {
+        apr_bucket_split(bucket, bytes_written);
+        apr_bucket_delete(bucket);
+        if (rv == APR_SUCCESS) {
+            rv = APR_EAGAIN;
+        }
+    }
+    return rv;
+}
+
+#endif