summaryrefslogtreecommitdiffstats
path: root/server/core_filters.c
diff options
context:
space:
mode:
Diffstat (limited to 'server/core_filters.c')
-rw-r--r--server/core_filters.c868
1 files changed, 868 insertions, 0 deletions
diff --git a/server/core_filters.c b/server/core_filters.c
new file mode 100644
index 0000000..d8a661f
--- /dev/null
+++ b/server/core_filters.c
@@ -0,0 +1,868 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file core_filters.c
+ * @brief Core input/output network filters.
+ */
+
+#include "apr.h"
+#include "apr_strings.h"
+#include "apr_lib.h"
+#include "apr_fnmatch.h"
+#include "apr_hash.h"
+#include "apr_thread_proc.h" /* for RLIMIT stuff */
+#include "apr_version.h"
+
+#define APR_WANT_IOVEC
+#define APR_WANT_STRFUNC
+#define APR_WANT_MEMFUNC
+#include "apr_want.h"
+
+#include "ap_config.h"
+#include "httpd.h"
+#include "http_config.h"
+#include "http_core.h"
+#include "http_protocol.h" /* For index_of_response(). Grump. */
+#include "http_request.h"
+#include "http_vhost.h"
+#include "http_main.h" /* For the default_handler below... */
+#include "http_log.h"
+#include "util_md5.h"
+#include "http_connection.h"
+#include "apr_buckets.h"
+#include "util_filter.h"
+#include "util_ebcdic.h"
+#include "mpm_common.h"
+#include "scoreboard.h"
+#include "mod_core.h"
+#include "ap_listen.h"
+
+#include "mod_so.h" /* for ap_find_loaded_module_symbol */
+
+#define AP_MIN_SENDFILE_BYTES (256)
+
+/**
+ * Remove all zero length buckets from the brigade.
+ */
+#define BRIGADE_NORMALIZE(b) \
+do { \
+ apr_bucket *e = APR_BRIGADE_FIRST(b); \
+ do { \
+ if (e->length == 0 && !APR_BUCKET_IS_METADATA(e)) { \
+ apr_bucket *d; \
+ d = APR_BUCKET_NEXT(e); \
+ apr_bucket_delete(e); \
+ e = d; \
+ } \
+ else { \
+ e = APR_BUCKET_NEXT(e); \
+ } \
+ } while (!APR_BRIGADE_EMPTY(b) && (e != APR_BRIGADE_SENTINEL(b))); \
+} while (0)
+
+/* we know core's module_index is 0 */
+#undef APLOG_MODULE_INDEX
+#define APLOG_MODULE_INDEX AP_CORE_MODULE_INDEX
+
+struct core_output_filter_ctx {
+ apr_bucket_brigade *buffered_bb;
+ apr_pool_t *deferred_write_pool;
+ apr_size_t bytes_written;
+ struct iovec *vec;
+ apr_size_t nvec;
+};
+
+
+apr_status_t ap_core_input_filter(ap_filter_t *f, apr_bucket_brigade *b,
+ ap_input_mode_t mode, apr_read_type_e block,
+ apr_off_t readbytes)
+{
+ apr_status_t rv;
+ core_net_rec *net = f->ctx;
+ core_ctx_t *ctx = net->in_ctx;
+ const char *str;
+ apr_size_t len;
+
+ if (mode == AP_MODE_INIT) {
+ /*
+ * this mode is for filters that might need to 'initialize'
+ * a connection before reading request data from a client.
+ * NNTP over SSL for example needs to handshake before the
+ * server sends the welcome message.
+ * such filters would have changed the mode before this point
+ * is reached. however, protocol modules such as NNTP should
+ * not need to know anything about SSL. given the example, if
+ * SSL is not in the filter chain, AP_MODE_INIT is a noop.
+ */
+ return APR_SUCCESS;
+ }
+
+ if (!ctx)
+ {
+ net->in_ctx = ctx = apr_palloc(f->c->pool, sizeof(*ctx));
+ ctx->b = apr_brigade_create(f->c->pool, f->c->bucket_alloc);
+ ctx->tmpbb = apr_brigade_create(f->c->pool, f->c->bucket_alloc);
+ /* seed the brigade with the client socket. */
+ rv = ap_run_insert_network_bucket(f->c, ctx->b, net->client_socket);
+ if (rv != APR_SUCCESS)
+ return rv;
+ }
+ else if (APR_BRIGADE_EMPTY(ctx->b)) {
+ return APR_EOF;
+ }
+
+ /* ### This is bad. */
+ BRIGADE_NORMALIZE(ctx->b);
+
+ /* check for empty brigade again *AFTER* BRIGADE_NORMALIZE()
+ * If we have lost our socket bucket (see above), we are EOF.
+ *
+ * Ideally, this should be returning SUCCESS with EOS bucket, but
+ * some higher-up APIs (spec. read_request_line via ap_rgetline)
+ * want an error code. */
+ if (APR_BRIGADE_EMPTY(ctx->b)) {
+ return APR_EOF;
+ }
+
+ if (mode == AP_MODE_GETLINE) {
+ /* we are reading a single LF line, e.g. the HTTP headers */
+ rv = apr_brigade_split_line(b, ctx->b, block, HUGE_STRING_LEN);
+ /* We should treat EAGAIN here the same as we do for EOF (brigade is
+ * empty). We do this by returning whatever we have read. This may
+ * or may not be bogus, but is consistent (for now) with EOF logic.
+ */
+ if (APR_STATUS_IS_EAGAIN(rv) && block == APR_NONBLOCK_READ) {
+ rv = APR_SUCCESS;
+ }
+ return rv;
+ }
+
+ /* ### AP_MODE_PEEK is a horrific name for this mode because we also
+ * eat any CRLFs that we see. That's not the obvious intention of
+ * this mode. Determine whether anyone actually uses this or not. */
+ if (mode == AP_MODE_EATCRLF) {
+ apr_bucket *e;
+ const char *c;
+
+ /* The purpose of this loop is to ignore any CRLF (or LF) at the end
+ * of a request. Many browsers send extra lines at the end of POST
+ * requests. We use the PEEK method to determine if there is more
+ * data on the socket, so that we know if we should delay sending the
+ * end of one request until we have served the second request in a
+ * pipelined situation. We don't want to actually delay sending a
+ * response if the server finds a CRLF (or LF), becuause that doesn't
+ * mean that there is another request, just a blank line.
+ */
+ while (1) {
+ if (APR_BRIGADE_EMPTY(ctx->b))
+ return APR_EOF;
+
+ e = APR_BRIGADE_FIRST(ctx->b);
+
+ rv = apr_bucket_read(e, &str, &len, APR_NONBLOCK_READ);
+
+ if (rv != APR_SUCCESS)
+ return rv;
+
+ c = str;
+ while (c < str + len) {
+ if (*c == APR_ASCII_LF)
+ c++;
+ else if (*c == APR_ASCII_CR && *(c + 1) == APR_ASCII_LF)
+ c += 2;
+ else
+ return APR_SUCCESS;
+ }
+
+ /* If we reach here, we were a bucket just full of CRLFs, so
+ * just toss the bucket. */
+ /* FIXME: Is this the right thing to do in the core? */
+ apr_bucket_delete(e);
+ }
+ return APR_SUCCESS;
+ }
+
+ /* If mode is EXHAUSTIVE, we want to just read everything until the end
+ * of the brigade, which in this case means the end of the socket.
+ * To do this, we attach the brigade that has currently been setaside to
+ * the brigade that was passed down, and send that brigade back.
+ *
+ * NOTE: This is VERY dangerous to use, and should only be done with
+ * extreme caution. FWLIW, this would be needed by an MPM like Perchild;
+ * such an MPM can easily request the socket and all data that has been
+ * read, which means that it can pass it to the correct child process.
+ */
+ if (mode == AP_MODE_EXHAUSTIVE) {
+ apr_bucket *e;
+
+ /* Tack on any buckets that were set aside. */
+ APR_BRIGADE_CONCAT(b, ctx->b);
+
+ /* Since we've just added all potential buckets (which will most
+ * likely simply be the socket bucket) we know this is the end,
+ * so tack on an EOS too. */
+ /* We have read until the brigade was empty, so we know that we
+ * must be EOS. */
+ e = apr_bucket_eos_create(f->c->bucket_alloc);
+ APR_BRIGADE_INSERT_TAIL(b, e);
+ return APR_SUCCESS;
+ }
+
+ /* read up to the amount they specified. */
+ if (mode == AP_MODE_READBYTES || mode == AP_MODE_SPECULATIVE) {
+ apr_bucket *e;
+
+ AP_DEBUG_ASSERT(readbytes > 0);
+
+ e = APR_BRIGADE_FIRST(ctx->b);
+ rv = apr_bucket_read(e, &str, &len, block);
+
+ if (APR_STATUS_IS_EAGAIN(rv) && block == APR_NONBLOCK_READ) {
+ /* getting EAGAIN for a blocking read is an error; for a
+ * non-blocking read, return an empty brigade. */
+ return APR_SUCCESS;
+ }
+ else if (rv != APR_SUCCESS) {
+ return rv;
+ }
+ else if (block == APR_BLOCK_READ && len == 0) {
+ /* We wanted to read some bytes in blocking mode. We read
+ * 0 bytes. Hence, we now assume we are EOS.
+ *
+ * When we are in normal mode, return an EOS bucket to the
+ * caller.
+ * When we are in speculative mode, leave ctx->b empty, so
+ * that the next call returns an EOS bucket.
+ */
+ apr_bucket_delete(e);
+
+ if (mode == AP_MODE_READBYTES) {
+ e = apr_bucket_eos_create(f->c->bucket_alloc);
+ APR_BRIGADE_INSERT_TAIL(b, e);
+ }
+ return APR_SUCCESS;
+ }
+
+ /* Have we read as much data as we wanted (be greedy)? */
+ if (len < readbytes) {
+ apr_size_t bucket_len;
+
+ rv = APR_SUCCESS;
+ /* We already registered the data in e in len */
+ e = APR_BUCKET_NEXT(e);
+ while ((len < readbytes) && (rv == APR_SUCCESS)
+ && (e != APR_BRIGADE_SENTINEL(ctx->b))) {
+ /* Check for the availability of buckets with known length */
+ if (e->length != (apr_size_t)-1) {
+ len += e->length;
+ e = APR_BUCKET_NEXT(e);
+ }
+ else {
+ /*
+ * Read from bucket, but non blocking. If there isn't any
+ * more data, well than this is fine as well, we will
+ * not wait for more since we already got some and we are
+ * only checking if there isn't more.
+ */
+ rv = apr_bucket_read(e, &str, &bucket_len,
+ APR_NONBLOCK_READ);
+ if (rv == APR_SUCCESS) {
+ len += bucket_len;
+ e = APR_BUCKET_NEXT(e);
+ }
+ }
+ }
+ }
+
+ /* We can only return at most what we read. */
+ if (len < readbytes) {
+ readbytes = len;
+ }
+
+ rv = apr_brigade_partition(ctx->b, readbytes, &e);
+ if (rv != APR_SUCCESS) {
+ return rv;
+ }
+
+ /* Must do move before CONCAT */
+ ctx->tmpbb = apr_brigade_split_ex(ctx->b, e, ctx->tmpbb);
+
+ if (mode == AP_MODE_READBYTES) {
+ APR_BRIGADE_CONCAT(b, ctx->b);
+ }
+ else if (mode == AP_MODE_SPECULATIVE) {
+ apr_bucket *copy_bucket;
+
+ for (e = APR_BRIGADE_FIRST(ctx->b);
+ e != APR_BRIGADE_SENTINEL(ctx->b);
+ e = APR_BUCKET_NEXT(e))
+ {
+ rv = apr_bucket_copy(e, &copy_bucket);
+ if (rv != APR_SUCCESS) {
+ return rv;
+ }
+ APR_BRIGADE_INSERT_TAIL(b, copy_bucket);
+ }
+ }
+
+ /* Take what was originally there and place it back on ctx->b */
+ APR_BRIGADE_CONCAT(ctx->b, ctx->tmpbb);
+ }
+ return APR_SUCCESS;
+}
+
+static void setaside_remaining_output(ap_filter_t *f,
+ core_output_filter_ctx_t *ctx,
+ apr_bucket_brigade *bb,
+ conn_rec *c);
+
+static apr_status_t send_brigade_nonblocking(apr_socket_t *s,
+ apr_bucket_brigade *bb,
+ core_output_filter_ctx_t *ctx,
+ conn_rec *c);
+
+static apr_status_t writev_nonblocking(apr_socket_t *s,
+ apr_bucket_brigade *bb,
+ core_output_filter_ctx_t *ctx,
+ apr_size_t bytes_to_write,
+ apr_size_t nvec,
+ conn_rec *c);
+
+#if APR_HAS_SENDFILE
+static apr_status_t sendfile_nonblocking(apr_socket_t *s,
+ apr_bucket *bucket,
+ core_output_filter_ctx_t *ctx,
+ conn_rec *c);
+#endif
+
+/* XXX: Should these be configurable parameters? */
+#define THRESHOLD_MIN_WRITE 4096
+
+/* Optional function coming from mod_logio, used for logging of output
+ * traffic
+ */
+extern APR_OPTIONAL_FN_TYPE(ap_logio_add_bytes_out) *ap__logio_add_bytes_out;
+
+static int should_send_brigade(apr_bucket_brigade *bb, conn_rec *c, int *flush)
+{
+ core_server_config *conf =
+ ap_get_core_module_config(c->base_server->module_config);
+ apr_size_t total_bytes = 0, non_file_bytes = 0;
+ apr_uint32_t eor_buckets = 0;
+ apr_bucket *bucket;
+ int need_flush = 0;
+
+ /* Scan through the brigade and decide whether we need to flush it,
+ * based on the following rules:
+ *
+ * a) The brigade contains a flush bucket: Do a blocking write
+ * of everything up that point.
+ *
+ * b) The request is in CONN_STATE_HANDLER state, and the brigade
+ * contains at least flush_max_threshold bytes in non-file
+ * buckets: Do blocking writes until the amount of data in the
+ * buffer is less than flush_max_threshold. (The point of this
+ * rule is to provide flow control, in case a handler is
+ * streaming out lots of data faster than the data can be
+ * sent to the client.)
+ *
+ * c) The request is in CONN_STATE_HANDLER state, and the brigade
+ * contains at least flush_max_pipelined EOR buckets:
+ * Do blocking writes until less than flush_max_pipelined EOR
+ * buckets are left. (The point of this rule is to prevent too many
+ * FDs being kept open by pipelined requests, possibly allowing a
+ * DoS).
+ *
+ * d) The brigade contains a morphing bucket: otherwise ap_save_brigade()
+ * could read the whole bucket into memory.
+ */
+ for (bucket = APR_BRIGADE_FIRST(bb);
+ bucket != APR_BRIGADE_SENTINEL(bb);
+ bucket = APR_BUCKET_NEXT(bucket)) {
+
+ if (!APR_BUCKET_IS_METADATA(bucket)) {
+ if (bucket->length == (apr_size_t)-1) {
+ if (flush) {
+ ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, c,
+ "core_output_filter: flushing because "
+ "of morphing bucket");
+ }
+ need_flush = 1;
+ break;
+ }
+
+ total_bytes += bucket->length;
+ if (!APR_BUCKET_IS_FILE(bucket)) {
+ non_file_bytes += bucket->length;
+ if (non_file_bytes > conf->flush_max_threshold) {
+ if (flush) {
+ ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, c,
+ "core_output_filter: flushing because "
+ "of max threshold");
+ }
+ need_flush = 1;
+ break;
+ }
+ }
+ }
+ else if (APR_BUCKET_IS_FLUSH(bucket)) {
+ if (flush) {
+ ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, c,
+ "core_output_filter: flushing because "
+ "of FLUSH bucket");
+ }
+ need_flush = 1;
+ break;
+ }
+ else if (AP_BUCKET_IS_EOR(bucket)
+ && conf->flush_max_pipelined >= 0
+ && ++eor_buckets > conf->flush_max_pipelined) {
+ if (flush) {
+ ap_log_cerror(APLOG_MARK, APLOG_TRACE6, 0, c,
+ "core_output_filter: flushing because "
+ "of max pipelined");
+ }
+ need_flush = 1;
+ break;
+ }
+ }
+ if (flush) {
+ *flush = need_flush;
+ }
+
+ /* Also send if above flush_min_threshold, or if there are FILE buckets */
+ return (need_flush
+ || total_bytes >= THRESHOLD_MIN_WRITE
+ || total_bytes > non_file_bytes);
+}
+
+apr_status_t ap_core_output_filter(ap_filter_t *f, apr_bucket_brigade *new_bb)
+{
+ conn_rec *c = f->c;
+ core_net_rec *net = f->ctx;
+ core_output_filter_ctx_t *ctx = net->out_ctx;
+ apr_bucket_brigade *bb = NULL;
+ apr_status_t rv = APR_SUCCESS;
+
+ /* Fail quickly if the connection has already been aborted. */
+ if (c->aborted) {
+ if (new_bb != NULL) {
+ apr_brigade_cleanup(new_bb);
+ }
+ return APR_ECONNABORTED;
+ }
+
+ if (ctx == NULL) {
+ ctx = apr_pcalloc(c->pool, sizeof(*ctx));
+ net->out_ctx = (core_output_filter_ctx_t *)ctx;
+ /*
+ * Need to create buffered_bb brigade with correct lifetime. Passing
+ * NULL to ap_save_brigade() would result in a brigade
+ * allocated from bb->pool which might be wrong.
+ */
+ ctx->buffered_bb = apr_brigade_create(c->pool, c->bucket_alloc);
+ }
+
+ if (new_bb != NULL)
+ bb = new_bb;
+
+ if ((ctx->buffered_bb != NULL) &&
+ !APR_BRIGADE_EMPTY(ctx->buffered_bb)) {
+ if (new_bb != NULL) {
+ APR_BRIGADE_PREPEND(bb, ctx->buffered_bb);
+ }
+ else {
+ bb = ctx->buffered_bb;
+ }
+ }
+ else if (new_bb == NULL) {
+ c->data_in_output_filters = 0;
+ return APR_SUCCESS;
+ }
+
+ if (!new_bb || should_send_brigade(bb, c, NULL)) {
+ apr_socket_t *sock = net->client_socket;
+ apr_interval_time_t sock_timeout = 0;
+
+ /* Non-blocking writes on the socket in any case. */
+ apr_socket_timeout_get(sock, &sock_timeout);
+ apr_socket_timeout_set(sock, 0);
+
+ do {
+ rv = send_brigade_nonblocking(sock, bb, ctx, c);
+ if (new_bb && APR_STATUS_IS_EAGAIN(rv)) {
+ /* Scan through the brigade and decide whether we must absolutely
+ * flush the remaining data, based on should_send_brigade() &flush
+ * rules. If so, wait for writability and retry, otherwise we did
+ * our best already and can wait for the next call.
+ */
+ int flush;
+ (void)should_send_brigade(bb, c, &flush);
+ if (flush) {
+ apr_int32_t nfd;
+ apr_pollfd_t pfd;
+ memset(&pfd, 0, sizeof(pfd));
+ pfd.reqevents = APR_POLLOUT;
+ pfd.desc_type = APR_POLL_SOCKET;
+ pfd.desc.s = sock;
+ pfd.p = c->pool;
+ do {
+ rv = apr_poll(&pfd, 1, &nfd, sock_timeout);
+ } while (APR_STATUS_IS_EINTR(rv));
+ }
+ }
+ } while (rv == APR_SUCCESS && !APR_BRIGADE_EMPTY(bb));
+
+ /* Restore original socket timeout before leaving. */
+ apr_socket_timeout_set(sock, sock_timeout);
+ }
+
+ if (rv != APR_SUCCESS && !APR_STATUS_IS_EAGAIN(rv)) {
+ /* The client has aborted the connection */
+ ap_log_cerror(
+ APLOG_MARK, APLOG_TRACE1, rv, c,
+ "core_output_filter: writing data to the network");
+ /*
+ * Set c->aborted before apr_brigade_cleanup to have the correct status
+ * when logging the request as apr_brigade_cleanup triggers the logging
+ * of the request if it contains an EOR bucket.
+ */
+ c->aborted = 1;
+ apr_brigade_cleanup(bb);
+ return rv;
+ }
+
+ setaside_remaining_output(f, ctx, bb, c);
+ return APR_SUCCESS;
+}
+
+/*
+ * This function assumes that either ctx->buffered_bb == NULL, or
+ * ctx->buffered_bb is empty, or ctx->buffered_bb == bb
+ */
+static void setaside_remaining_output(ap_filter_t *f,
+ core_output_filter_ctx_t *ctx,
+ apr_bucket_brigade *bb,
+ conn_rec *c)
+{
+ apr_bucket *bucket;
+
+ /* Don't set aside leading empty buckets, all previous data have been
+ * consumed so it's safe to delete them now.
+ */
+ while (((bucket = APR_BRIGADE_FIRST(bb)) != APR_BRIGADE_SENTINEL(bb)) &&
+ (APR_BUCKET_IS_METADATA(bucket) || (bucket->length == 0))) {
+ apr_bucket_delete(bucket);
+ }
+
+ c->data_in_output_filters = 0;
+ if (!APR_BRIGADE_EMPTY(bb)) {
+ c->data_in_output_filters = 1;
+ if (bb != ctx->buffered_bb) {
+ if (!ctx->deferred_write_pool) {
+ apr_pool_create(&ctx->deferred_write_pool, c->pool);
+ apr_pool_tag(ctx->deferred_write_pool, "deferred_write");
+ }
+ ap_save_brigade(f, &(ctx->buffered_bb), &bb,
+ ctx->deferred_write_pool);
+ }
+ }
+ else if (ctx->deferred_write_pool) {
+ /*
+ * There are no more requests in the pipeline. We can just clear the
+ * pool.
+ */
+ apr_pool_clear(ctx->deferred_write_pool);
+ }
+}
+
+#ifndef APR_MAX_IOVEC_SIZE
+#define NVEC_MIN 16
+#define NVEC_MAX NVEC_MIN
+#else
+#if APR_MAX_IOVEC_SIZE > 16
+#define NVEC_MIN 16
+#else
+#define NVEC_MIN APR_MAX_IOVEC_SIZE
+#endif
+#define NVEC_MAX APR_MAX_IOVEC_SIZE
+#endif
+
+static APR_INLINE int is_in_memory_bucket(apr_bucket *b)
+{
+ /* These buckets' data are already in memory. */
+ return APR_BUCKET_IS_HEAP(b)
+ || APR_BUCKET_IS_POOL(b)
+ || APR_BUCKET_IS_TRANSIENT(b)
+ || APR_BUCKET_IS_IMMORTAL(b);
+}
+
+#if APR_HAS_SENDFILE
+static APR_INLINE int can_sendfile_bucket(apr_bucket *b)
+{
+ /* Use sendfile to send the bucket unless:
+ * - the bucket is not a file bucket, or
+ * - the file is too small for sendfile to be useful, or
+ * - sendfile is disabled in the httpd config via "EnableSendfile off".
+ */
+ if (APR_BUCKET_IS_FILE(b) && b->length >= AP_MIN_SENDFILE_BYTES) {
+ apr_file_t *file = ((apr_bucket_file *)b->data)->fd;
+ return apr_file_flags_get(file) & APR_SENDFILE_ENABLED;
+ }
+ else {
+ return 0;
+ }
+}
+#endif
+
+#if defined(WIN32) && (APR_MAJOR_VERSION == 1 && APR_MINOR_VERSION <= 7)
+#undef APR_TCP_NOPUSH_FLAG
+#define APR_TCP_NOPUSH_FLAG 0
+#endif
+
+static APR_INLINE void sock_nopush(apr_socket_t *s, int to)
+{
+ /* Disable TCP_NOPUSH handling on OSX since unsetting it won't push
+ * retained data, which might introduce delays if further data don't
+ * come soon enough or cause the last chunk to be sent only when the
+ * connection is shutdown (e.g. after KeepAliveTimeout).
+ */
+#if APR_TCP_NOPUSH_FLAG && !defined(__APPLE__)
+ (void)apr_socket_opt_set(s, APR_TCP_NOPUSH, to);
+#endif
+}
+
+static apr_status_t send_brigade_nonblocking(apr_socket_t *s,
+ apr_bucket_brigade *bb,
+ core_output_filter_ctx_t *ctx,
+ conn_rec *c)
+{
+ apr_status_t rv = APR_SUCCESS;
+ core_server_config *conf =
+ ap_get_core_module_config(c->base_server->module_config);
+ apr_size_t nvec = 0, nbytes = 0;
+ apr_bucket *bucket, *next;
+ const char *data;
+ apr_size_t length;
+
+ for (bucket = APR_BRIGADE_FIRST(bb);
+ bucket != APR_BRIGADE_SENTINEL(bb);
+ bucket = next) {
+ next = APR_BUCKET_NEXT(bucket);
+
+#if APR_HAS_SENDFILE
+ if (can_sendfile_bucket(bucket)) {
+ if (nvec > 0) {
+ sock_nopush(s, 1);
+ rv = writev_nonblocking(s, bb, ctx, nbytes, nvec, c);
+ if (rv != APR_SUCCESS) {
+ goto cleanup;
+ }
+ nbytes = 0;
+ nvec = 0;
+ }
+ rv = sendfile_nonblocking(s, bucket, ctx, c);
+ if (rv != APR_SUCCESS) {
+ goto cleanup;
+ }
+ continue;
+ }
+#endif /* APR_HAS_SENDFILE */
+
+ if (bucket->length) {
+ /* Non-blocking read first, in case this is a morphing
+ * bucket type. */
+ rv = apr_bucket_read(bucket, &data, &length, APR_NONBLOCK_READ);
+ if (APR_STATUS_IS_EAGAIN(rv)) {
+ /* Read would block; flush any pending data and retry. */
+ if (nvec) {
+ rv = writev_nonblocking(s, bb, ctx, nbytes, nvec, c);
+ if (rv != APR_SUCCESS) {
+ goto cleanup;
+ }
+ nbytes = 0;
+ nvec = 0;
+ }
+ sock_nopush(s, 0);
+
+ rv = apr_bucket_read(bucket, &data, &length, APR_BLOCK_READ);
+ }
+ if (rv != APR_SUCCESS) {
+ goto cleanup;
+ }
+
+ /* reading may have split the bucket, so recompute next: */
+ next = APR_BUCKET_NEXT(bucket);
+ }
+
+ if (!bucket->length) {
+ /* Don't delete empty buckets until all the previous ones have been
+ * sent (nvec == 0); this must happen in sequence since metabuckets
+ * like EOR could free the data still pointed to by the iovec. So
+ * unless the latter is empty, let writev_nonblocking() cleanup the
+ * brigade in order.
+ */
+ if (!nvec) {
+ apr_bucket_delete(bucket);
+ }
+ continue;
+ }
+
+ /* Make sure that these new data fit in our iovec. */
+ if (nvec == ctx->nvec) {
+ if (nvec == NVEC_MAX) {
+ sock_nopush(s, 1);
+ rv = writev_nonblocking(s, bb, ctx, nbytes, nvec, c);
+ if (rv != APR_SUCCESS) {
+ goto cleanup;
+ }
+ nbytes = 0;
+ nvec = 0;
+ }
+ else {
+ struct iovec *newvec;
+ apr_size_t newn = nvec * 2;
+ if (newn < NVEC_MIN) {
+ newn = NVEC_MIN;
+ }
+ else if (newn > NVEC_MAX) {
+ newn = NVEC_MAX;
+ }
+ newvec = apr_palloc(c->pool, newn * sizeof(struct iovec));
+ if (nvec) {
+ memcpy(newvec, ctx->vec, nvec * sizeof(struct iovec));
+ }
+ ctx->vec = newvec;
+ ctx->nvec = newn;
+ }
+ }
+ nbytes += length;
+ ctx->vec[nvec].iov_base = (void *)data;
+ ctx->vec[nvec].iov_len = length;
+ nvec++;
+
+ /* Flush above max threshold, unless the brigade still contains in
+ * memory buckets which we want to try writing in the same pass (if
+ * we are at the end of the brigade, the write will happen outside
+ * the loop anyway).
+ */
+ if (nbytes > conf->flush_max_threshold
+ && next != APR_BRIGADE_SENTINEL(bb)
+ && !is_in_memory_bucket(next)) {
+ sock_nopush(s, 1);
+ rv = writev_nonblocking(s, bb, ctx, nbytes, nvec, c);
+ if (rv != APR_SUCCESS) {
+ goto cleanup;
+ }
+ nbytes = 0;
+ nvec = 0;
+ }
+ }
+ if (nvec > 0) {
+ rv = writev_nonblocking(s, bb, ctx, nbytes, nvec, c);
+ }
+
+cleanup:
+ sock_nopush(s, 0);
+ return rv;
+}
+
+static apr_status_t writev_nonblocking(apr_socket_t *s,
+ apr_bucket_brigade *bb,
+ core_output_filter_ctx_t *ctx,
+ apr_size_t bytes_to_write,
+ apr_size_t nvec,
+ conn_rec *c)
+{
+ apr_status_t rv;
+ struct iovec *vec = ctx->vec;
+ apr_size_t bytes_written = 0;
+ apr_size_t i, offset = 0;
+
+ do {
+ apr_size_t n = 0;
+ rv = apr_socket_sendv(s, vec + offset, nvec - offset, &n);
+ bytes_written += n;
+
+ for (i = offset; i < nvec; ) {
+ apr_bucket *bucket = APR_BRIGADE_FIRST(bb);
+ if (!bucket->length) {
+ apr_bucket_delete(bucket);
+ }
+ else if (n >= vec[i].iov_len) {
+ apr_bucket_delete(bucket);
+ n -= vec[i++].iov_len;
+ offset++;
+ }
+ else {
+ if (n) {
+ apr_bucket_split(bucket, n);
+ apr_bucket_delete(bucket);
+ vec[i].iov_len -= n;
+ vec[i].iov_base = (char *) vec[i].iov_base + n;
+ }
+ break;
+ }
+ }
+ } while (rv == APR_SUCCESS && bytes_written < bytes_to_write);
+
+ if ((ap__logio_add_bytes_out != NULL) && (bytes_written > 0)) {
+ ap__logio_add_bytes_out(c, bytes_written);
+ }
+ ctx->bytes_written += bytes_written;
+
+ ap_log_cerror(APLOG_MARK, APLOG_TRACE6, rv, c,
+ "writev_nonblocking: %"APR_SIZE_T_FMT"/%"APR_SIZE_T_FMT,
+ bytes_written, bytes_to_write);
+ return rv;
+}
+
+#if APR_HAS_SENDFILE
+
+static apr_status_t sendfile_nonblocking(apr_socket_t *s,
+ apr_bucket *bucket,
+ core_output_filter_ctx_t *ctx,
+ conn_rec *c)
+{
+ apr_status_t rv;
+ apr_file_t *file = ((apr_bucket_file *)bucket->data)->fd;
+ apr_size_t bytes_written = bucket->length; /* bytes_to_write for now */
+ apr_off_t file_offset = bucket->start;
+
+ rv = apr_socket_sendfile(s, file, NULL, &file_offset, &bytes_written, 0);
+ if ((ap__logio_add_bytes_out != NULL) && (bytes_written > 0)) {
+ ap__logio_add_bytes_out(c, bytes_written);
+ }
+ ctx->bytes_written += bytes_written;
+
+ ap_log_cerror(APLOG_MARK, APLOG_TRACE6, rv, c,
+ "sendfile_nonblocking: %" APR_SIZE_T_FMT "/%" APR_SIZE_T_FMT,
+ bytes_written, bucket->length);
+ if (bytes_written >= bucket->length) {
+ apr_bucket_delete(bucket);
+ }
+ else if (bytes_written > 0) {
+ apr_bucket_split(bucket, bytes_written);
+ apr_bucket_delete(bucket);
+ if (rv == APR_SUCCESS) {
+ rv = APR_EAGAIN;
+ }
+ }
+ return rv;
+}
+
+#endif