diff options
Diffstat (limited to 'htp/htp_util.c')
-rw-r--r-- | htp/htp_util.c | 2602 |
1 files changed, 2602 insertions, 0 deletions
diff --git a/htp/htp_util.c b/htp/htp_util.c new file mode 100644 index 0000000..936e22b --- /dev/null +++ b/htp/htp_util.c @@ -0,0 +1,2602 @@ +/*************************************************************************** + * Copyright (c) 2009-2010 Open Information Security Foundation + * Copyright (c) 2010-2013 Qualys, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + + * - Neither the name of the Qualys, Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ ***************************************************************************/ + +/** + * @file + * @author Ivan Ristic <ivanr@webkreator.com> + */ + +#include "htp_config_auto.h" + +//inet_pton +#if _WIN32 +#include <ws2tcpip.h> +#else // mac, linux, freebsd +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#endif + +#include "htp_private.h" + +/** + * Is character a linear white space character? + * + * @param[in] c + * @return 0 or 1 + */ +int htp_is_lws(int c) { + if ((c == ' ') || (c == '\t')) return 1; + else return 0; +} + +/** + * Is character a separator character? + * + * @param[in] c + * @return 0 or 1 + */ +int htp_is_separator(int c) { + /* separators = "(" | ")" | "<" | ">" | "@" + | "," | ";" | ":" | "\" | <"> + | "/" | "[" | "]" | "?" | "=" + | "{" | "}" | SP | HT */ + switch (c) { + case '(': + case ')': + case '<': + case '>': + case '@': + case ',': + case ';': + case ':': + case '\\': + case '"': + case '/': + case '[': + case ']': + case '?': + case '=': + case '{': + case '}': + case ' ': + case '\t': + return 1; + break; + default: + return 0; + } +} + +/** + * Is character a text character? + * + * @param[in] c + * @return 0 or 1 + */ +int htp_is_text(int c) { + if (c == '\t') return 1; + if (c < 32) return 0; + return 1; +} + +/** + * Is character a token character? + * + * @param[in] c + * @return 0 or 1 + */ +int htp_is_token(int c) { + /* token = 1*<any CHAR except CTLs or separators> */ + /* CHAR = <any US-ASCII character (octets 0 - 127)> */ + if ((c < 32) || (c > 126)) return 0; + if (htp_is_separator(c)) return 0; + return 1; +} + +/** + * Remove all line terminators (LF, CR or CRLF) from + * the end of the line provided as input. + * + * @return 0 if nothing was removed, 1 if one or more LF characters were removed, or + * 2 if one or more CR and/or LF characters were removed. 
+ */ +int htp_chomp(unsigned char *data, size_t *len) { + int r = 0; + + // Loop until there's no more stuff in the buffer + while (*len > 0) { + // Try one LF first + if (data[*len - 1] == LF) { + (*len)--; + r = 1; + + if (*len == 0) return r; + + // A CR is allowed before LF + if (data[*len - 1] == CR) { + (*len)--; + r = 2; + } + } else if (data[*len - 1] == CR) { + (*len)--; + r = 1; + } else return r; + } + + return r; +} + +/** + * Is character a white space character? + * + * @param[in] c + * @return 0 or 1 + */ +int htp_is_space(int c) { + switch (c) { + case ' ': + case '\f': + case '\v': + case '\t': + case '\r': + case '\n': + return 1; + default: + return 0; + } +} + +/** + * Converts request method, given as a string, into a number. + * + * @param[in] method + * @return Method number of M_UNKNOWN + */ +int htp_convert_method_to_number(bstr *method) { + if (method == NULL) return HTP_M_UNKNOWN; + + // TODO Optimize using parallel matching, or something similar. + + if (bstr_cmp_c(method, "GET") == 0) return HTP_M_GET; + if (bstr_cmp_c(method, "PUT") == 0) return HTP_M_PUT; + if (bstr_cmp_c(method, "POST") == 0) return HTP_M_POST; + if (bstr_cmp_c(method, "DELETE") == 0) return HTP_M_DELETE; + if (bstr_cmp_c(method, "CONNECT") == 0) return HTP_M_CONNECT; + if (bstr_cmp_c(method, "OPTIONS") == 0) return HTP_M_OPTIONS; + if (bstr_cmp_c(method, "TRACE") == 0) return HTP_M_TRACE; + if (bstr_cmp_c(method, "PATCH") == 0) return HTP_M_PATCH; + if (bstr_cmp_c(method, "PROPFIND") == 0) return HTP_M_PROPFIND; + if (bstr_cmp_c(method, "PROPPATCH") == 0) return HTP_M_PROPPATCH; + if (bstr_cmp_c(method, "MKCOL") == 0) return HTP_M_MKCOL; + if (bstr_cmp_c(method, "COPY") == 0) return HTP_M_COPY; + if (bstr_cmp_c(method, "MOVE") == 0) return HTP_M_MOVE; + if (bstr_cmp_c(method, "LOCK") == 0) return HTP_M_LOCK; + if (bstr_cmp_c(method, "UNLOCK") == 0) return HTP_M_UNLOCK; + if (bstr_cmp_c(method, "VERSION-CONTROL") == 0) return HTP_M_VERSION_CONTROL; + if 
(bstr_cmp_c(method, "CHECKOUT") == 0) return HTP_M_CHECKOUT; + if (bstr_cmp_c(method, "UNCHECKOUT") == 0) return HTP_M_UNCHECKOUT; + if (bstr_cmp_c(method, "CHECKIN") == 0) return HTP_M_CHECKIN; + if (bstr_cmp_c(method, "UPDATE") == 0) return HTP_M_UPDATE; + if (bstr_cmp_c(method, "LABEL") == 0) return HTP_M_LABEL; + if (bstr_cmp_c(method, "REPORT") == 0) return HTP_M_REPORT; + if (bstr_cmp_c(method, "MKWORKSPACE") == 0) return HTP_M_MKWORKSPACE; + if (bstr_cmp_c(method, "MKACTIVITY") == 0) return HTP_M_MKACTIVITY; + if (bstr_cmp_c(method, "BASELINE-CONTROL") == 0) return HTP_M_BASELINE_CONTROL; + if (bstr_cmp_c(method, "MERGE") == 0) return HTP_M_MERGE; + if (bstr_cmp_c(method, "INVALID") == 0) return HTP_M_INVALID; + if (bstr_cmp_c(method, "HEAD") == 0) return HTP_M_HEAD; + + return HTP_M_UNKNOWN; +} + +/** + * Is the given line empty? + * + * @param[in] data + * @param[in] len + * @return 0 or 1 + */ +int htp_is_line_empty(unsigned char *data, size_t len) { + if (((len == 1) && ((data[0] == CR) || (data[0] == LF))) || + ((len == 2) && (data[0] == CR) && (data[1] == LF))) { + return 1; + } + + return 0; +} + +/** + * Does line consist entirely of whitespace characters? + * + * @param[in] data + * @param[in] len + * @return 0 or 1 + */ +int htp_is_line_whitespace(unsigned char *data, size_t len) { + size_t i; + + for (i = 0; i < len; i++) { + if (!isspace(data[i])) { + return 0; + } + } + + return 1; +} + +/** + * Parses Content-Length string (positive decimal number). + * White space is allowed before and after the number. + * + * @param[in] b + * @return Content-Length as a number, or -1 on error. 
+ */ +int64_t htp_parse_content_length(bstr *b, htp_connp_t *connp) { + size_t len = bstr_len(b); + unsigned char * data = (unsigned char *) bstr_ptr(b); + size_t pos = 0; + int64_t r = 0; + + if (len == 0) return -1003; + + // Ignore junk before + while ((pos < len) && (data[pos] < '0' || data[pos] > '9')) { + if (!htp_is_lws(data[pos]) && connp != NULL && r == 0) { + htp_log(connp, HTP_LOG_MARK, HTP_LOG_WARNING, 0, + "C-L value with extra data in the beginning"); + r = -1; + } + pos++; + } + if (pos == len) return -1001; + + r = bstr_util_mem_to_pint(data + pos, len - pos, 10, &pos); + // Ok to have junk afterwards + if (pos < len && connp != NULL) { + htp_log(connp, HTP_LOG_MARK, HTP_LOG_WARNING, 0, + "C-L value with extra data in the end"); + } + return r; +} + +/** + * Parses chunk length (positive hexadecimal number). White space is allowed before + * and after the number. An error will be returned if the chunk length is greater than + * INT32_MAX. + * + * @param[in] data + * @param[in] len + * @return Chunk length, or a negative number on error. 
+ */ +int64_t htp_parse_chunked_length(unsigned char *data, size_t len, int *extension) { + // skip leading line feeds and other control chars + while (len) { + unsigned char c = *data; + if (!(c == 0x0d || c == 0x0a || c == 0x20 || c == 0x09 || c == 0x0b || c == 0x0c)) + break; + data++; + len--; + } + if (len == 0) + return -1004; + + // find how much of the data is correctly formatted + size_t i = 0; + while (i < len) { + unsigned char c = data[i]; + if (!(isdigit(c) || + (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'))) + break; + i++; + } + // cut off trailing junk + if (i != len) { + if (extension) { + size_t j = i; + while (j < len) { + if (data[j] == ';') { + *extension = 1; + break; + } + j++; + } + } + len = i; + } + + int64_t chunk_len = htp_parse_positive_integer_whitespace(data, len, 16); + if (chunk_len < 0) return chunk_len; + if (chunk_len > INT32_MAX) return -1; + return chunk_len; +} + +/** + * A somewhat forgiving parser for a positive integer in a given base. + * Only LWS is allowed before and after the number. + * + * @param[in] data + * @param[in] len + * @param[in] base + * @return The parsed number on success; a negative number on error. + */ +int64_t htp_parse_positive_integer_whitespace(unsigned char *data, size_t len, int base) { + if (len == 0) return -1003; + + size_t last_pos; + size_t pos = 0; + + // Ignore LWS before + while ((pos < len) && (htp_is_lws(data[pos]))) pos++; + if (pos == len) return -1001; + + int64_t r = bstr_util_mem_to_pint(data + pos, len - pos, base, &last_pos); + if (r < 0) return r; + + // Move after the last digit + pos += last_pos; + + // Ignore LWS after + while (pos < len) { + if (!htp_is_lws(data[pos])) { + return -1002; + } + + pos++; + } + + return r; +} + +#ifdef HTP_DEBUG + +/** + * Prints one log message to stderr. 
+ * + * @param[in] stream + * @param[in] log + */ +void htp_print_log(FILE *stream, htp_log_t *log) { + if (log->code != 0) { + fprintf(stream, "[%d][code %d][file %s][line %d] %s\n", log->level, + log->code, log->file, log->line, log->msg); + } else { + fprintf(stream, "[%d][file %s][line %d] %s\n", log->level, + log->file, log->line, log->msg); + } +} +#endif + +/** + * Records one log message. + * + * @param[in] connp + * @param[in] file + * @param[in] line + * @param[in] level + * @param[in] code + * @param[in] fmt + */ +void htp_log(htp_connp_t *connp, const char *file, int line, enum htp_log_level_t level, int code, const char *fmt, ...) { + if (connp == NULL) return; + + char buf[1024]; + va_list args; + + // Ignore messages below our log level. + if (connp->cfg->log_level < level) { + return; + } + + va_start(args, fmt); + + int r = vsnprintf(buf, 1024, fmt, args); + + va_end(args); + + if (r < 0) { + snprintf(buf, 1024, "[vnsprintf returned error %d]", r); + } else if (r >= 1024) { + // Indicate overflow with a '+' at the end. + buf[1022] = '+'; + buf[1023] = '\0'; + } + + // Create a new log entry. + + htp_log_t *log = calloc(1, sizeof (htp_log_t)); + if (log == NULL) return; + + log->connp = connp; + log->file = file; + log->line = line; + log->level = level; + log->code = code; + log->msg = strdup(buf); + + if (htp_list_add(connp->conn->messages, log) != HTP_OK) { + free((void *) log->msg); + free(log); + return; + } + + if (level == HTP_LOG_ERROR) { + connp->last_error = log; + } + + #ifdef HTP_DEBUG + fprintf(stderr, "[LOG] %s\n", log->msg); + #endif + + /* coverity[check_return] */ + htp_hook_run_all(connp->cfg->hook_log, log); +} + +/** + * Determines if the given line is a continuation (of some previous line). + * + * @param[in] data + * @param[in] len + * @return 0 or 1 for false and true, respectively. Returns -1 on error (NULL pointer or length zero). 
+ */ +int htp_connp_is_line_folded(unsigned char *data, size_t len) { + if ((data == NULL) || (len == 0)) return -1; + return htp_is_folding_char(data[0]); +} + +int htp_is_folding_char(int c) { + if (htp_is_lws(c) || c == 0) return 1; + else return 0; +} + +/** + * Determines if the given line is a request terminator. + * + * @param[in] connp + * @param[in] data + * @param[in] len + * @return 0 or 1 + */ +int htp_connp_is_line_terminator(htp_connp_t *connp, unsigned char *data, size_t len, int next_no_lf) { + // Is this the end of request headers? + switch (connp->cfg->server_personality) { + case HTP_SERVER_IIS_5_1: + // IIS 5 will accept a whitespace line as a terminator + if (htp_is_line_whitespace(data, len)) { + return 1; + } + + // Fall through + default: + // Treat an empty line as terminator + if (htp_is_line_empty(data, len)) { + return 1; + } + // Only space is terminator if terminator does not follow right away + if (len == 2 && htp_is_lws(data[0]) && data[1] == LF) { + return next_no_lf; + } + break; + } + + return 0; +} + +/** + * Determines if the given line can be ignored when it appears before a request. + * + * @param[in] connp + * @param[in] data + * @param[in] len + * @return 0 or 1 + */ +int htp_connp_is_line_ignorable(htp_connp_t *connp, unsigned char *data, size_t len) { + return htp_connp_is_line_terminator(connp, data, len, 0); +} + +static htp_status_t htp_parse_port(unsigned char *data, size_t len, int *port, int *invalid) { + if (len == 0) { + *port = -1; + *invalid = 1; + return HTP_OK; + } + + int64_t port_parsed = htp_parse_positive_integer_whitespace(data, len, 10); + + if (port_parsed < 0) { + // Failed to parse the port number. + *port = -1; + *invalid = 1; + } else if ((port_parsed > 0) && (port_parsed < 65536)) { + // Valid port number. + *port = (int) port_parsed; + } else { + // Port number out of range. 
+ *port = -1; + *invalid = 1; + } + + return HTP_OK; +} + +/** + * Parses an authority string, which consists of a hostname with an optional port number; username + * and password are not allowed and will not be handled. + * + * @param[in] hostport + * @param[out] hostname A bstring containing the hostname, or NULL if the hostname is invalid. If this value + * is not NULL, the caller assumes responsibility for memory management. + * @param[out] port Port as text, or NULL if not provided. + * @param[out] port_number Port number, or -1 if the port is not present or invalid. + * @param[out] invalid Set to 1 if any part of the authority is invalid. + * @return HTP_OK on success, HTP_ERROR on memory allocation failure. + */ +htp_status_t htp_parse_hostport(bstr *hostport, bstr **hostname, bstr **port, int *port_number, int *invalid) { + if ((hostport == NULL) || (hostname == NULL) || (port_number == NULL) || (invalid == NULL)) return HTP_ERROR; + + *hostname = NULL; + if (port != NULL) { + *port = NULL; + } + *port_number = -1; + *invalid = 0; + + unsigned char *data = bstr_ptr(hostport); + size_t len = bstr_len(hostport); + + bstr_util_mem_trim(&data, &len); + + if (len == 0) { + *invalid = 1; + return HTP_OK; + } + + // Check for an IPv6 address. + if (data[0] == '[') { + // IPv6 host. + + // Find the end of the IPv6 address. + size_t pos = 0; + while ((pos < len) && (data[pos] != ']')) pos++; + if (pos == len) { + *invalid = 1; + return HTP_OK; + } + + *hostname = bstr_dup_mem(data, pos + 1); + if (*hostname == NULL) return HTP_ERROR; + + // Over the ']'. + pos++; + if (pos == len) return HTP_OK; + + // Handle port. + if (data[pos] == ':') { + if (port != NULL) { + *port = bstr_dup_mem(data + pos + 1, len - pos - 1); + if (*port == NULL) { + bstr_free(*hostname); + return HTP_ERROR; + } + } + + return htp_parse_port(data + pos + 1, len - pos - 1, port_number, invalid); + } else { + *invalid = 1; + return HTP_OK; + } + } else { + // Not IPv6 host. 
+ + // Is there a colon? + unsigned char *colon = memchr(data, ':', len); + if (colon == NULL) { + // Hostname alone, no port. + + *hostname = bstr_dup_mem(data, len); + if (*hostname == NULL) return HTP_ERROR; + + bstr_to_lowercase(*hostname); + } else { + // Hostname and port. + + // Ignore whitespace at the end of hostname. + unsigned char *hostend = colon; + while ((hostend > data) && (isspace(*(hostend - 1)))) hostend--; + + *hostname = bstr_dup_mem(data, hostend - data); + if (*hostname == NULL) return HTP_ERROR; + + if (port != NULL) { + *port = bstr_dup_mem(colon + 1, len - (colon + 1 - data)); + if (*port == NULL) { + bstr_free(*hostname); + return HTP_ERROR; + } + } + + return htp_parse_port(colon + 1, len - (colon + 1 - data), port_number, invalid); + } + } + + return HTP_OK; +} + +/** + * Parses hostport provided in the URI. + * + * @param[in] connp + * @param[in] hostport + * @param[in] uri + * @return HTP_OK on success or HTP_ERROR error. + */ +int htp_parse_uri_hostport(htp_connp_t *connp, bstr *hostport, htp_uri_t *uri) { + int invalid; + + htp_status_t rc = htp_parse_hostport(hostport, &(uri->hostname), &(uri->port), &(uri->port_number), &invalid); + if (rc != HTP_OK) return rc; + + if (invalid) { + connp->in_tx->flags |= HTP_HOSTU_INVALID; + } + + if (uri->hostname != NULL) { + if (htp_validate_hostname(uri->hostname) == 0) { + connp->in_tx->flags |= HTP_HOSTU_INVALID; + } + } + + return HTP_OK; +} + +/** + * Parses hostport provided in the Host header. + * + * @param[in] hostport + * @param[out] hostname + * @param[out] port + * @param[out] port_number + * @param[out] flags + * @return HTP_OK on success or HTP_ERROR error. 
/**
 * Parses hostport provided in the Host header.
 *
 * HTP_HOSTH_INVALID is OR-ed into *flags when htp_parse_hostport() reports
 * the authority as invalid, or when the returned hostname fails
 * htp_validate_hostname().
 *
 * @param[in] hostport
 * @param[out] hostname
 * @param[out] port
 * @param[out] port_number
 * @param[out] flags Existing bits are preserved; only HTP_HOSTH_INVALID may be added.
 * @return HTP_OK on success, or the status returned by htp_parse_hostport() on error.
 */
htp_status_t htp_parse_header_hostport(bstr *hostport, bstr **hostname, bstr **port, int *port_number, uint64_t *flags) {
    int invalid;

    htp_status_t rc = htp_parse_hostport(hostport, hostname, port, port_number, &invalid);
    if (rc != HTP_OK) return rc;

    if (invalid) {
        *flags |= HTP_HOSTH_INVALID;
    }

    if (*hostname != NULL) {
        if (htp_validate_hostname(*hostname) == 0) {
            *flags |= HTP_HOSTH_INVALID;
        }
    }

    return HTP_OK;
}

/**
 * Parses request URI, making no attempt to validate the contents.
 *
 * The input is split into scheme, username, password, hostname, port, path,
 * query and fragment; each part found is copied into the matching field of
 * the URI structure with bstr_dup_mem(). Trailing space characters in the
 * input are ignored.
 *
 * @param[in] input The URI text; may be NULL, in which case nothing is parsed.
 * @param[in,out] uri Address of the URI structure pointer. If *uri is NULL, a
 *                    zero-initialised structure is allocated and stored there.
 * @return HTP_ERROR on memory allocation failure, HTP_OK otherwise
 */
int htp_parse_uri(bstr *input, htp_uri_t **uri) {
    // Allow a htp_uri_t structure to be provided on input,
    // but allocate a new one if the structure is NULL.
    if (*uri == NULL) {
        *uri = calloc(1, sizeof (htp_uri_t));
        if (*uri == NULL) return HTP_ERROR;
    }

    if (input == NULL) {
        // The input might be NULL on requests that don't actually
        // contain the URI. We allow that.
        return HTP_OK;
    }

    unsigned char *data = bstr_ptr(input);
    size_t len = bstr_len(input);
    // remove trailing spaces
    while (len > 0) {
        if (data[len-1] != ' ') {
            break;
        }
        len--;
    }
    size_t start, pos;

    if (len == 0) {
        // Empty string.
        return HTP_OK;
    }

    pos = 0;

    // Scheme test: if it doesn't start with a forward slash character (which it must
    // for the contents to be a path or an authority), then it must be the scheme part
    if (data[0] != '/') {
        // Parse scheme

        // Find the colon, which marks the end of the scheme part
        start = pos;
        while ((pos < len) && (data[pos] != ':')) pos++;

        if (pos >= len) {
            // We haven't found a colon, which means that the URI
            // is invalid. Apache will ignore this problem and assume
            // the URI contains an invalid path so, for the time being,
            // we are going to do the same.
            pos = 0;
        } else {
            // Make a copy of the scheme
            (*uri)->scheme = bstr_dup_mem(data + start, pos - start);
            if ((*uri)->scheme == NULL) return HTP_ERROR;

            // Go over the colon
            pos++;
        }
    }

    // Authority test: two forward slash characters and it's an authority.
    // One, three or more slash characters, and it's a path. We, however,
    // only attempt to parse authority if we've seen a scheme.
    if ((*uri)->scheme != NULL)
        if ((pos + 2 < len) && (data[pos] == '/') && (data[pos + 1] == '/') && (data[pos + 2] != '/')) {
            // Parse authority

            // Go over the two slash characters
            start = pos = pos + 2;

            // Authority ends with a question mark, forward slash or hash
            while ((pos < len) && (data[pos] != '?') && (data[pos] != '/') && (data[pos] != '#')) pos++;

            unsigned char *hostname_start;
            size_t hostname_len;

            // Are the credentials included in the authority?
            unsigned char *m = memchr(data + start, '@', pos - start);
            if (m != NULL) {
                // Credentials present
                unsigned char *credentials_start = data + start;
                size_t credentials_len = m - data - start;

                // Figure out just the hostname part
                hostname_start = data + start + credentials_len + 1;
                hostname_len = pos - start - credentials_len - 1;

                // Extract the username and the password
                m = memchr(credentials_start, ':', credentials_len);
                if (m != NULL) {
                    // Username and password
                    (*uri)->username = bstr_dup_mem(credentials_start, m - credentials_start);
                    if ((*uri)->username == NULL) return HTP_ERROR;
                    (*uri)->password = bstr_dup_mem(m + 1, credentials_len - (m - credentials_start) - 1);
                    if ((*uri)->password == NULL) return HTP_ERROR;
                } else {
                    // Username alone
                    (*uri)->username = bstr_dup_mem(credentials_start, credentials_len);
                    if ((*uri)->username == NULL) return HTP_ERROR;
                }
            } else {
                // No credentials
                hostname_start = data + start;
                hostname_len = pos - start;
            }

            // Parsing authority without credentials.
            if ((hostname_len > 0) && (hostname_start[0] == '[')) {
                // IPv6 address.

                m = memchr(hostname_start, ']', hostname_len);
                if (m == NULL) {
                    // Invalid IPv6 address; use the entire string as hostname.
                    (*uri)->hostname = bstr_dup_mem(hostname_start, hostname_len);
                    if ((*uri)->hostname == NULL) return HTP_ERROR;
                } else {
                    // The hostname keeps both square brackets.
                    (*uri)->hostname = bstr_dup_mem(hostname_start, m - hostname_start + 1);
                    if ((*uri)->hostname == NULL) return HTP_ERROR;

                    // Is there a port?
                    hostname_len = hostname_len - (m - hostname_start + 1);
                    hostname_start = m + 1;

                    // Port string
                    m = memchr(hostname_start, ':', hostname_len);
                    if (m != NULL) {
                        size_t port_len = hostname_len - (m - hostname_start) - 1;
                        (*uri)->port = bstr_dup_mem(m + 1, port_len);
                        if ((*uri)->port == NULL) return HTP_ERROR;
                    }
                }
            } else {
                // Not IPv6 address.

                m = memchr(hostname_start, ':', hostname_len);
                if (m != NULL) {
                    size_t port_len = hostname_len - (m - hostname_start) - 1;
                    hostname_len = hostname_len - port_len - 1;

                    // Port string
                    (*uri)->port = bstr_dup_mem(m + 1, port_len);
                    if ((*uri)->port == NULL) return HTP_ERROR;
                }

                // Hostname
                (*uri)->hostname = bstr_dup_mem(hostname_start, hostname_len);
                if ((*uri)->hostname == NULL) return HTP_ERROR;
            }
        }

    // Path
    start = pos;

    // The path part will end with a question mark or a hash character, which
    // mark the beginning of the query part or the fragment part, respectively.
    while ((pos < len) && (data[pos] != '?') && (data[pos] != '#')) pos++;

    // Path
    (*uri)->path = bstr_dup_mem(data + start, pos - start);
    if ((*uri)->path == NULL) return HTP_ERROR;

    if (pos == len) return HTP_OK;

    // Query
    if (data[pos] == '?') {
        // Step over the question mark
        start = pos + 1;

        // The query part will end with the end of the input
        // or the beginning of the fragment part
        while ((pos < len) && (data[pos] != '#')) pos++;

        // Query string
        (*uri)->query = bstr_dup_mem(data + start, pos - start);
        if ((*uri)->query == NULL) return HTP_ERROR;

        if (pos == len) return HTP_OK;
    }

    // Fragment
    if (data[pos] == '#') {
        // Step over the hash character
        start = pos + 1;

        // Fragment; ends with the end of the input
        (*uri)->fragment = bstr_dup_mem(data + start, len - start);
        if ((*uri)->fragment == NULL) return HTP_ERROR;
    }

    return HTP_OK;
}

/**
 * Convert two input bytes, pointed to by the pointer parameter,
 * into a single byte by assuming the input consists of hexadecimal
 * characters. This function will happily convert invalid input.
 *
 * @param[in] what Pointer to the two bytes to convert; both are read
 *                 without any validation.
 * @return hex-decoded byte
 */
static unsigned char x2c(unsigned char *what) {
    register unsigned char digit;

    // For letters, clearing bit 0x20 folds lowercase onto uppercase.
    digit = (what[0] >= 'A' ? ((what[0] & 0xdf) - 'A') + 10 : (what[0] - '0'));
    digit *= 16;
    digit += (what[1] >= 'A' ? ((what[1] & 0xdf) - 'A') + 10 : (what[1] - '0'));

    return digit;
}

/**
 * Convert a Unicode codepoint into a single-byte, using best-fit
 * mapping (as specified in the provided configuration structure).
 *
 * The map is read as a sequence of three-byte entries: the first two bytes
 * hold the codepoint (high byte first) and the third the byte it maps to.
 * The scan stops at the first entry whose codepoint bytes are both zero.
 *
 * @param[in] cfg
 * @param[in] ctx Selects which decoder configuration (map and replacement
 *                byte) is used.
 * @param[in] codepoint
 * @return converted single byte; the configured replacement byte when the
 *         codepoint is above 0xffff or is not present in the map
 */
static uint8_t bestfit_codepoint(htp_cfg_t *cfg, enum htp_decoder_ctx_t ctx, uint32_t codepoint) {
    // Is it a single-byte codepoint?
    if (codepoint < 0x100) {
        return (uint8_t) codepoint;
    }

    // Our current implementation converts only the 2-byte codepoints.
    if (codepoint > 0xffff) {
        return cfg->decoder_cfgs[ctx].bestfit_replacement_byte;
    }

    uint8_t *p = cfg->decoder_cfgs[ctx].bestfit_map;

    // TODO Optimize lookup.

    for (;;) {
        uint32_t x = (p[0] << 8) + p[1];

        // A zero codepoint marks the end of the map.
        if (x == 0) {
            return cfg->decoder_cfgs[ctx].bestfit_replacement_byte;
        }

        if (x == codepoint) {
            return p[2];
        }

        // Move to the next triplet
        p += 3;
    }
}

/**
 * Decode a UTF-8 encoded path. Overlong characters will be decoded, invalid
 * characters will be left as-is. Best-fit mapping will be used to convert
 * UTF-8 into a single-byte stream.
 *
 * Decoding is done in place and the length of the path is adjusted at the
 * end. The HTP_PATH_UTF8_* and HTP_PATH_HALF_FULL_RANGE flags of the
 * transaction are updated along the way.
 *
 * @param[in] cfg
 * @param[in] tx
 * @param[in] path May be NULL, in which case nothing is done.
 */
void htp_utf8_decode_path_inplace(htp_cfg_t *cfg, htp_tx_t *tx, bstr *path) {
    if (path == NULL) return;

    uint8_t *data = bstr_ptr(path);
    if (data == NULL) return;

    size_t len = bstr_len(path);
    size_t rpos = 0;            // Read position.
    size_t wpos = 0;            // Write position.
    uint32_t codepoint = 0;
    uint32_t state = HTP_UTF8_ACCEPT;
    uint32_t counter = 0;       // Bytes consumed by the character being decoded.
    uint8_t seen_valid = 0;     // Set once a valid multi-byte character is decoded.

    while ((rpos < len)&&(wpos < len)) {
        counter++;

        switch (htp_utf8_decode_allow_overlong(&state, &codepoint, data[rpos])) {
            case HTP_UTF8_ACCEPT:
                if (counter == 1) {
                    // ASCII character, which we just copy.
                    data[wpos++] = (uint8_t) codepoint;
                } else {
                    // A valid UTF-8 character, which we need to convert.

                    seen_valid = 1;

                    // Check for overlong characters and set the flag accordingly.
                    switch (counter) {
                        case 2:
                            if (codepoint < 0x80) {
                                tx->flags |= HTP_PATH_UTF8_OVERLONG;
                            }
                            break;
                        case 3:
                            if (codepoint < 0x800) {
                                tx->flags |= HTP_PATH_UTF8_OVERLONG;
                            }
                            break;
                        case 4:
                            if (codepoint < 0x10000) {
                                tx->flags |= HTP_PATH_UTF8_OVERLONG;
                            }
                            break;
                    }

                    // Special flag for half-width/full-width evasion.
                    if ((codepoint >= 0xff00) && (codepoint <= 0xffef)) {
                        tx->flags |= HTP_PATH_HALF_FULL_RANGE;
                    }

                    // Use best-fit mapping to convert to a single byte.
                    data[wpos++] = bestfit_codepoint(cfg, HTP_DECODER_URL_PATH, codepoint);
                }

                // Advance over the consumed byte and reset the byte counter.
                rpos++;
                counter = 0;

                break;

            case HTP_UTF8_REJECT:
                // Invalid UTF-8 character.

                tx->flags |= HTP_PATH_UTF8_INVALID;

                // Is the server expected to respond with 400?
                if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].utf8_invalid_unwanted != HTP_UNWANTED_IGNORE) {
                    tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].utf8_invalid_unwanted;
                }

                // Output the replacement byte, replacing one or more invalid bytes.
                data[wpos++] = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].bestfit_replacement_byte;

                // If the invalid byte was first in a sequence, consume it. Otherwise,
                // assume it's the starting byte of the next character.
                if (counter == 1) {
                    rpos++;
                }

                // Reset the decoder state and continue decoding.
                state = HTP_UTF8_ACCEPT;
                codepoint = 0;
                counter = 0;

                break;

            default:
                // Keep going; the character is not yet formed.
                rpos++;
                break;
        }
    }

    // Did the input stream seem like a valid UTF-8 string?
    if ((seen_valid) && (!(tx->flags & HTP_PATH_UTF8_INVALID))) {
        tx->flags |= HTP_PATH_UTF8_VALID;
    }

    // Adjust the length of the string, because
    // we're doing in-place decoding.
    bstr_adjust_len(path, wpos);
}

/**
 * Validate a path that is quite possibly UTF-8 encoded.
 *
 * The path itself is not modified; only the HTP_PATH_UTF8_* and
 * HTP_PATH_HALF_FULL_RANGE flags of the transaction are updated.
 *
 * @param[in] tx
 * @param[in] path
 */
void htp_utf8_validate_path(htp_tx_t *tx, bstr *path) {
    unsigned char *data = bstr_ptr(path);
    size_t len = bstr_len(path);
    size_t rpos = 0;
    uint32_t codepoint = 0;
    uint32_t state = HTP_UTF8_ACCEPT;
    uint32_t counter = 0; // How many bytes used by a UTF-8 character.
    uint8_t seen_valid = 0;

    while (rpos < len) {
        counter++;

        switch (htp_utf8_decode_allow_overlong(&state, &codepoint, data[rpos])) {
            case HTP_UTF8_ACCEPT:
                // We have a valid character.

                if (counter > 1) {
                    // A valid UTF-8 character, consisting of 2 or more bytes.

                    seen_valid = 1;

                    // Check for overlong characters and set the flag accordingly.
                    switch (counter) {
                        case 2:
                            if (codepoint < 0x80) {
                                tx->flags |= HTP_PATH_UTF8_OVERLONG;
                            }
                            break;
                        case 3:
                            if (codepoint < 0x800) {
                                tx->flags |= HTP_PATH_UTF8_OVERLONG;
                            }
                            break;
                        case 4:
                            if (codepoint < 0x10000) {
                                tx->flags |= HTP_PATH_UTF8_OVERLONG;
                            }
                            break;
                    }
                }

                // Special flag for half-width/full-width evasion.
                // NOTE(review): this range (0xff00..0xffff) is wider than the one
                // tested in htp_utf8_decode_path_inplace (0xff00..0xffef) -- confirm
                // which of the two is intended.
                if ((codepoint > 0xfeff) && (codepoint < 0x010000)) {
                    tx->flags |= HTP_PATH_HALF_FULL_RANGE;
                }

                // Advance over the consumed byte and reset the byte counter.
                rpos++;
                counter = 0;

                break;

            case HTP_UTF8_REJECT:
                // Invalid UTF-8 character.

                tx->flags |= HTP_PATH_UTF8_INVALID;

                // Override the decoder state because we want to continue decoding.
                state = HTP_UTF8_ACCEPT;

                // Advance over the consumed byte and reset the byte counter.
                rpos++;
                counter = 0;

                break;

            default:
                // Keep going; the character is not yet formed.
                rpos++;
                break;
        }
    }

    // Did the input stream seem like a valid UTF-8 string?
    if ((seen_valid) && (!(tx->flags & HTP_PATH_UTF8_INVALID))) {
        tx->flags |= HTP_PATH_UTF8_VALID;
    }
}

/**
 * Decode a %u-encoded character, using best-fit mapping as necessary. Path version.
 */
+ * + * @param[in] cfg + * @param[in] tx + * @param[in] data + * @return decoded byte + */ +static uint8_t decode_u_encoding_path(htp_cfg_t *cfg, htp_tx_t *tx, unsigned char *data) { + uint8_t c1 = x2c(data); + uint8_t c2 = x2c(data + 2); + uint8_t r = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].bestfit_replacement_byte; + + if (c1 == 0x00) { + r = c2; + tx->flags |= HTP_PATH_OVERLONG_U; + } else { + // Check for fullwidth form evasion + if (c1 == 0xff) { + tx->flags |= HTP_PATH_HALF_FULL_RANGE; + } + + if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].u_encoding_unwanted != HTP_UNWANTED_IGNORE) { + tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].u_encoding_unwanted; + } + + // Use best-fit mapping + unsigned char *p = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].bestfit_map; + + // TODO Optimize lookup. + + for (;;) { + // Have we reached the end of the map? + if ((p[0] == 0) && (p[1] == 0)) { + break; + } + + // Have we found the mapping we're looking for? + if ((p[0] == c1) && (p[1] == c2)) { + r = p[2]; + break; + } + + // Move to the next triplet + p += 3; + } + } + + // Check for encoded path separators + if ((r == '/') || ((cfg->decoder_cfgs[HTP_DECODER_URL_PATH].backslash_convert_slashes) && (r == '\\'))) { + tx->flags |= HTP_PATH_ENCODED_SEPARATOR; + } + + return r; +} + +/** + * Decode a %u-encoded character, using best-fit mapping as necessary. Params version. + * + * @param[in] cfg + * @param[in] tx + * @param[in] data + * @return decoded byte + */ +static uint8_t decode_u_encoding_params(htp_cfg_t *cfg, enum htp_decoder_ctx_t ctx, unsigned char *data, uint64_t *flags) { + uint8_t c1 = x2c(data); + uint8_t c2 = x2c(data + 2); + + // Check for overlong usage first. + if (c1 == 0) { + (*flags) |= HTP_URLEN_OVERLONG_U; + return c2; + } + + // Both bytes were used. + + // Detect half-width and full-width range. + if ((c1 == 0xff) && (c2 <= 0xef)) { + (*flags) |= HTP_URLEN_HALF_FULL_RANGE; + } + + // Use best-fit mapping. 
+ unsigned char *p = cfg->decoder_cfgs[ctx].bestfit_map; + uint8_t r = cfg->decoder_cfgs[ctx].bestfit_replacement_byte; + + // TODO Optimize lookup. + + for (;;) { + // Have we reached the end of the map? + if ((p[0] == 0) && (p[1] == 0)) { + break; + } + + // Have we found the mapping we're looking for? + if ((p[0] == c1) && (p[1] == c2)) { + r = p[2]; + break; + } + + // Move to the next triplet + p += 3; + } + + return r; +} + +/** + * Decode a request path according to the settings in the + * provided configuration structure. + * + * @param[in] cfg + * @param[in] tx + * @param[in] path + */ +htp_status_t htp_decode_path_inplace(htp_tx_t *tx, bstr *path) { + if (path == NULL) return HTP_ERROR; + unsigned char *data = bstr_ptr(path); + if (data == NULL) return HTP_ERROR; + + size_t len = bstr_len(path); + + htp_cfg_t *cfg = tx->cfg; + + size_t rpos = 0; + size_t wpos = 0; + int previous_was_separator = 0; + + while ((rpos < len) && (wpos < len)) { + uint8_t c = data[rpos]; + + // Decode encoded characters + if (c == '%') { + if (rpos + 2 < len) { + int handled = 0; + + if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].u_encoding_decode) { + // Check for the %u encoding + if ((data[rpos + 1] == 'u') || (data[rpos + 1] == 'U')) { + handled = 1; + + if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].u_encoding_unwanted != HTP_UNWANTED_IGNORE) { + tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].u_encoding_unwanted; + } + + if (rpos + 5 < len) { + if (isxdigit(data[rpos + 2]) && (isxdigit(data[rpos + 3])) + && isxdigit(data[rpos + 4]) && (isxdigit(data[rpos + 5]))) { + // Decode a valid %u encoding + c = decode_u_encoding_path(cfg, tx, &data[rpos + 2]); + rpos += 6; + + if (c == 0) { + tx->flags |= HTP_PATH_ENCODED_NUL; + + if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_encoded_unwanted != HTP_UNWANTED_IGNORE) { + tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_encoded_unwanted; + } + } + } else { + // Invalid %u 
encoding + tx->flags |= HTP_PATH_INVALID_ENCODING; + + if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) { + tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted; + } + + switch (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_handling) { + case HTP_URL_DECODE_REMOVE_PERCENT: + // Do not place anything in output; eat + // the percent character + rpos++; + continue; + break; + case HTP_URL_DECODE_PRESERVE_PERCENT: + // Leave the percent character in output + rpos++; + break; + case HTP_URL_DECODE_PROCESS_INVALID: + // Decode invalid %u encoding + c = decode_u_encoding_path(cfg, tx, &data[rpos + 2]); + rpos += 6; + break; + } + } + } else { + // Invalid %u encoding (not enough data) + tx->flags |= HTP_PATH_INVALID_ENCODING; + + if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) { + tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted; + } + + switch (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_handling) { + case HTP_URL_DECODE_REMOVE_PERCENT: + // Do not place anything in output; eat + // the percent character + rpos++; + continue; + break; + case HTP_URL_DECODE_PRESERVE_PERCENT: + // Leave the percent character in output + rpos++; + break; + case HTP_URL_DECODE_PROCESS_INVALID: + // Cannot decode, because there's not enough data. + // Leave the percent character in output + rpos++; + // TODO Configurable handling. 
+ break; + } + } + } + } + + // Handle standard URL encoding + if (!handled) { + if ((isxdigit(data[rpos + 1])) && (isxdigit(data[rpos + 2]))) { + c = x2c(&data[rpos + 1]); + + if (c == 0) { + tx->flags |= HTP_PATH_ENCODED_NUL; + + if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_encoded_unwanted != HTP_UNWANTED_IGNORE) { + tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_encoded_unwanted; + } + + if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_encoded_terminates) { + bstr_adjust_len(path, wpos); + return HTP_OK; + } + } + + if ((c == '/') || ((cfg->decoder_cfgs[HTP_DECODER_URL_PATH].backslash_convert_slashes) && (c == '\\'))) { + tx->flags |= HTP_PATH_ENCODED_SEPARATOR; + + if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].path_separators_encoded_unwanted != HTP_UNWANTED_IGNORE) { + tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].path_separators_encoded_unwanted; + } + + if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].path_separators_decode) { + // Decode + rpos += 3; + } else { + // Leave encoded + c = '%'; + rpos++; + } + } else { + // Decode + rpos += 3; + } + } else { + // Invalid encoding + tx->flags |= HTP_PATH_INVALID_ENCODING; + + if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) { + tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted; + } + + switch (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_handling) { + case HTP_URL_DECODE_REMOVE_PERCENT: + // Do not place anything in output; eat + // the percent character + rpos++; + continue; + break; + case HTP_URL_DECODE_PRESERVE_PERCENT: + // Leave the percent character in output + rpos++; + break; + case HTP_URL_DECODE_PROCESS_INVALID: + // Decode + c = x2c(&data[rpos + 1]); + rpos += 3; + // Note: What if an invalid encoding decodes into a path + // separator? 
This is theoretical at the moment, because + // the only platform we know doesn't convert separators is + // Apache, who will also respond with 400 if invalid encoding + // is encountered. Thus no check for a separator here. + break; + default: + // Unknown setting + return HTP_ERROR; + break; + } + } + } + } else { + // Invalid URL encoding (not enough data) + tx->flags |= HTP_PATH_INVALID_ENCODING; + + if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) { + tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted; + } + + switch (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_handling) { + case HTP_URL_DECODE_REMOVE_PERCENT: + // Do not place anything in output; eat + // the percent character + rpos++; + continue; + break; + case HTP_URL_DECODE_PRESERVE_PERCENT: + // Leave the percent character in output + rpos++; + break; + case HTP_URL_DECODE_PROCESS_INVALID: + // Cannot decode, because there's not enough data. + // Leave the percent character in output. + // TODO Configurable handling. + rpos++; + break; + } + } + } else { + // One non-encoded character + + // Is it a NUL byte? 
+ if (c == 0) { + if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_raw_unwanted != HTP_UNWANTED_IGNORE) { + tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_raw_unwanted; + } + + if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_raw_terminates) { + // Terminate path with a raw NUL byte + bstr_adjust_len(path, wpos); + return HTP_OK; + break; + } + } + + rpos++; + } + + // Place the character into output + + // Check for control characters + if (c < 0x20) { + if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].control_chars_unwanted != HTP_UNWANTED_IGNORE) { + tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].control_chars_unwanted; + } + } + + // Convert backslashes to forward slashes, if necessary + if ((c == '\\') && (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].backslash_convert_slashes)) { + c = '/'; + } + + // Lowercase characters, if necessary + if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].convert_lowercase) { + c = (uint8_t) tolower(c); + } + + // If we're compressing separators then we need + // to track if the previous character was a separator + if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].path_separators_compress) { + if (c == '/') { + if (!previous_was_separator) { + data[wpos++] = c; + previous_was_separator = 1; + } else { + // Do nothing; we don't want + // another separator in output + } + } else { + data[wpos++] = c; + previous_was_separator = 0; + } + } else { + data[wpos++] = c; + } + } + + bstr_adjust_len(path, wpos); + + return HTP_OK; +} + +htp_status_t htp_tx_urldecode_uri_inplace(htp_tx_t *tx, bstr *input) { + uint64_t flags = 0; + + htp_status_t rc = htp_urldecode_inplace_ex(tx->cfg, HTP_DECODER_URL_PATH, input, &flags, &(tx->response_status_expected_number)); + + if (flags & HTP_URLEN_INVALID_ENCODING) { + tx->flags |= HTP_PATH_INVALID_ENCODING; + } + + if (flags & HTP_URLEN_ENCODED_NUL) { + tx->flags |= HTP_PATH_ENCODED_NUL; + } + + if (flags & HTP_URLEN_RAW_NUL) { + tx->flags |= 
HTP_PATH_RAW_NUL; + } + + return rc; +} + +htp_status_t htp_tx_urldecode_params_inplace(htp_tx_t *tx, bstr *input) { + return htp_urldecode_inplace_ex(tx->cfg, HTP_DECODER_URLENCODED, input, &(tx->flags), &(tx->response_status_expected_number)); +} + +htp_status_t htp_urldecode_inplace(htp_cfg_t *cfg, enum htp_decoder_ctx_t ctx, bstr *input, uint64_t *flags) { + int expected_status_code = 0; + return htp_urldecode_inplace_ex(cfg, ctx, input, flags, &expected_status_code); +} + +htp_status_t htp_urldecode_inplace_ex(htp_cfg_t *cfg, enum htp_decoder_ctx_t ctx, bstr *input, uint64_t *flags, int *expected_status_code) { + if (input == NULL) return HTP_ERROR; + + unsigned char *data = bstr_ptr(input); + if (data == NULL) return HTP_ERROR; + size_t len = bstr_len(input); + + size_t rpos = 0; + size_t wpos = 0; + + while ((rpos < len) && (wpos < len)) { + uint8_t c = data[rpos]; + + // Decode encoded characters. + if (c == '%') { + // Need at least 2 additional bytes for %HH. + if (rpos + 2 < len) { + int handled = 0; + + // Decode %uHHHH encoding, but only if allowed in configuration. + if (cfg->decoder_cfgs[ctx].u_encoding_decode) { + // The next character must be a case-insensitive u. + if ((data[rpos + 1] == 'u') || (data[rpos + 1] == 'U')) { + handled = 1; + + if (cfg->decoder_cfgs[ctx].u_encoding_unwanted != HTP_UNWANTED_IGNORE) { + (*expected_status_code) = cfg->decoder_cfgs[ctx].u_encoding_unwanted; + } + + // Need at least 5 additional bytes for %uHHHH. + if (rpos + 5 < len) { + if (isxdigit(data[rpos + 2]) && (isxdigit(data[rpos + 3])) + && isxdigit(data[rpos + 4]) && (isxdigit(data[rpos + 5]))) { + // Decode a valid %u encoding. + c = decode_u_encoding_params(cfg, ctx, &(data[rpos + 2]), flags); + rpos += 6; + } else { + // Invalid %u encoding (could not find 4 xdigits). 
+ (*flags) |= HTP_URLEN_INVALID_ENCODING; + + if (cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) { + (*expected_status_code) = cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted; + } + + switch (cfg->decoder_cfgs[ctx].url_encoding_invalid_handling) { + case HTP_URL_DECODE_REMOVE_PERCENT: + // Do not place anything in output; consume the %. + rpos++; + continue; + break; + case HTP_URL_DECODE_PRESERVE_PERCENT: + // Leave the % in output. + rpos++; + break; + case HTP_URL_DECODE_PROCESS_INVALID: + // Decode invalid %u encoding. + c = decode_u_encoding_params(cfg, ctx, &(data[rpos + 2]), flags); + rpos += 6; + break; + } + } + } else { + // Invalid %u encoding; not enough data. + (*flags) |= HTP_URLEN_INVALID_ENCODING; + + if (cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) { + (*expected_status_code) = cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted; + } + + switch (cfg->decoder_cfgs[ctx].url_encoding_invalid_handling) { + case HTP_URL_DECODE_REMOVE_PERCENT: + // Do not place anything in output; consume the %. + rpos++; + continue; + break; + case HTP_URL_DECODE_PRESERVE_PERCENT: + // Leave the % in output. + rpos++; + break; + case HTP_URL_DECODE_PROCESS_INVALID: + // Cannot decode because there's not enough data. + // Leave the % in output. + // TODO Configurable handling of %, u, etc. + rpos++; + break; + } + } + } + } + + // Handle standard URL encoding. + if (!handled) { + // Need 2 hexadecimal digits. + if ((isxdigit(data[rpos + 1])) && (isxdigit(data[rpos + 2]))) { + // Decode %HH encoding. + c = x2c(&(data[rpos + 1])); + rpos += 3; + } else { + // Invalid encoding (enough bytes, but not hexadecimal digits). 
+ (*flags) |= HTP_URLEN_INVALID_ENCODING; + + if (cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) { + (*expected_status_code) = cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted; + } + + switch (cfg->decoder_cfgs[ctx].url_encoding_invalid_handling) { + case HTP_URL_DECODE_REMOVE_PERCENT: + // Do not place anything in output; consume the %. + rpos++; + continue; + break; + case HTP_URL_DECODE_PRESERVE_PERCENT: + // Leave the % in output. + rpos++; + break; + case HTP_URL_DECODE_PROCESS_INVALID: + // Decode. + c = x2c(&(data[rpos + 1])); + rpos += 3; + break; + } + } + } + } else { + // Invalid encoding; not enough data (at least 2 bytes required). + (*flags) |= HTP_URLEN_INVALID_ENCODING; + + if (cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) { + (*expected_status_code) = cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted; + } + + switch (cfg->decoder_cfgs[ctx].url_encoding_invalid_handling) { + case HTP_URL_DECODE_REMOVE_PERCENT: + // Do not place anything in output; consume the %. + rpos++; + continue; + break; + case HTP_URL_DECODE_PRESERVE_PERCENT: + // Leave the % in output. + rpos++; + break; + case HTP_URL_DECODE_PROCESS_INVALID: + // Cannot decode because there's not enough data. + // Leave the % in output. + // TODO Configurable handling of %, etc. + rpos++; + break; + } + } + + // Did we get an encoded NUL byte? + if (c == 0) { + if (cfg->decoder_cfgs[ctx].nul_encoded_unwanted != HTP_UNWANTED_IGNORE) { + (*expected_status_code) = cfg->decoder_cfgs[ctx].nul_encoded_unwanted; + } + + (*flags) |= HTP_URLEN_ENCODED_NUL; + + if (cfg->decoder_cfgs[ctx].nul_encoded_terminates) { + // Terminate the path at the raw NUL byte. + bstr_adjust_len(input, wpos); + return 1; + } + } + + data[wpos++] = c; + } else if (c == '+') { + // Decoding of the plus character is conditional on the configuration. 
+ + if (cfg->decoder_cfgs[ctx].plusspace_decode) { + c = 0x20; + } + + rpos++; + data[wpos++] = c; + } else { + // One non-encoded byte. + + // Did we get a raw NUL byte? + if (c == 0) { + if (cfg->decoder_cfgs[ctx].nul_raw_unwanted != HTP_UNWANTED_IGNORE) { + (*expected_status_code) = cfg->decoder_cfgs[ctx].nul_raw_unwanted; + } + + (*flags) |= HTP_URLEN_RAW_NUL; + + if (cfg->decoder_cfgs[ctx].nul_raw_terminates) { + // Terminate the path at the encoded NUL byte. + bstr_adjust_len(input, wpos); + return HTP_OK; + } + } + + rpos++; + data[wpos++] = c; + } + } + + bstr_adjust_len(input, wpos); + + return HTP_OK; +} + +/** + * Normalize a previously-parsed request URI. + * + * @param[in] connp + * @param[in] incomplete + * @param[in] normalized + * @return HTP_OK or HTP_ERROR + */ +int htp_normalize_parsed_uri(htp_tx_t *tx, htp_uri_t *incomplete, htp_uri_t *normalized) { + // Scheme. + if (incomplete->scheme != NULL) { + // Duplicate and convert to lowercase. + normalized->scheme = bstr_dup_lower(incomplete->scheme); + if (normalized->scheme == NULL) return HTP_ERROR; + } + + // Username. + if (incomplete->username != NULL) { + normalized->username = bstr_dup(incomplete->username); + if (normalized->username == NULL) return HTP_ERROR; + htp_tx_urldecode_uri_inplace(tx, normalized->username); + } + + // Password. + if (incomplete->password != NULL) { + normalized->password = bstr_dup(incomplete->password); + if (normalized->password == NULL) return HTP_ERROR; + htp_tx_urldecode_uri_inplace(tx, normalized->password); + } + + // Hostname. + if (incomplete->hostname != NULL) { + // We know that incomplete->hostname does not contain + // port information, so no need to check for it here. + normalized->hostname = bstr_dup(incomplete->hostname); + if (normalized->hostname == NULL) return HTP_ERROR; + htp_tx_urldecode_uri_inplace(tx, normalized->hostname); + htp_normalize_hostname_inplace(normalized->hostname); + } + + // Port. 
+ if (incomplete->port != NULL) { + int64_t port_parsed = htp_parse_positive_integer_whitespace( + bstr_ptr(incomplete->port), bstr_len(incomplete->port), 10); + + if (port_parsed < 0) { + // Failed to parse the port number. + normalized->port_number = -1; + tx->flags |= HTP_HOSTU_INVALID; + } else if ((port_parsed > 0) && (port_parsed < 65536)) { + // Valid port number. + normalized->port_number = (int) port_parsed; + } else { + // Port number out of range. + normalized->port_number = -1; + tx->flags |= HTP_HOSTU_INVALID; + } + } else { + normalized->port_number = -1; + } + + // Path. + if (incomplete->path != NULL) { + // Make a copy of the path, so that we can work on it. + normalized->path = bstr_dup(incomplete->path); + if (normalized->path == NULL) return HTP_ERROR; + + // Decode URL-encoded (and %u-encoded) characters, as well as lowercase, + // compress separators and convert backslashes. + htp_decode_path_inplace(tx, normalized->path); + + // Handle UTF-8 in the path. + if (tx->cfg->decoder_cfgs[HTP_DECODER_URL_PATH].utf8_convert_bestfit) { + // Decode Unicode characters into a single-byte stream, using best-fit mapping. + htp_utf8_decode_path_inplace(tx->cfg, tx, normalized->path); + } else { + // No decoding, but try to validate the path as a UTF-8 stream. + htp_utf8_validate_path(tx, normalized->path); + } + + // RFC normalization. + htp_normalize_uri_path_inplace(normalized->path); + } + + // Query string. + if (incomplete->query != NULL) { + normalized->query = bstr_dup(incomplete->query); + if (normalized->query == NULL) return HTP_ERROR; + } + + // Fragment. + if (incomplete->fragment != NULL) { + normalized->fragment = bstr_dup(incomplete->fragment); + if (normalized->fragment == NULL) return HTP_ERROR; + htp_tx_urldecode_uri_inplace(tx, normalized->fragment); + } + + return HTP_OK; +} + +/** + * Normalize request hostname. Convert all characters to lowercase and + * remove trailing dots from the end, if present. 
+ * + * @param[in] hostname + * @return Normalized hostname. + */ +bstr *htp_normalize_hostname_inplace(bstr *hostname) { + if (hostname == NULL) return NULL; + + bstr_to_lowercase(hostname); + + // Remove dots from the end of the string. + while (bstr_char_at_end(hostname, 0) == '.') bstr_chop(hostname); + + return hostname; +} + +/** + * Normalize URL path. This function implements the remove dot segments algorithm + * specified in RFC 3986, section 5.2.4. + * + * @param[in] s + */ +void htp_normalize_uri_path_inplace(bstr *s) { + if (s == NULL) return; + + unsigned char *data = bstr_ptr(s); + if (data == NULL) return; + size_t len = bstr_len(s); + + size_t rpos = 0; + size_t wpos = 0; + + int c = -1; + while ((rpos < len)&&(wpos < len)) { + if (c == -1) { + c = data[rpos++]; + } + + // A. If the input buffer begins with a prefix of "../" or "./", + // then remove that prefix from the input buffer; otherwise, + if (c == '.') { + if ((rpos + 1 < len) && (data[rpos] == '.') && (data[rpos + 1] == '/')) { + c = -1; + rpos += 2; + continue; + } else if ((rpos < len) && (data[rpos] == '/')) { + c = -1; + rpos += 1; + continue; + } + } + + if (c == '/') { + // B. if the input buffer begins with a prefix of "/./" or "/.", + // where "." is a complete path segment, then replace that + // prefix with "/" in the input buffer; otherwise, + if ((rpos + 1 < len) && (data[rpos] == '.') && (data[rpos + 1] == '/')) { + c = '/'; + rpos += 2; + continue; + } else if ((rpos + 1 == len) && (data[rpos] == '.')) { + c = '/'; + rpos += 1; + continue; + } + + // C. if the input buffer begins with a prefix of "/../" or "/..", + // where ".." 
is a complete path segment, then replace that + // prefix with "/" in the input buffer and remove the last + // segment and its preceding "/" (if any) from the output + // buffer; otherwise, + if ((rpos + 2 < len) && (data[rpos] == '.') && (data[rpos + 1] == '.') && (data[rpos + 2] == '/')) { + c = '/'; + rpos += 3; + + // Remove the last segment + while ((wpos > 0) && (data[wpos - 1] != '/')) wpos--; + if (wpos > 0) wpos--; + continue; + } else if ((rpos + 2 == len) && (data[rpos] == '.') && (data[rpos + 1] == '.')) { + c = '/'; + rpos += 2; + + // Remove the last segment + while ((wpos > 0) && (data[wpos - 1] != '/')) wpos--; + if (wpos > 0) wpos--; + continue; + } + } + + // D. if the input buffer consists only of "." or "..", then remove + // that from the input buffer; otherwise, + if ((c == '.') && (rpos == len)) { + rpos++; + continue; + } + + if ((c == '.') && (rpos + 1 == len) && (data[rpos] == '.')) { + rpos += 2; + continue; + } + + // E. move the first path segment in the input buffer to the end of + // the output buffer, including the initial "/" character (if + // any) and any subsequent characters up to, but not including, + // the next "/" character or the end of the input buffer. 
+ data[wpos++] = (uint8_t) c; + + while ((rpos < len) && (data[rpos] != '/') && (wpos < len)) { + data[wpos++] = data[rpos++]; + } + + c = -1; + } + + bstr_adjust_len(s, wpos); +} + +/** + * + */ +void fprint_bstr(FILE *stream, const char *name, bstr *b) { + if (b == NULL) { + fprint_raw_data_ex(stream, name, "(null)", 0, 6); + return; + } + + fprint_raw_data_ex(stream, name, bstr_ptr(b), 0, bstr_len(b)); +} + +/** + * + */ +void fprint_raw_data(FILE *stream, const char *name, const void *data, size_t len) { + // may happen for gaps + if (data == NULL) { + fprintf(stream, "\n%s: ptr NULL len %u\n", name, (unsigned int)len); + } else { + fprint_raw_data_ex(stream, name, data, 0, len); + } +} + +/** + * + */ +void fprint_raw_data_ex(FILE *stream, const char *name, const void *_data, size_t offset, size_t printlen) { + const unsigned char *data = (const unsigned char *) _data; + char buf[160]; + size_t len = offset + printlen; + + fprintf(stream, "\n%s: ptr %p offset %u len %u\n", name, (void*) data, (unsigned int)offset, (unsigned int)len); + + while (offset < len) { + size_t i; + + snprintf(buf, sizeof(buf), "%x" PRIx64, (unsigned int) offset); + strlcat(buf, " ", sizeof(buf)); + + i = 0; + while (i < 8) { + if (offset + i < len) { + char step[4]; + snprintf(step, sizeof(step), "%02x ", data[offset + i]); + strlcat(buf, step, sizeof(buf)); + } else { + strlcat(buf, " ", sizeof(buf)); + } + + i++; + } + + strlcat(buf, " ", sizeof(buf)); + + i = 8; + while (i < 16) { + if (offset + i < len) { + char step[4]; + snprintf(step, sizeof(step), "%02x ", data[offset + i]); + strlcat(buf, step, sizeof(buf)); + } else { + strlcat(buf, " ", sizeof(buf)); + } + + i++; + } + + strlcat(buf, " |", sizeof(buf)); + + i = 0; + char *p = buf + strlen(buf); + while ((offset + i < len) && (i < 16)) { + uint8_t c = data[offset + i]; + + if (isprint(c)) { + *p++ = c; + } else { + *p++ = '.'; + } + + i++; + } + + *p++ = '|'; + *p++ = '\n'; + *p = '\0'; + + fprintf(stream, "%s", buf); + 
offset += 16; + } + + fprintf(stream, "\n"); +} + +/** + * + */ +char *htp_connp_in_state_as_string(htp_connp_t *connp) { + if (connp == NULL) return "NULL"; + + if (connp->in_state == htp_connp_REQ_IDLE) return "REQ_IDLE"; + if (connp->in_state == htp_connp_REQ_LINE) return "REQ_LINE"; + if (connp->in_state == htp_connp_REQ_PROTOCOL) return "REQ_PROTOCOL"; + if (connp->in_state == htp_connp_REQ_HEADERS) return "REQ_HEADERS"; + if (connp->in_state == htp_connp_REQ_CONNECT_CHECK) return "REQ_CONNECT_CHECK"; + if (connp->in_state == htp_connp_REQ_CONNECT_WAIT_RESPONSE) return "REQ_CONNECT_WAIT_RESPONSE"; + if (connp->in_state == htp_connp_REQ_BODY_DETERMINE) return "REQ_BODY_DETERMINE"; + if (connp->in_state == htp_connp_REQ_BODY_IDENTITY) return "REQ_BODY_IDENTITY"; + if (connp->in_state == htp_connp_REQ_BODY_CHUNKED_LENGTH) return "REQ_BODY_CHUNKED_LENGTH"; + if (connp->in_state == htp_connp_REQ_BODY_CHUNKED_DATA) return "REQ_BODY_CHUNKED_DATA"; + if (connp->in_state == htp_connp_REQ_BODY_CHUNKED_DATA_END) return "REQ_BODY_CHUNKED_DATA_END"; + if (connp->in_state == htp_connp_REQ_FINALIZE) return "REQ_FINALIZE"; + if (connp->in_state == htp_connp_REQ_IGNORE_DATA_AFTER_HTTP_0_9) return "REQ_IGNORE_DATA_AFTER_HTTP_0_9"; + + return "UNKNOWN"; +} + +/** + * + */ +char *htp_connp_out_state_as_string(htp_connp_t *connp) { + if (connp == NULL) return "NULL"; + + if (connp->out_state == htp_connp_RES_IDLE) return "RES_IDLE"; + if (connp->out_state == htp_connp_RES_LINE) return "RES_LINE"; + if (connp->out_state == htp_connp_RES_HEADERS) return "RES_HEADERS"; + if (connp->out_state == htp_connp_RES_BODY_DETERMINE) return "RES_BODY_DETERMINE"; + if (connp->out_state == htp_connp_RES_BODY_IDENTITY_CL_KNOWN) return "RES_BODY_IDENTITY_CL_KNOWN"; + if (connp->out_state == htp_connp_RES_BODY_IDENTITY_STREAM_CLOSE) return "RES_BODY_IDENTITY_STREAM_CLOSE"; + if (connp->out_state == htp_connp_RES_BODY_CHUNKED_LENGTH) return "RES_BODY_CHUNKED_LENGTH"; + if (connp->out_state == 
htp_connp_RES_BODY_CHUNKED_DATA) return "RES_BODY_CHUNKED_DATA"; + if (connp->out_state == htp_connp_RES_BODY_CHUNKED_DATA_END) return "RES_BODY_CHUNKED_DATA_END"; + if (connp->out_state == htp_connp_RES_FINALIZE) return "RES_BODY_FINALIZE"; + + return "UNKNOWN"; +} + +/** + * + */ +char *htp_tx_request_progress_as_string(htp_tx_t *tx) { + if (tx == NULL) return "NULL"; + + switch (tx->request_progress) { + case HTP_REQUEST_NOT_STARTED: + return "NOT_STARTED"; + case HTP_REQUEST_LINE: + return "REQ_LINE"; + case HTP_REQUEST_HEADERS: + return "REQ_HEADERS"; + case HTP_REQUEST_BODY: + return "REQ_BODY"; + case HTP_REQUEST_TRAILER: + return "REQ_TRAILER"; + case HTP_REQUEST_COMPLETE: + return "COMPLETE"; + } + + return "INVALID"; +} + +/** + * + */ +char *htp_tx_response_progress_as_string(htp_tx_t *tx) { + if (tx == NULL) return "NULL"; + + switch (tx->response_progress) { + case HTP_RESPONSE_NOT_STARTED: + return "NOT_STARTED"; + case HTP_RESPONSE_LINE: + return "RES_LINE"; + case HTP_RESPONSE_HEADERS: + return "RES_HEADERS"; + case HTP_RESPONSE_BODY: + return "RES_BODY"; + case HTP_RESPONSE_TRAILER: + return "RES_TRAILER"; + case HTP_RESPONSE_COMPLETE: + return "COMPLETE"; + } + + return "INVALID"; +} + +bstr *htp_unparse_uri_noencode(htp_uri_t *uri) { + if (uri == NULL) return NULL; + + // On the first pass determine the length of the final string + size_t len = 0; + + if (uri->scheme != NULL) { + len += bstr_len(uri->scheme); + len += 3; // "://" + } + + if ((uri->username != NULL) || (uri->password != NULL)) { + if (uri->username != NULL) { + len += bstr_len(uri->username); + } + + len += 1; // ":" + + if (uri->password != NULL) { + len += bstr_len(uri->password); + } + + len += 1; // "@" + } + + if (uri->hostname != NULL) { + len += bstr_len(uri->hostname); + } + + if (uri->port != NULL) { + len += 1; // ":" + len += bstr_len(uri->port); + } + + if (uri->path != NULL) { + len += bstr_len(uri->path); + } + + if (uri->query != NULL) { + len += 1; // "?" 
+ len += bstr_len(uri->query); + } + + if (uri->fragment != NULL) { + len += 1; // "#" + len += bstr_len(uri->fragment); + } + + // On the second pass construct the string + bstr *r = bstr_alloc(len); + if (r == NULL) return NULL; + + if (uri->scheme != NULL) { + bstr_add_noex(r, uri->scheme); + bstr_add_c_noex(r, "://"); + } + + if ((uri->username != NULL) || (uri->password != NULL)) { + if (uri->username != NULL) { + bstr_add_noex(r, uri->username); + } + + bstr_add_c_noex(r, ":"); + + if (uri->password != NULL) { + bstr_add_noex(r, uri->password); + } + + bstr_add_c_noex(r, "@"); + } + + if (uri->hostname != NULL) { + bstr_add_noex(r, uri->hostname); + } + + if (uri->port != NULL) { + bstr_add_c_noex(r, ":"); + bstr_add_noex(r, uri->port); + } + + if (uri->path != NULL) { + bstr_add_noex(r, uri->path); + } + + if (uri->query != NULL) { + bstr_add_c_noex(r, "?"); + bstr_add_noex(r, uri->query); + } + + if (uri->fragment != NULL) { + bstr_add_c_noex(r, "#"); + bstr_add_noex(r, uri->fragment); + } + + return r; +} + +/** + * Determine if the information provided on the response line + * is good enough. Browsers are lax when it comes to response + * line parsing. In most cases they will only look for the + * words "http" at the beginning. 
+ * + * @param[in] data pointer to bytearray + * @param[in] len length in bytes of data + * @return 1 for good enough or 0 for not good enough + */ +int htp_treat_response_line_as_body(const uint8_t *data, size_t len) { + // Browser behavior: + // Firefox 3.5.x: (?i)^\s*http + // IE: (?i)^\s*http\s*/ + // Safari: ^HTTP/\d+\.\d+\s+\d{3} + size_t pos = 0; + + if (data == NULL) return 1; + while ((pos < len) && (htp_is_space(data[pos]) || data[pos] == 0)) pos++; + + if (len < pos + 4) return 1; + + if ((data[pos] != 'H') && (data[pos] != 'h')) return 1; + if ((data[pos+1] != 'T') && (data[pos+1] != 't')) return 1; + if ((data[pos+2] != 'T') && (data[pos+2] != 't')) return 1; + if ((data[pos+3] != 'P') && (data[pos+3] != 'p')) return 1; + + return 0; +} + +/** + * Run the REQUEST_BODY_DATA hook. + * + * @param[in] connp + * @param[in] d + */ +htp_status_t htp_req_run_hook_body_data(htp_connp_t *connp, htp_tx_data_t *d) { + // Do not invoke callbacks with an empty data chunk + if ((d->data != NULL) && (d->len == 0)) return HTP_OK; + + // Do not invoke callbacks without a transaction. + if (connp->in_tx == NULL) return HTP_OK; + + // Run transaction hooks first + htp_status_t rc = htp_hook_run_all(connp->in_tx->hook_request_body_data, d); + if (rc != HTP_OK) return rc; + + // Run configuration hooks second + rc = htp_hook_run_all(connp->cfg->hook_request_body_data, d); + if (rc != HTP_OK) return rc; + + // On PUT requests, treat request body as file + if (connp->put_file != NULL) { + htp_file_data_t file_data; + + file_data.data = d->data; + file_data.len = d->len; + file_data.file = connp->put_file; + file_data.file->len += d->len; + + rc = htp_hook_run_all(connp->cfg->hook_request_file_data, &file_data); + if (rc != HTP_OK) return rc; + } + + return HTP_OK; +} + +/** + * Run the RESPONSE_BODY_DATA hook. 
+ * + * @param[in] connp + * @param[in] d + */ +htp_status_t htp_res_run_hook_body_data(htp_connp_t *connp, htp_tx_data_t *d) { + // Do not invoke callbacks with an empty data chunk. + if ((d->data != NULL) && (d->len == 0)) return HTP_OK; + + // Run transaction hooks first + htp_status_t rc = htp_hook_run_all(connp->out_tx->hook_response_body_data, d); + if (rc != HTP_OK) return rc; + + // Run configuration hooks second + rc = htp_hook_run_all(connp->cfg->hook_response_body_data, d); + if (rc != HTP_OK) return rc; + + return HTP_OK; +} + +/** + * Parses the provided memory region, extracting the double-quoted string. + * + * @param[in] data + * @param[in] len + * @param[out] out + * @param[out] endoffset + * @return HTP_OK on success, HTP_DECLINED if the input is not well formed, and HTP_ERROR on fatal errors. + */ +htp_status_t htp_extract_quoted_string_as_bstr(unsigned char *data, size_t len, bstr **out, size_t *endoffset) { + if ((data == NULL) || (out == NULL)) return HTP_ERROR; + + if (len == 0) return HTP_DECLINED; + + size_t pos = 0; + + // Check that the first character is a double quote. + if (data[pos] != '"') return HTP_DECLINED; + + // Step over the double quote. + pos++; + if (pos == len) return HTP_DECLINED; + + // Calculate the length of the resulting string. + size_t escaped_chars = 0; + while (pos < len) { + if (data[pos] == '\\') { + if (pos + 1 < len) { + escaped_chars++; + pos += 2; + continue; + } + } else if (data[pos] == '"') { + break; + } + + pos++; + } + + // Have we reached the end of input without seeing the terminating double quote? + if (pos == len) return HTP_DECLINED; + + // Copy the data and unescape it as necessary. 
+ size_t outlen = pos - 1 - escaped_chars; + *out = bstr_alloc(outlen); + if (*out == NULL) return HTP_ERROR; + unsigned char *outptr = bstr_ptr(*out); + size_t outpos = 0; + + pos = 1; + while ((pos < len) && (outpos < outlen)) { + // TODO We are not properly unescaping test here, we're only + // handling escaped double quotes. + if (data[pos] == '\\') { + if (pos + 1 < len) { + outptr[outpos++] = data[pos + 1]; + pos += 2; + continue; + } + } else if (data[pos] == '"') { + break; + } + + outptr[outpos++] = data[pos++]; + } + + bstr_adjust_len(*out, outlen); + + if (endoffset != NULL) { + *endoffset = pos; + } + + return HTP_OK; +} + +htp_status_t htp_parse_ct_header(bstr *header, bstr **ct) { + if ((header == NULL) || (ct == NULL)) return HTP_ERROR; + + unsigned char *data = bstr_ptr(header); + size_t len = bstr_len(header); + + // The assumption here is that the header value we receive + // here has been left-trimmed, which means the starting position + // is on the media type. On some platforms that may not be the + // case, and we may need to do the left-trim ourselves. + + // Find the end of the MIME type, using the same approach PHP 5.4.3 uses. + size_t pos = 0; + while ((pos < len) && (data[pos] != ';') && (data[pos] != ',') && (data[pos] != ' ')) pos++; + + *ct = bstr_dup_ex(header, 0, pos); + if (*ct == NULL) return HTP_ERROR; + + bstr_to_lowercase(*ct); + + return HTP_OK; +} + +/** + * Implements relaxed (not strictly RFC) hostname validation. + * + * @param[in] hostname + * @return 1 if the supplied hostname is valid; 0 if it is not. 
+ */ +int htp_validate_hostname(bstr *hostname) { + unsigned char *data = bstr_ptr(hostname); + size_t len = bstr_len(hostname); + size_t startpos = 0; + size_t pos = 0; + + if ((len == 0) || (len > 255)) return 0; + + if (data[0] == '[') { + // only ipv6 possible + if (len < 2 || len - 2 >= INET6_ADDRSTRLEN) { + return 0; + } + char dst[sizeof(struct in6_addr)]; + char str[INET6_ADDRSTRLEN]; + memcpy(str, data+1, len-2); + str[len-2] = 0; + return inet_pton(AF_INET6, str, dst); + } + while (pos < len) { + // Validate label characters. + startpos = pos; + while ((pos < len) && (data[pos] != '.')) { + unsigned char c = data[pos]; + // According to the RFC, the underscore is not allowed in a label, but + // we allow it here because we think it's often seen in practice. + if (!(((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z')) || + ((c >= '0') && (c <= '9')) || + (c == '-') || (c == '_'))) + { + return 0; + } + + pos++; + } + + // Validate label length. + if ((pos - startpos == 0) || (pos - startpos > 63)) return 0; + + if (pos >= len) return 1; // No more data after label. + + // How many dots are there? + startpos = pos; + while ((pos < len) && (data[pos] == '.')) pos++; + + if (pos - startpos != 1) return 0; // Exactly one dot expected. 
+ } + + return 1; +} + +void htp_uri_free(htp_uri_t *uri) { + if (uri == NULL) return; + + bstr_free(uri->scheme); + bstr_free(uri->username); + bstr_free(uri->password); + bstr_free(uri->hostname); + bstr_free(uri->port); + bstr_free(uri->path); + bstr_free(uri->query); + bstr_free(uri->fragment); + + free(uri); +} + +htp_uri_t *htp_uri_alloc(void) { + htp_uri_t *u = calloc(1, sizeof (htp_uri_t)); + if (u == NULL) return NULL; + + u->port_number = -1; + + return u; +} + +char *htp_get_version(void) { + return HTP_VERSION_STRING_FULL; +} + +/** + * Tells if a header value (haystack) contains a token (needle) + * This is done with a caseless comparison + * + * @param[in] hvp header value pointer + * @param[in] hvlen length of header value buffer + * @param[in] value token to look for (null-terminated string), should be a lowercase constant + * @return HTP_OK if the header has the token; HTP_ERROR if it has not. + */ +htp_status_t htp_header_has_token(const unsigned char *hvp, size_t hvlen, const unsigned char *value) { + int state = 0; + // offset to compare in value + size_t v_off = 0; + // The header value is a list of comma-separated tokens (with additional spaces) + for (size_t i = 0; i < hvlen; i++) { + switch (state) { + case 0: + if (v_off == 0 && htp_is_space(hvp[i])) { + // skip leading space + continue; + } + if (tolower(hvp[i]) == value[v_off]) { + v_off++; + if (value[v_off] == 0) { + // finish validation if end of token + state = 2; + } + continue; + } else { + // wait for a new token + v_off = 0; + state = 1; + } + // fallthrough + case 1: + if (hvp[i] == ',') { + // start of next token + state = 0; + } + break; + case 2: + if (hvp[i] == ',') { + return HTP_OK; + } + if (!htp_is_space(hvp[i])) { + // trailing junk in token, wait for a next one + v_off = 0; + state = 1; + } + } + } + if (state == 2) { + return HTP_OK; + } + return HTP_ERROR; +} |