Diffstat
-rw-r--r--  libnetdata/url/url.c  334
-rw-r--r--  libnetdata/url/url.h    7
2 files changed, 336 insertions, 5 deletions
diff --git a/libnetdata/url/url.c b/libnetdata/url/url.c
index 07a9f8069..7df9faaf0 100644
--- a/libnetdata/url/url.c
+++ b/libnetdata/url/url.c
@@ -43,8 +43,16 @@ char *url_encode(char *str) {
return pbuf;
}
-/* Returns a url-decoded version of str */
-/* IMPORTANT: be sure to free() the returned string after use */
+/**
+ * URL Decode
+ *
+ * Returns a url-decoded version of str
+ * IMPORTANT: be sure to free() the returned string after use
+ *
+ * @param str the string that will be decoded
+ *
+ * @return a pointer to the decoded URL.
+ */
char *url_decode(char *str) {
size_t size = strlen(str) + 1;
@@ -52,6 +60,149 @@ char *url_decode(char *str) {
return url_decode_r(buf, str, size);
}
+/**
+ * Percentage escape decode
+ *
+ * Decode a %XX escape sequence, or return 0 if it cannot be decoded.
+ *
+ * @param s the string to decode
+ *
+ * @return The character decoded on success and 0 otherwise
+ */
+char url_percent_escape_decode(char *s) {
+ if(likely(s[1] && s[2]))
+ return from_hex(s[1]) << 4 | from_hex(s[2]);
+ return 0;
+}
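As a quick sketch of the behaviour (variable names are just for illustration, assuming the function above is in scope): a well-formed escape decodes to its byte value, while a truncated one yields 0.

    char ok[]  = "%41";
    char bad[] = "%4";

    char a = url_percent_escape_decode(ok);    // 'A' (0x41)
    char b = url_percent_escape_decode(bad);   // 0: the second hex digit is missing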
+
+/**
+ * Get byte length
+ *
+ * This UTF-8 helper should be moved to a separate file in the future.
+ *
+ * @param c the first byte of a UTF-8 character
+ *
+ * @return the byte length of the UTF-8 character (1 for plain ASCII), or -1 when c is not a valid start byte.
+ */
+char url_utf8_get_byte_length(char c) {
+ if(!IS_UTF8_BYTE(c))
+ return 1;
+
+ char length = 0;
+ while(likely(c & 0x80)) {
+ length++;
+ c <<= 1;
+ }
+ //4 bytes is the maximum length of a UTF-8 character
+ //10xxxxxx is a continuation byte, not a valid start byte -> reject length == 1
+ if(length > 4 || length == 1)
+ return -1;
+
+ return length;
+}
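For illustration (expected return values shown in the comments), the length is derived from the number of leading 1 bits of the first byte:

    char l1 = url_utf8_get_byte_length('A');       // 1: plain ASCII byte
    char l2 = url_utf8_get_byte_length('\xC3');    // 2: start byte of a 2-byte sequence
    char l3 = url_utf8_get_byte_length('\xF0');    // 4: start byte of a 4-byte sequence
    char l4 = url_utf8_get_byte_length('\xA9');    // -1: lone continuation byte (10xxxxxx)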
+
+/**
+ * Decode Multibyte UTF8
+ *
+ * Decode a percent-encoded UTF-8 character and copy its bytes to d.
+ *
+ * @param s the source string, positioned at the '%' that starts the sequence
+ * @param d the destination buffer
+ * @param d_end the end of the destination buffer
+ *
+ * @return the number of bytes written to d, or 0 when the sequence is not valid percent-encoded UTF-8
+ */
+char url_decode_multibyte_utf8(char *s, char *d, char *d_end) {
+ char first_byte = url_percent_escape_decode(s);
+
+ if(unlikely(!first_byte || !IS_UTF8_STARTBYTE(first_byte)))
+ return 0;
+
+ char byte_length = url_utf8_get_byte_length(first_byte);
+
+ if(unlikely(byte_length <= 0 || d+byte_length >= d_end))
+ return 0;
+
+ char to_read = byte_length;
+ while(to_read > 0) {
+ char c = url_percent_escape_decode(s);
+
+ if(unlikely( !IS_UTF8_BYTE(c) ))
+ return 0;
+ if((to_read != byte_length) && IS_UTF8_STARTBYTE(c))
+ return 0;
+
+ *d++ = c;
+ s+=3;
+ to_read--;
+ }
+
+ return byte_length;
+}
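A minimal sketch of how this helper can be used (the src/buf names are illustrative): a percent-encoded UTF-8 character such as "%C3%A9" ('é') is copied into the destination as its raw two-byte sequence.

    char src[] = "%C3%A9";                 // 'é' encoded as two %XX escapes
    char buf[8];

    char written = url_decode_multibyte_utf8(src, buf, &buf[sizeof(buf) - 1]);
    if(written > 0)
        buf[(int)written] = '\0';          // buf now holds the two bytes 0xC3 0xA9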
+
+/*
+ * The utf8_check() function scans the '\0'-terminated string starting
+ * at s. It returns a pointer to the first byte of the first malformed
+ * or overlong UTF-8 sequence found, or NULL if the string contains
+ * only correct UTF-8. It also spots UTF-8 sequences that could cause
+ * trouble if converted to UTF-16, namely surrogate characters
+ * (U+D800..U+DFFF) and non-Unicode positions (U+FFFE..U+FFFF). This
+ * routine is very likely to find a malformed sequence if the input
+ * uses any other encoding than UTF-8. It therefore can be used as a
+ * very effective heuristic for distinguishing between UTF-8 and other
+ * encodings.
+ *
+ * Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> -- 2005-03-30
+ * License: http://www.cl.cam.ac.uk/~mgk25/short-license.html
+ */
+unsigned char *utf8_check(unsigned char *s)
+{
+ while (*s)
+ {
+ if (*s < 0x80)
+ /* 0xxxxxxx */
+ s++;
+ else if ((s[0] & 0xe0) == 0xc0)
+ {
+ /* 110XXXXx 10xxxxxx */
+ if ((s[1] & 0xc0) != 0x80 ||
+ (s[0] & 0xfe) == 0xc0) /* overlong? */
+ return s;
+ else
+ s += 2;
+ }
+ else if ((s[0] & 0xf0) == 0xe0)
+ {
+ /* 1110XXXX 10Xxxxxx 10xxxxxx */
+ if ((s[1] & 0xc0) != 0x80 ||
+ (s[2] & 0xc0) != 0x80 ||
+ (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) || /* overlong? */
+ (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) || /* surrogate? */
+ (s[0] == 0xef && s[1] == 0xbf &&
+ (s[2] & 0xfe) == 0xbe)) /* U+FFFE or U+FFFF? */
+ return s;
+ else
+ s += 3;
+ }
+ else if ((s[0] & 0xf8) == 0xf0)
+ {
+ /* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */
+ if ((s[1] & 0xc0) != 0x80 ||
+ (s[2] & 0xc0) != 0x80 ||
+ (s[3] & 0xc0) != 0x80 ||
+ (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) || /* overlong? */
+ (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) /* > U+10FFFF? */
+ return s;
+ else
+ s += 4;
+ }
+ else
+ return s;
+ }
+
+ return NULL;
+}
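For instance (illustrative buffers), a stray Latin-1 byte is reported while proper UTF-8 passes:

    unsigned char good[] = "caf\xC3\xA9";    // "café" in UTF-8
    unsigned char bad[]  = "caf\xE9";        // lone 0xE9: Latin-1, not UTF-8

    unsigned char *p1 = utf8_check(good);    // NULL: the string is valid UTF-8
    unsigned char *p2 = utf8_check(bad);     // points at the offending 0xE9 byte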
+
char *url_decode_r(char *to, char *url, size_t size) {
char *s = url, // source
*d = to, // destination
@@ -59,12 +210,24 @@ char *url_decode_r(char *to, char *url, size_t size) {
while(*s && d < e) {
if(unlikely(*s == '%')) {
- if(likely(s[1] && s[2])) {
- char t = from_hex(s[1]) << 4 | from_hex(s[2]);
+ char t = url_percent_escape_decode(s);
+ if(IS_UTF8_BYTE(t)) {
+ char bytes_written = url_decode_multibyte_utf8(s, d, e);
+ if(likely(bytes_written)){
+ d += bytes_written;
+ s += (bytes_written * 3)-1;
+ }
+ else {
+ goto fail_cleanup;
+ }
+ }
+ else if(likely(t) && isprint(t)) {
// avoid HTTP header injection
- *d++ = (char)((isprint(t))? t : ' ');
+ *d++ = t;
s += 2;
}
+ else
+ goto fail_cleanup;
}
else if(unlikely(*s == '+'))
*d++ = ' ';
@@ -77,5 +240,166 @@ char *url_decode_r(char *to, char *url, size_t size) {
*d = '\0';
+ if(unlikely( utf8_check((unsigned char *)to) )) // NULL means success here
+ return NULL;
+
return to;
+
+fail_cleanup:
+ *d = '\0';
+ return NULL;
+}
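A rough usage sketch of the updated decoder (buffer and input are illustrative; assumes <stdio.h> for the printf calls): the caller supplies the destination buffer and its size, and a NULL return now signals a malformed escape or invalid UTF-8 instead of a silently mangled string.

    char out[64];
    char in[] = "hello%20w%C3%B6rld+1";

    if(url_decode_r(out, in, sizeof(out)))
        printf("decoded: '%s'\n", out);     // "hello wörld 1"
    else
        printf("rejected: bad escape or invalid UTF-8\n");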
+
+/**
+ * Is request complete?
+ *
+ * Check whether the request is complete.
+ * This function cannot handle every request method; for example, it will fail for a POST request that carries a body.
+ *
+ * @param begin is the first character of the sequence to analyse.
+ * @param end is the last character of the sequence
+ * @param length the total number of bytes read; it is not the difference between end and begin.
+ *
+ * @return It returns 1 when the request is complete and 0 otherwise.
+ */
+inline int url_is_request_complete(char *begin, char *end, size_t length) {
+
+ if (begin == end) {
+ //The request cannot be complete when the first and last addresses are the same
+ return 0;
+ }
+
+ //When more than 4 bytes were read, only the last 4 need to be checked for the "\r\n\r\n" terminator
+ if (length > 4) {
+ begin = end - 4;
+ }
+
+ return (strstr(begin, "\r\n\r\n"))?1:0;
+}
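For example (request text is illustrative; assumes <string.h> for strlen), a GET request only counts as complete once the blank line terminating the header block has arrived:

    char full[] = "GET /api/v1/info HTTP/1.1\r\nHost: localhost\r\n\r\n";
    int done = url_is_request_complete(full, &full[strlen(full)], strlen(full));         // 1

    char partial[] = "GET /api/v1/info HTTP/1.1\r\nHost:";
    done = url_is_request_complete(partial, &partial[strlen(partial)], strlen(partial)); // 0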
+
+/**
+ * Find protocol
+ *
+ * Search for the string ' HTTP/' in the message given.
+ *
+ * @param s is the start of the user request.
+ * @return a pointer to the " HTTP/" token, or to the terminating NUL when it is not present.
+ */
+inline char *url_find_protocol(char *s) {
+ while(*s) {
+ // find the next space
+ while (*s && *s != ' ') s++;
+
+ // is it SPACE + "HTTP/" ?
+ if(*s && !strncmp(s, " HTTP/", 6)) break;
+ else if(*s) s++; // do not step past the terminating NUL
+ }
+
+ return s;
+}
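For example (the request line is illustrative), the returned pointer lands on the protocol token, or on the terminating NUL when none is present:

    char line[] = "GET /api/v1/charts HTTP/1.1\r\n";
    char *proto = url_find_protocol(line);      // points at " HTTP/1.1\r\n"

    char bare[] = "GET /api/v1/charts";
    proto = url_find_protocol(bare);            // points at the terminating '\0'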
+
+/**
+ * Map query string
+ *
+ * Map the query string fields that will be decoded.
+ * This function must be called only after checking for the presence of a query string;
+ * it assumes the caller has already verified that one exists.
+ *
+ * @param out the array of pointers that will receive the map
+ * @param url the input url that we are decoding.
+ *
+ * @return It returns the total number of variables in the query string.
+ */
+int url_map_query_string(char **out, char *url) {
+ int count = 0;
+
+ //First we try to parse assuming the separators were not percent-encoded
+ char *moveme = url;
+ char *ptr;
+
+ //There is always at least one field, so the first entry can be set unconditionally.
+ out[count++] = moveme;
+ while(moveme) {
+ ptr = strchr((moveme+1), '&');
+ if(ptr) {
+ out[count++] = ptr;
+ }
+
+ moveme = ptr;
+ }
+
+ //No plain '&' separator was found, so look for percent-encoded markers instead
+ if (count == 1) {
+ moveme = url;
+ while(moveme) {
+ ptr = strchr((moveme+1), '%');
+ if(ptr) {
+ char *test = (ptr+1);
+ if (!strncmp(test, "3f", 2) || !strncmp(test, "3F", 2)) {
+ out[count++] = ptr;
+ }
+ }
+ moveme = ptr;
+ }
+ }
+
+ return count;
+}
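As a sketch of the mapping step (variable names are illustrative; WEB_FIELDS_MAX comes from url.h below), each map entry points either at the start of the query string or at a separator inside it:

    char *map[WEB_FIELDS_MAX];
    char query[] = "chart=system.cpu&format=json&points=300";

    int fields = url_map_query_string(map, query);
    // fields == 3: map[0] -> "chart=...", map[1] -> "&format=...", map[2] -> "&points=..."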
+
+/**
+ * Parse query string
+ *
+ * Parse the query string mapped and store it inside output.
+ *
+ * @param output the buffer where the decoded string will be stored.
+ * @param max the maximum length of the output
+ * @param map the map built by url_map_query_string.
+ * @param total the total number of variables inside map
+ *
+ * @return It returns 0 on success and -1 otherwise
+ */
+int url_parse_query_string(char *output, size_t max, char **map, int total) {
+ if(!total) {
+ return 0;
+ }
+
+ int counter, next;
+ size_t length;
+ char *end;
+ char *begin = map[0];
+ char save;
+ size_t copied = 0;
+ for(counter = 0, next=1 ; next <= total ; ++counter, ++next) {
+ if (next != total) {
+ end = map[next];
+ length = (size_t) (end - begin);
+ save = *end;
+ *end = 0x00;
+ } else {
+ length = strlen(begin);
+ end = NULL;
+ }
+ length++;
+
+ if (length > (max - copied)) {
+ error("Parsing query string: we cannot parse a query string so big");
+ break;
+ }
+
+ if(!url_decode_r(output, begin, length)) {
+ return -1;
+ }
+ length = strlen(output);
+ copied += length;
+ output += length;
+
+ begin = end;
+ if (begin) {
+ *begin = save;
+ }
+ }
+
+ return 0;
}
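Putting the two steps together (a minimal sketch; the buffer sizes are arbitrary), the mapped fields are decoded back to back into one output buffer:

    char *map[WEB_FIELDS_MAX];
    char decoded[1024];
    char query[] = "chart=system.cpu&after=-600&dimensions=user%2Csystem";

    int total = url_map_query_string(map, query);
    if(!url_parse_query_string(decoded, sizeof(decoded), map, total)) {
        // decoded now holds "chart=system.cpu&after=-600&dimensions=user,system"
    }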
diff --git a/libnetdata/url/url.h b/libnetdata/url/url.h
index 6cef6d7a8..10f3fe176 100644
--- a/libnetdata/url/url.h
+++ b/libnetdata/url/url.h
@@ -25,4 +25,11 @@ extern char *url_decode(char *str);
extern char *url_decode_r(char *to, char *url, size_t size);
+#define WEB_FIELDS_MAX 400
+extern int url_map_query_string(char **out, char *url);
+extern int url_parse_query_string(char *output, size_t max, char **map, int total);
+
+extern int url_is_request_complete(char *begin, char *end, size_t length);
+extern char *url_find_protocol(char *s);
+
#endif /* NETDATA_URL_H */