summaryrefslogtreecommitdiffstats
path: root/src/libnetdata/url
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-07-24 09:54:23 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-07-24 09:54:44 +0000
commit836b47cb7e99a977c5a23b059ca1d0b5065d310e (patch)
tree1604da8f482d02effa033c94a84be42bc0c848c3 /src/libnetdata/url
parentReleasing debian version 1.44.3-2. (diff)
downloadnetdata-836b47cb7e99a977c5a23b059ca1d0b5065d310e.tar.xz
netdata-836b47cb7e99a977c5a23b059ca1d0b5065d310e.zip
Merging upstream version 1.46.3.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/libnetdata/url')
-rw-r--r--src/libnetdata/url/README.md14
-rw-r--r--src/libnetdata/url/url.c313
-rw-r--r--src/libnetdata/url/url.h31
3 files changed, 358 insertions, 0 deletions
diff --git a/src/libnetdata/url/README.md b/src/libnetdata/url/README.md
new file mode 100644
index 000000000..01a2dddb6
--- /dev/null
+++ b/src/libnetdata/url/README.md
@@ -0,0 +1,14 @@
+<!--
+title: "URL"
+custom_edit_url: https://github.com/netdata/netdata/edit/master/src/libnetdata/url/README.md
+sidebar_label: "URL"
+learn_status: "Published"
+learn_topic_type: "Tasks"
+learn_rel_path: "Developers/libnetdata"
+-->
+
+# URL
+
+The URL library contains common functions useful for URLs, like conversion from/to hex,
+URL encode/decode and query string parsing.
+
diff --git a/src/libnetdata/url/url.c b/src/libnetdata/url/url.c
new file mode 100644
index 000000000..720a703d4
--- /dev/null
+++ b/src/libnetdata/url/url.c
@@ -0,0 +1,313 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "../libnetdata.h"
+
+// ----------------------------------------------------------------------------
+// URL encode / decode
+// code from: http://www.geekhideout.com/urlcode.shtml
+
+/**
+ * Convert one hexadecimal digit character to its numeric value.
+ *
+ * @param ch a hex digit: '0'-'9', 'a'-'f' or 'A'-'F'
+ *
+ * @return 0..15 for a valid hex digit. The input is NOT validated: for any
+ *         other character the result is meaningless, so callers that handle
+ *         untrusted text must check the character first.
+ */
+char from_hex(char ch) {
+    // <ctype.h> functions require a value representable as unsigned char
+    // (or EOF); passing a negative plain char is undefined behavior.
+    unsigned char uc = (unsigned char)ch;
+
+    if (isdigit(uc))
+        return (char)(uc - '0');
+
+    return (char)(tolower(uc) - 'a' + 10);
+}
+
+/* Maps the low 4 bits of code to a lowercase hex digit ('0'-'9', 'a'-'f') */
+char to_hex(char code) {
+    // only the low nibble selects the digit; any higher bits are ignored
+    int nibble = code & 0x0F;
+
+    if (nibble < 10)
+        return (char)('0' + nibble);
+
+    return (char)('a' + (nibble - 10));
+}
+
+/**
+ * URL encode
+ *
+ * Letters, digits and '-', '_', '.', '~' are copied unchanged, a space
+ * becomes '+', and every other byte becomes '%' followed by two lowercase
+ * hex digits.
+ *
+ * @param str the NUL-terminated string to encode
+ *
+ * @return a newly allocated string (obtained from strdupz()).
+ *         IMPORTANT: the caller owns it and must free it after use.
+ */
+char *url_encode(char *str) {
+    // worst case: every input byte expands to "%XX", plus the terminator
+    char *scratch = mallocz(strlen(str) * 3 + 1);
+    char *out = scratch;
+
+    for (const char *in = str; *in; in++) {
+        uint8_t byte = (uint8_t)*in;
+
+        if (isalnum(byte) || byte == '-' || byte == '_' || byte == '.' || byte == '~') {
+            *out++ = *in;
+        }
+        else if (byte == ' ') {
+            *out++ = '+';
+        }
+        else {
+            *out++ = '%';
+            *out++ = to_hex((char)(byte >> 4));
+            *out++ = to_hex((char)(byte & 15));
+        }
+    }
+    *out = '\0';
+
+    // hand back a copy sized to the encoded text, not the worst-case scratch buffer
+    char *encoded = strdupz(scratch);
+    freez(scratch);
+    return encoded;
+}
+
+/**
+ * Percentage escape decode
+ *
+ * Decode one %XX escape. s[0] is not inspected (it is expected to be the
+ * '%'); s[1] and s[2] must both be hexadecimal digits.
+ *
+ * Note that a valid "%00" also decodes to 0, so callers cannot tell it apart
+ * from a failure.
+ *
+ * @param s the string to decode, pointing at the '%'
+ *
+ * @return The character decoded on success and 0 otherwise
+ */
+char url_percent_escape_decode(const char *s) {
+    // Checking s[1] before s[2] keeps the read inside the string when the
+    // escape is cut short. Rejecting non-hex characters prevents from_hex()
+    // from producing garbage (e.g. "%zz" used to decode to 'S') and avoids
+    // left-shifting a negative value, which is undefined behavior.
+    if(likely(s[1] && s[2] && isxdigit((uint8_t)s[1]) && isxdigit((uint8_t)s[2])))
+        return (char)((from_hex(s[1]) << 4) | from_hex(s[2]));
+
+    return 0;
+}
+
+/**
+ * Get byte length
+ *
+ * Derive the length of a UTF-8 sequence from its first byte, by counting the
+ * leading 1-bits of that byte.
+ *
+ * This (utf8 string related) should be moved in separate file in future
+ *
+ * @param c the first byte of the character
+ *
+ * @return 1 when IS_UTF8_BYTE(c) is false, 2..4 for a multi-byte lead byte,
+ *         and -1 when the byte cannot start a character (a single leading
+ *         1-bit, or more than four).
+ *         NOTE(review): the return type is plain char; on platforms where
+ *         char is unsigned the -1 reaches callers as 255.
+ */
+char url_utf8_get_byte_length(char c) {
+    if(!IS_UTF8_BYTE(c))
+        return 1;
+
+    // Count on an unsigned copy: left-shifting a negative (sign-extended)
+    // char is undefined behavior.
+    uint8_t bits = (uint8_t)c;
+    char length = 0;
+    while(likely(bits & 0x80)) {
+        length++;
+        bits = (uint8_t)(bits << 1);
+    }
+
+    //4 byte is max size for UTF-8 char
+    //10XX XXXX is not valid character -> check length == 1
+    if(length > 4 || length == 1)
+        return -1;
+
+    return length;
+}
+
+/**
+ * Decode Multibyte UTF8
+ *
+ * Decode one % encoded multi-byte UTF-8 character (a run of %XX escapes,
+ * one per byte) and copy its bytes to *d.
+ *
+ * @param s first address of the encoded character, pointing at its first '%'
+ * @param d where the decoded bytes are written
+ * @param d_end end of the destination; the character is rejected unless
+ *              d + length < d_end
+ *
+ * @return count of bytes written to *d, or 0 on any failure. On a failure in
+ *         the middle of a sequence some bytes may already have been written.
+ */
+char url_decode_multibyte_utf8(const char *s, char *d, const char *d_end) {
+    if(unlikely(*s != '%'))
+        return 0;
+
+    char first_byte = url_percent_escape_decode(s);
+
+    if(unlikely(!first_byte || !IS_UTF8_STARTBYTE(first_byte)))
+        return 0;
+
+    // Keep the length in an int and bound it from above as well: where plain
+    // char is unsigned, the -1 error value arrives here as 255.
+    int byte_length = url_utf8_get_byte_length(first_byte);
+
+    if(unlikely(byte_length <= 0 || byte_length > 4 || byte_length >= d_end - d))
+        return 0;
+
+    for(int to_read = byte_length; to_read > 0; to_read--) {
+        // Every byte must be its own %XX escape. Without this check a
+        // truncated sequence (e.g. "%C3" at the end of the input) would be
+        // read past the terminating NUL.
+        if(unlikely(*s != '%'))
+            return 0;
+
+        char c = url_percent_escape_decode(s);
+
+        if(unlikely( !IS_UTF8_BYTE(c) ))
+            return 0;
+
+        // only the first byte may be a start byte
+        if((to_read != byte_length) && IS_UTF8_STARTBYTE(c))
+            return 0;
+
+        *d++ = c;
+        s += 3;
+    }
+
+    return (char)byte_length;
+}
+
+/*
+ * The utf8_check() function scans the '\0'-terminated string starting
+ * at s. It returns a pointer to the first byte of the first malformed
+ * or overlong UTF-8 sequence found, or NULL if the string contains
+ * only correct UTF-8. It also spots UTF-8 sequences that could cause
+ * trouble if converted to UTF-16, namely surrogate characters
+ * (U+D800..U+DFFF) and non-Unicode positions (U+FFFE..U+FFFF). This
+ * routine is very likely to find a malformed sequence if the input
+ * uses any other encoding than UTF-8. It therefore can be used as a
+ * very effective heuristic for distinguishing between UTF-8 and other
+ * encodings.
+ *
+ * Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> -- 2005-03-30
+ * License: http://www.cl.cam.ac.uk/~mgk25/short-license.html
+ */
+unsigned char *utf8_check(unsigned char *s)
+{
+    unsigned char *p = s;
+
+    while (*p) {
+        const unsigned char lead = p[0];
+
+        /* 0xxxxxxx */
+        if (lead < 0x80) {
+            p += 1;
+            continue;
+        }
+
+        /* 110XXXXx 10xxxxxx */
+        if ((lead & 0xe0) == 0xc0) {
+            if ((p[1] & 0xc0) != 0x80)
+                return p;
+            if ((lead & 0xfe) == 0xc0) /* overlong? */
+                return p;
+
+            p += 2;
+            continue;
+        }
+
+        /* 1110XXXX 10Xxxxxx 10xxxxxx */
+        if ((lead & 0xf0) == 0xe0) {
+            /* continuation bytes are tested in order, so a string that */
+            /* ends early is rejected before reading past its terminator */
+            if ((p[1] & 0xc0) != 0x80)
+                return p;
+            if ((p[2] & 0xc0) != 0x80)
+                return p;
+            if (lead == 0xe0 && (p[1] & 0xe0) == 0x80) /* overlong? */
+                return p;
+            if (lead == 0xed && (p[1] & 0xe0) == 0xa0) /* surrogate? */
+                return p;
+            if (lead == 0xef && p[1] == 0xbf && (p[2] & 0xfe) == 0xbe) /* U+FFFE or U+FFFF? */
+                return p;
+
+            p += 3;
+            continue;
+        }
+
+        /* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */
+        if ((lead & 0xf8) == 0xf0) {
+            if ((p[1] & 0xc0) != 0x80)
+                return p;
+            if ((p[2] & 0xc0) != 0x80)
+                return p;
+            if ((p[3] & 0xc0) != 0x80)
+                return p;
+            if (lead == 0xf0 && (p[1] & 0xf0) == 0x80) /* overlong? */
+                return p;
+            if ((lead == 0xf4 && p[1] > 0x8f) || lead > 0xf4) /* > U+10FFFF? */
+                return p;
+
+            p += 4;
+            continue;
+        }
+
+        /* a lone continuation byte, or a lead byte that is never valid */
+        return p;
+    }
+
+    return NULL;
+}
+
+/**
+ * URL decode into a caller supplied buffer
+ *
+ * Copies url to 'to', turning '+' into a space and decoding %XX escapes.
+ * Escapes that decode to a UTF-8 byte are handled as whole multi-byte
+ * characters; any other escape must decode to a printable character.
+ * Decoding stops silently when the buffer is full (the output is truncated).
+ *
+ * @param to   the destination buffer
+ * @param url  the NUL-terminated URL encoded input
+ * @param size the size of 'to' in bytes, including room for the terminator
+ *
+ * @return 'to' on success. NULL when size is 0, when an escape is invalid or
+ *         decodes to a non-printable character, or when the decoded text is
+ *         not valid UTF-8. Except for size 0, 'to' is NUL-terminated even on
+ *         failure.
+ */
+char *url_decode_r(char *to, const char *url, size_t size) {
+    // With size == 0 there is no room even for the terminator, and
+    // &to[size - 1] would wrap around and point outside the buffer.
+    if(unlikely(!size))
+        return NULL;
+
+    const char *s = url;           // source
+    char *d = to,                  // destination
+         *e = &to[size - 1];       // destination end (reserved for the terminator)
+
+    while(*s && d < e) {
+        if(unlikely(*s == '%')) {
+            char t = url_percent_escape_decode(s);
+
+            if(IS_UTF8_BYTE(t)) {
+                char bytes_written = url_decode_multibyte_utf8(s, d, e);
+                if(unlikely(!bytes_written))
+                    goto fail_cleanup;
+
+                d += bytes_written;
+                // each byte consumed a 3 character escape; the final
+                // character is skipped by the s++ at the end of the loop
+                s += (bytes_written * 3) - 1;
+            }
+            else if(likely(t) && isprint((uint8_t)t)) {
+                // avoid HTTP header injection
+                *d++ = t;
+                s += 2;
+            }
+            else
+                goto fail_cleanup;
+        }
+        else if(unlikely(*s == '+'))
+            *d++ = ' ';
+
+        else
+            *d++ = *s;
+
+        s++;
+    }
+
+    *d = '\0';
+
+    if(unlikely( utf8_check((unsigned char *)to) )) //NULL means success here
+        return NULL;
+
+    return to;
+
+fail_cleanup:
+    *d = '\0';
+    return NULL;
+}
+
+/**
+ * Is the request complete?
+ *
+ * Decide whether the HTTP request buffered in [begin, end) has been fully
+ * received. For requests starting with "POST " or "PUT " the payload must be
+ * exactly as long as the Content-Length header says; it is then copied to
+ * *post_payload together with its content type. For every other request the
+ * check is whether "\r\n\r\n" is found starting at end - 4.
+ *
+ * @param begin the start of the buffered request (NUL-terminated text)
+ * @param end the end of the buffered request
+ * @param length the number of bytes buffered
+ * @param post_payload receives the payload of POST/PUT requests; the buffer
+ *                     is created when *post_payload is NULL
+ *
+ * @return true when the request is complete, false when more data is needed
+ */
+inline bool
+url_is_request_complete_and_extract_payload(const char *begin, const char *end, size_t length, BUFFER **post_payload) {
+    if (begin == end || length < 4)
+        return false;
+
+    if(likely(strncmp(begin, "GET ", 4) == 0)) {
+        return strstr(end - 4, "\r\n\r\n") != NULL;
+    }
+    else if(unlikely(strncmp(begin, "POST ", 5) == 0 || strncmp(begin, "PUT ", 4) == 0)) {
+        const char *cl = strcasestr(begin, "Content-Length: ");
+        if(!cl) return false;
+        cl = &cl[16]; // skip "Content-Length: "
+
+        size_t content_length = str2ul(cl);
+
+        // the payload starts right after the empty line that ends the headers
+        const char *payload = strstr(cl, "\r\n\r\n");
+        if(!payload) return false;
+        payload += 4;
+
+        size_t payload_length = length - (payload - begin);
+
+        if(payload_length != content_length)
+            return false;
+
+        if(!*post_payload)
+            *post_payload = buffer_create(payload_length + 1, NULL);
+
+        buffer_contents_replace(*post_payload, payload, payload_length);
+
+        // parse the content type
+        const char *ct = strcasestr(begin, "Content-Type: ");
+        if(ct) {
+            ct = &ct[14]; // skip "Content-Type: "
+            while (*ct && isspace((uint8_t)*ct)) ct++;
+
+            // the value ends at the first whitespace or ';'
+            const char *space = ct;
+            while (*space && !isspace((uint8_t)*space) && *space != ';') space++;
+            size_t ct_len = space - ct;
+
+            // The length comes from the client, so the copy is made on the
+            // heap instead of a variable length array on the stack.
+            char *ct_copy = mallocz(ct_len + 1);
+            memcpy(ct_copy, ct, ct_len);
+            ct_copy[ct_len] = '\0';
+
+            (*post_payload)->content_type = content_type_string2id(ct_copy);
+            freez(ct_copy);
+        }
+        else
+            (*post_payload)->content_type = CT_TEXT_PLAIN;
+
+        return true;
+    }
+    else {
+        return strstr(end - 4, "\r\n\r\n") != NULL;
+    }
+}
+
+/**
+ * Find protocol
+ *
+ * Search for the string ' HTTP/' in the message given.
+ *
+ * @param s is the start of the user request (NUL-terminated).
+ *
+ * @return a pointer to the space of the first ' HTTP/' found, or a pointer to
+ *         the terminating NUL of s when there is none.
+ */
+char *url_find_protocol(char *s) {
+    while(*s) {
+        // find the next space
+        while (*s && *s != ' ') s++;
+
+        // End of the string: stop here. Stepping further would move past the
+        // terminator and make the outer loop read out of bounds.
+        if(!*s) break;
+
+        // is it SPACE + "HTTP/" ?
+        if(!strncmp(s, " HTTP/", 6)) break;
+
+        s++;
+    }
+
+    return s;
+}
diff --git a/src/libnetdata/url/url.h b/src/libnetdata/url/url.h
new file mode 100644
index 000000000..f7a67dd5c
--- /dev/null
+++ b/src/libnetdata/url/url.h
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef NETDATA_URL_H
+#define NETDATA_URL_H 1
+
+#include "../libnetdata.h"
+
+// ----------------------------------------------------------------------------
+// URL encode / decode
+// code from: http://www.geekhideout.com/urlcode.shtml
+
+/* Converts a hex digit character ('0'-'9', 'a'-'f', 'A'-'F') to its integer value. */
+/* The character is not validated; other input gives a meaningless result. */
+char from_hex(char ch);
+
+/* Converts an integer value to its lowercase hex character; only the low 4 bits of code are used */
+char to_hex(char code);
+
+/* Returns a url-encoded version of str: letters, digits and - _ . ~ are kept, */
+/* a space becomes '+', every other byte becomes %xx (lowercase hex) */
+/* IMPORTANT: be sure to free() the returned string after use */
+char *url_encode(char *str);
+
+/* Returns a url-decoded version of str */
+/* IMPORTANT: be sure to free() the returned string after use */
+/* NOTE(review): no definition of url_decode() is visible in url.c in this change - confirm it exists */
+char *url_decode(char *str);
+
+/* Decodes the url-encoded string url into the buffer to, of size bytes ('+' becomes a space, */
+/* %XX escapes are decoded). Returns to on success, or NULL when an escape is invalid, */
+/* decodes to a non-printable character, or the decoded text is not valid UTF-8. */
+char *url_decode_r(char *to, const char *url, size_t size);
+
+/* Returns true when the HTTP request buffered in [begin, end) (length bytes) is complete. */
+/* For "POST " and "PUT " requests the payload must match Content-Length; it is then copied, */
+/* with its content type, to *post_payload (which is created when *post_payload is NULL). */
+bool url_is_request_complete_and_extract_payload(const char *begin, const char *end, size_t length, BUFFER **post_payload);
+/* Searches s for " HTTP/"; when it is present, returns a pointer to its leading space */
+char *url_find_protocol(char *s);
+
+#endif /* NETDATA_URL_H */