author:    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-05 11:19:16 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org>  2024-07-24 09:53:24 +0000
commit:    b5f8ee61a7f7e9bd291dd26b0585d03eb686c941
tree:      d4d31289c39fc00da064a825df13a0b98ce95b10 /src/libnetdata
parent:    Adding upstream version 1.44.3.
Adding upstream version 1.46.3.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/libnetdata')
213 files changed, 76772 insertions, 0 deletions
diff --git a/src/libnetdata/README.md b/src/libnetdata/README.md
new file mode 100644
index 00000000..fd2c7973
--- /dev/null
+++ b/src/libnetdata/README.md
@@ -0,0 +1,14 @@
+<!--
+title: "libnetdata"
+custom_edit_url: https://github.com/netdata/netdata/edit/master/src/libnetdata/README.md
+sidebar_label: "libnetdata"
+learn_status: "Published"
+learn_topic_type: "Tasks"
+learn_rel_path: "Developers/libnetdata"
+-->
+
+# libnetdata
+
+`libnetdata` is a collection of library code that is used by all Netdata `C` programs.
+
diff --git a/src/libnetdata/adaptive_resortable_list/README.md b/src/libnetdata/adaptive_resortable_list/README.md
new file mode 100644
index 00000000..9aa864c9
--- /dev/null
+++ b/src/libnetdata/adaptive_resortable_list/README.md
@@ -0,0 +1,103 @@
+<!--
+title: "Adaptive Re-sortable List (ARL)"
+custom_edit_url: https://github.com/netdata/netdata/edit/master/src/libnetdata/adaptive_resortable_list/README.md
+sidebar_label: "Adaptive Re-sortable List (ARL)"
+learn_status: "Published"
+learn_topic_type: "Tasks"
+learn_rel_path: "Developers/libnetdata"
+-->
+
+# Adaptive Re-sortable List (ARL)
+
+This library allows Netdata to read a series of `name - value` pairs
+in the **fastest possible way**.
+
+ARLs are used all over Netdata, as they are the most CPU-efficient way
+to process `/proc` files. They are used to process both vertical (CSV-like)
+and horizontal (one pair per line) `name - value` pairs.
+
+## How ARL works
+
+It maintains a linked list of all `NAME` keywords, sorted in the order
+they are found in the data source. The linked list is kept sorted at all
+times: the data source may change at any time, and the linked list will
+adapt at the next iteration.
+
+### Initialization
+
+During initialization (just once), the caller:
+
+- calls `arl_create()` to create the ARL
+
+- calls `arl_expect()` multiple times to register the expected keywords
+
+The library will call the `processor()` function (given to
+`arl_create()`) for each expected keyword found.
+The default `processor()` expects `dst` to be an `unsigned long long *`.
+
+Each `name` keyword may have a different `processor()` (by calling
+`arl_expect_custom()` instead of `arl_expect()`).
+
+### Data collection iterations
+
+For each iteration through the data source, the caller:
+
+- calls `arl_begin()` to initiate a data collection iteration.
+  This is to be called just ONCE every time the source is re-evaluated.
+
+- calls `arl_check()` for each entry read from the file.
+
+### Cleanup
+
+When the caller exits:
+
+- calls `arl_free()` to destroy the ARL and free all memory.
+
+### Performance
+
+ARL maintains a list of the `name` keywords found in the data source
+(even the ones that are not useful for data collection).
+
+If the data source maintains the same order of `name-value` pairs, each
+call to `arl_check()` executes a single `strcmp()` to verify that the
+expected order has not changed, increments a counter and advances a pointer.
+So, if the data source has 100 `name-value` pairs and their order remains
+constant over time, each iteration executes just 100 successful `strcmp()`
+calls.
+
+In the unlikely event that an iteration sees the data source in a different
+order, a full search of the remaining keywords is made for each out-of-order
+keyword. But this search uses 32-bit hashes, not string comparisons, so it
+should also be fast.
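+
+In practice, a data collector drives the ARL like this - a minimal sketch,
+where `lines`, `name_of_line()` and `value_of_line()` are hypothetical
+stand-ins for the caller's own tokenizer:
+
+```c
+// initialization, once (hypothetical keywords from /proc/meminfo)
+static unsigned long long mem_total = 0, mem_free = 0;
+ARL_BASE *arl = arl_create("meminfo", NULL, 60); // NULL = default str2ull processor
+arl_expect(arl, "MemTotal", &mem_total);
+arl_expect(arl, "MemFree", &mem_free);
+
+// on every data collection iteration
+arl_begin(arl);
+for(size_t l = 0; l < lines; l++)
+    if(arl_check(arl, name_of_line(l), value_of_line(l)))
+        break; // all expected keywords found - stop reading early
+
+// on exit
+arl_free(arl);
+```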
+
+When all expectations are satisfied (even in the middle of an iteration),
+the call to `arl_check()` will return 1, to signal the caller to stop the
+loop, saving valuable CPU resources for the rest of the data source.
+
+In the following test we used alternative methods to process, **1M times**,
+a data source like `/proc/meminfo`, already tokenized, in memory,
+to extract the same number of expected metrics:
+
+|test|code|string comparison|number parsing|duration|
+|:--:|:--:|:---------------:|:------------:|:------:|
+|1|if-else-if-else-if|`strcmp()`|`strtoull()`|4630.337 ms|
+|2|nested loops|inline `simple_hash()` and `strcmp()`|`strtoull()`|1597.481 ms|
+|3|nested loops|inline `simple_hash()` and `strcmp()`|`str2ull()`|923.523 ms|
+|4|if-else-if-else-if|inline `simple_hash()` and `strcmp()`|`strtoull()`|854.574 ms|
+|5|if-else-if-else-if|statement expression `simple_hash()` and `strcmp()`|`strtoull()`|912.013 ms|
+|6|if-continue|inline `simple_hash()` and `strcmp()`|`strtoull()`|842.279 ms|
+|7|if-else-if-else-if|inline `simple_hash()` and `strcmp()`|`str2ull()`|602.837 ms|
+|8|ARL|ARL|`strtoull()`|350.360 ms|
+|9|ARL|ARL|`str2ull()`|157.237 ms|
+
+Compared to unoptimized code (test No 1: 4.6 sec):
+
+- before ARL, Netdata used test No **7**, with hashing and a custom `str2ull()`, to achieve 602 ms.
+- the current ARL implementation is test No **9**, which needs only 157 ms (29 times faster than the unoptimized code, and about 4 times faster than the previously optimized code).
+
+[Check the source code of this test](https://raw.githubusercontent.com/netdata/netdata/master/tests/profile/benchmark-value-pairs.c).
+
+## Limitations
+
+Do not use ARL if a name/keyword may appear more than once in the
+source data.
+
diff --git a/src/libnetdata/adaptive_resortable_list/adaptive_resortable_list.c b/src/libnetdata/adaptive_resortable_list/adaptive_resortable_list.c
new file mode 100644
index 00000000..b645927d
--- /dev/null
+++ b/src/libnetdata/adaptive_resortable_list/adaptive_resortable_list.c
@@ -0,0 +1,280 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "../libnetdata.h"
+
+// the default processor() of the ARL
+// can be overwritten at arl_create()
+inline void arl_callback_str2ull(const char *name, uint32_t hash, const char *value, void *dst) {
+    (void)name;
+    (void)hash;
+
+    register unsigned long long *d = dst;
+    *d = str2ull(value, NULL);
+    // fprintf(stderr, "name '%s' with hash %u and value '%s' is %llu\n", name, hash, value, *d);
+}
+
+inline void arl_callback_str2kernel_uint_t(const char *name, uint32_t hash, const char *value, void *dst) {
+    (void)name;
+    (void)hash;
+
+    register kernel_uint_t *d = dst;
+    *d = str2kernel_uint_t(value);
+    // fprintf(stderr, "name '%s' with hash %u and value '%s' is %llu\n", name, hash, value, (unsigned long long)*d);
+}
+
+inline void arl_callback_ssize_t(const char *name, uint32_t hash, const char *value, void *dst) {
+    (void)name;
+    (void)hash;
+
+    register ssize_t *d = dst;
+    *d = (ssize_t)str2ll(value, NULL);
+    // fprintf(stderr, "name '%s' with hash %u and value '%s' is %zd\n", name, hash, value, *d);
+}
+
+// create a new ARL
+ARL_BASE *arl_create(const char *name, void (*processor)(const char *, uint32_t, const char *, void *), size_t rechecks) {
+    ARL_BASE *base = callocz(1, sizeof(ARL_BASE));
+
+    base->name = strdupz(name);
+
+    if(!processor)
+        base->processor = arl_callback_str2ull;
+    else
+        base->processor = processor;
+
+    base->rechecks = rechecks;
+
+    return base;
+}
+
+void arl_free(ARL_BASE *arl_base) {
+    if(unlikely(!arl_base))
return; + + while(arl_base->head) { + ARL_ENTRY *e = arl_base->head; + arl_base->head = e->next; + + freez(e->name); +#ifdef NETDATA_INTERNAL_CHECKS + memset(e, 0, sizeof(ARL_ENTRY)); +#endif + freez(e); + } + + freez(arl_base->name); + +#ifdef NETDATA_INTERNAL_CHECKS + memset(arl_base, 0, sizeof(ARL_BASE)); +#endif + + freez(arl_base); +} + +void arl_begin(ARL_BASE *base) { + +#ifdef NETDATA_INTERNAL_CHECKS + if(likely(base->iteration > 10)) { + // do these checks after the ARL has been sorted + + if(unlikely(base->relinkings > (base->expected + base->allocated))) + netdata_log_info("ARL '%s' has %zu relinkings with %zu expected and %zu allocated entries. Is the source changing so fast?" + , base->name, base->relinkings, base->expected, base->allocated); + + if(unlikely(base->slow > base->fast)) + netdata_log_info("ARL '%s' has %zu fast searches and %zu slow searches. Is the source really changing so fast?" + , base->name, base->fast, base->slow); + + /* + if(unlikely(base->iteration % 60 == 0)) { + netdata_log_info("ARL '%s' statistics: iteration %zu, expected %zu, wanted %zu, allocated %zu, fred %zu, relinkings %zu, found %zu, added %zu, fast %zu, slow %zu" + , base->name + , base->iteration + , base->expected + , base->wanted + , base->allocated + , base->fred + , base->relinkings + , base->found + , base->added + , base->fast + , base->slow + ); + // for(e = base->head; e; e = e->next) fprintf(stderr, "%s ", e->name); + // fprintf(stderr, "\n"); + } + */ + } +#endif + + if(unlikely(base->iteration > 0 && (base->added || (base->iteration % base->rechecks) == 0))) { + int wanted_equals_expected = ((base->iteration % base->rechecks) == 0); + + // fprintf(stderr, "\n\narl_begin() rechecking, added %zu, iteration %zu, rechecks %zu, wanted_equals_expected %d\n\n\n", base->added, base->iteration, base->rechecks, wanted_equals_expected); + + base->added = 0; + base->wanted = (wanted_equals_expected)?base->expected:0; + + ARL_ENTRY *e = base->head; + while(e) { + if(e->flags & ARL_ENTRY_FLAG_FOUND) { + + // remove the found flag + e->flags &= ~ARL_ENTRY_FLAG_FOUND; + + // count it in wanted + if(!wanted_equals_expected && e->flags & ARL_ENTRY_FLAG_EXPECTED) + base->wanted++; + + } + else if(e->flags & ARL_ENTRY_FLAG_DYNAMIC && !(base->head == e && !e->next)) { // not last entry + // we can remove this entry + // it is not found, and it was created because + // it was found in the source file + + // remember the next one + ARL_ENTRY *t = e->next; + + // remove it from the list + if(e->next) e->next->prev = e->prev; + if(e->prev) e->prev->next = e->next; + if(base->head == e) base->head = e->next; + + // free it + freez(e->name); + freez(e); + + // count it + base->fred++; + + // continue + e = t; + continue; + } + + e = e->next; + } + } + + if(unlikely(!base->head)) { + // hm... no nodes at all in the list #1700 + // add a fake one to prevent a crash + // this is better than checking for the existence of nodes all the time + arl_expect(base, "a-really-not-existing-source-keyword", NULL); + } + + base->iteration++; + base->next_keyword = base->head; + base->found = 0; + +} + +// register an expected keyword to the ARL +// together with its destination ( i.e. 
the output of the processor() ) +ARL_ENTRY *arl_expect_custom(ARL_BASE *base, const char *keyword, void (*processor)(const char *name, uint32_t hash, const char *value, void *dst), void *dst) { + ARL_ENTRY *e = callocz(1, sizeof(ARL_ENTRY)); + e->name = strdupz(keyword); + e->hash = simple_hash(e->name); + e->processor = (processor)?processor:base->processor; + e->dst = dst; + e->flags = ARL_ENTRY_FLAG_EXPECTED; + e->prev = NULL; + e->next = base->head; + + if(base->head) base->head->prev = e; + else base->next_keyword = e; + + base->head = e; + base->expected++; + base->allocated++; + + base->wanted = base->expected; + + return e; +} + +int arl_find_or_create_and_relink(ARL_BASE *base, const char *s, const char *value) { + ARL_ENTRY *e; + + uint32_t hash = simple_hash(s); + + // find if it already exists in the data + for(e = base->head; e ; e = e->next) + if(e->hash == hash && !strcmp(e->name, s)) + break; + +#ifdef NETDATA_INTERNAL_CHECKS + if(unlikely(base->next_keyword && e == base->next_keyword)) + fatal("Internal Error: e == base->last"); +#endif + + if(e) { + // found it in the keywords + + base->relinkings++; + + // run the processor for it + if(unlikely(e->dst)) { + e->processor(e->name, hash, value, e->dst); + base->found++; + } + + // unlink it - we will relink it below + if(e->next) e->next->prev = e->prev; + if(e->prev) e->prev->next = e->next; + + // make sure the head is properly linked + if(base->head == e) + base->head = e->next; + } + else { + // not found + + // create it + e = callocz(1, sizeof(ARL_ENTRY)); + e->name = strdupz(s); + e->hash = hash; + e->flags = ARL_ENTRY_FLAG_DYNAMIC; + + base->allocated++; + base->added++; + } + +#ifdef NETDATA_INTERNAL_CHECKS + if(unlikely(base->iteration % 60 == 0 && e->flags & ARL_ENTRY_FLAG_FOUND)) + netdata_log_info("ARL '%s': entry '%s' is already found. 
Did you forget to call arl_begin()?", base->name, s); +#endif + + e->flags |= ARL_ENTRY_FLAG_FOUND; + + // link it here + e->next = base->next_keyword; + if(base->next_keyword) { + e->prev = base->next_keyword->prev; + base->next_keyword->prev = e; + + if(e->prev) + e->prev->next = e; + + if(base->head == base->next_keyword) + base->head = e; + } + else { + e->prev = NULL; + + if(!base->head) + base->head = e; + } + + // prepare the next iteration + base->next_keyword = e->next; + if(unlikely(!base->next_keyword)) + base->next_keyword = base->head; + + if(unlikely(base->found == base->wanted)) { + // fprintf(stderr, "FOUND ALL WANTED 1: found = %zu, wanted = %zu, expected %zu\n", base->found, base->wanted, base->expected); + return 1; + } + + return 0; +} diff --git a/src/libnetdata/adaptive_resortable_list/adaptive_resortable_list.h b/src/libnetdata/adaptive_resortable_list/adaptive_resortable_list.h new file mode 100644 index 00000000..bca0ff27 --- /dev/null +++ b/src/libnetdata/adaptive_resortable_list/adaptive_resortable_list.h @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +#ifndef NETDATA_ADAPTIVE_RESORTABLE_LIST_H +#define NETDATA_ADAPTIVE_RESORTABLE_LIST_H 1 + +#define ARL_ENTRY_FLAG_FOUND 0x01 // the entry has been found in the source data +#define ARL_ENTRY_FLAG_EXPECTED 0x02 // the entry is expected by the program +#define ARL_ENTRY_FLAG_DYNAMIC 0x04 // the entry was dynamically allocated, from source data + +typedef struct arl_entry { + char *name; // the keywords + uint32_t hash; // the hash of the keyword + + void *dst; // the dst to pass to the processor + + uint8_t flags; // ARL_ENTRY_FLAG_* + + // the processor to do the job + void (*processor)(const char *name, uint32_t hash, const char *value, void *dst); + + // double linked list for fast re-linkings + struct arl_entry *prev, *next; +} ARL_ENTRY; + +typedef struct arl_base { + char *name; + + size_t iteration; // incremented on each iteration (arl_begin()) + size_t found; // the number of expected keywords found in this iteration + size_t expected; // the number of expected keywords + size_t wanted; // the number of wanted keywords + // i.e. the number of keywords found and expected + + size_t relinkings; // the number of relinkings we have made so far + + size_t allocated; // the number of keywords allocated + size_t fred; // the number of keywords cleaned up + + size_t rechecks; // the number of iterations between re-checks of the + // wanted number of keywords + // this is only needed in cases where the source + // is having less lines over time. + + size_t added; // it is non-zero if new keywords have been added + // this is only needed to detect new lines have + // been added to the file, over time. + +#ifdef NETDATA_INTERNAL_CHECKS + size_t fast; // the number of times we have taken the fast path + size_t slow; // the number of times we have taken the slow path +#endif + + // the processor to do the job + void (*processor)(const char *name, uint32_t hash, const char *value, void *dst); + + // the linked list of the keywords + ARL_ENTRY *head; + + // since we keep the list of keywords sorted (as found in the source data) + // this is next keyword that we expect to find in the source data. 
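+    // (this is the fast-path cursor: arl_check() compares each incoming
+    // keyword against this entry first, before falling back to a full search)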
+    ARL_ENTRY *next_keyword;
+} ARL_BASE;
+
+// create a new ARL
+ARL_BASE *arl_create(const char *name, void (*processor)(const char *, uint32_t, const char *, void *), size_t rechecks);
+
+// free an ARL
+void arl_free(ARL_BASE *arl_base);
+
+// register an expected keyword to the ARL
+// together with its destination (i.e. the output of the processor())
+ARL_ENTRY *arl_expect_custom(ARL_BASE *base, const char *keyword, void (*processor)(const char *name, uint32_t hash, const char *value, void *dst), void *dst);
+#define arl_expect(base, keyword, dst) arl_expect_custom(base, keyword, NULL, dst)
+
+// an internal call to complete the check() call
+int arl_find_or_create_and_relink(ARL_BASE *base, const char *s, const char *value);
+
+// begin an ARL iteration
+void arl_begin(ARL_BASE *base);
+
+void arl_callback_str2ull(const char *name, uint32_t hash, const char *value, void *dst);
+void arl_callback_str2kernel_uint_t(const char *name, uint32_t hash, const char *value, void *dst);
+void arl_callback_ssize_t(const char *name, uint32_t hash, const char *value, void *dst);
+
+// check a keyword against the ARL
+// this is to be called for each keyword read from the source data
+// keyword = the keyword, as collected
+// value   = the value to be passed to the processor
+// it is defined in the header file in order to be inlined
+static inline int arl_check(ARL_BASE *base, const char *keyword, const char *value) {
+    ARL_ENTRY *e = base->next_keyword;
+
+#ifdef NETDATA_INTERNAL_CHECKS
+    if(unlikely((base->fast + base->slow) % (base->expected + base->allocated) == 0 && (base->fast + base->slow) > (base->expected + base->allocated) * base->iteration))
+        netdata_log_info("ARL '%s': Did you forget to call arl_begin()?", base->name);
+#endif
+
+    // it should be the first entry (pointed by base->next_keyword)
+    if(likely(!strcmp(keyword, e->name))) {
+        // it is
+
+#ifdef NETDATA_INTERNAL_CHECKS
+        base->fast++;
+#endif
+
+        e->flags |= ARL_ENTRY_FLAG_FOUND;
+
+        // execute the processor
+        if(unlikely(e->dst)) {
+            e->processor(e->name, e->hash, value, e->dst);
+            base->found++;
+        }
+
+        // be prepared for the next iteration
+        base->next_keyword = e->next;
+        if(unlikely(!base->next_keyword))
+            base->next_keyword = base->head;
+
+        // stop if we collected all the values for this iteration
+        if(unlikely(base->found == base->wanted)) {
+            // fprintf(stderr, "FOUND ALL WANTED 2: found = %zu, wanted = %zu, expected %zu\n", base->found, base->wanted, base->expected);
+            return 1;
+        }
+
+        return 0;
+    }
+
+#ifdef NETDATA_INTERNAL_CHECKS
+    base->slow++;
+#endif
+
+    // we read a not-expected keyword from the source
+    return arl_find_or_create_and_relink(base, keyword, value);
+}
+
+#endif //NETDATA_ADAPTIVE_RESORTABLE_LIST_H
diff --git a/src/libnetdata/aral/README.md b/src/libnetdata/aral/README.md
new file mode 100644
index 00000000..d999e820
--- /dev/null
+++ b/src/libnetdata/aral/README.md
@@ -0,0 +1,173 @@
+<!--
+title: "Array Allocator"
+custom_edit_url: https://github.com/netdata/netdata/edit/master/src/libnetdata/aral/README.md
+sidebar_label: "Array allocator"
+learn_status: "Published"
+learn_topic_type: "Tasks"
+learn_rel_path: "Developers/libnetdata"
+-->
+
+# Array Allocator
+
+Come on! Array allocators are embedded in libc! Why do we need such a thing in Netdata?
+
+Well, we have a couple of problems to solve:
+
+1. **Fragmentation** - It is important for Netdata to keep its overall memory footprint as low as possible. libc does an amazing job when the same thread allocates and frees some memory.
But it simply cannot do better without knowing the specifics of the application, when memory is allocated and freed randomly between threads.
+2. **Speed** - Especially when allocations and de-allocations happen across threads, the speed penalty is tremendous.
+
+In Netdata we have a few moments that are very tough. Imagine collecting 1 million metrics per second. You have a buffer for each metric and you append new points to it. This works beautifully, of course! But then, when the buffers get full, imagine the situation: you suddenly need 1 million new buffers, at once!
+
+To solve this problem we first spread out the buffers. So, the first time each metric asks for a buffer, it gets a smaller one. We added logic there to spread them as evenly as possible across time. Solved? Not exactly!
+
+We have 3 tiers for each metric. For the metrics of tier 0 (per-second resolution) we have a maximum buffer of 1024 points, and every new metric gets a random size between 3 points and 1024. So the buffers are distributed across time. For 1 million metrics, about 1000 buffers are created every second.
+
+But at some point the end of the minute will come, and suddenly all the metrics will need a new buffer for tier 1 (per minute). Oops! We spread tier 1 buffers across time too, but the first minute is a tough one. We really need 1 million buffers instantly.
+
+And if that minute happens to also be the beginning of an hour... tier 2 (per hour) kicks in. For that instant we are going to need 2 million buffers instantly.
+
+The problem becomes even bigger when we collect 2, or even 10, million metrics...
+
+To solve it, Netdata uses a special implementation of an array allocator that is tightly integrated with the structures we need.
+
+## Features
+
+1. Malloc or MMAP modes. File-based MMAP is also supported, to put the data in file-backed shared memory.
+2. Fully asynchronous operations. There are just a couple of points where spin-locks protect a few counters and pointers.
+3. An optional defragmenter that, once enabled, makes free operations slower while trying to maintain a sorted list of fragments to offer first during allocations. The defragmenter can be enabled / disabled at run time. It can hurt performance on applications with an intense turnaround of allocations, like the Netdata dbengine caches, so it is disabled by default.
+4. Without the defragmenter enabled, ARAL still tries to keep pages full, but the depth of the search is limited to 3 pages (so, a page with a free slot will become either 1st, 2nd or 3rd). At the same time, during allocations, ARAL evaluates the first 2 pages to find the one that is more full than the other, and uses it for the new allocation.
+
+## How it works
+
+Allocations are organized in pages. Pages have a minimum size (a system page, usually 4KB) and a maximum defined for each different kind of object.
+
+Initially every page is free. When an allocation request is made, the free space is split: the first element is reserved and the rest is considered free space.
+
+This continues until the page gets full, at which point a new page is allocated and the process is repeated.
+
+Each allocation returned has a pointer appended to it. The pointer points to the page the allocation belongs to.
+
+When a pointer is freed, the page it belongs to is identified, its space is marked free, and it is prepended to a singly-linked list that resides in the page itself. So, each page has its own list of free slots to use.
+
+Pages are then kept on another linked list.
This is a doubly-linked list that has the pages with free space at its beginning and the full pages at its end.
+
+When the defragmenter is enabled, the pages list is also kept sorted, like this: the fewer the free slots on a page, the earlier in the list the page will be - except if it does not have any free slot, in which case it will be at the end. So, the defragmenter tries to keep pages full.
+
+When a page is entirely free, it is given back to the system immediately. There is no caching of free pages.
+
+Parallelism is achieved like this:
+
+When some threads are waiting for a page to be allocated, free operations are still allowed. If a free operation happens before a new page is allocated, any waiting thread will get the slot that was freed on another page.
+
+Free operations happen in parallel, even for the same page. There is a spin-lock on each page to protect the base pointer of the page's singly-linked list of free slots. But this is instant: all preparative work happens lockless; then, to add the free slot to the page, the page spinlock is acquired, the free slot is prepended to the page's list, and the spinlock is released. Such free operations on different pages are totally parallel.
+
+Once the free operation on a page has finished, the spinlock of the pages list is acquired to put the page first on that list. If the defragmenter is enabled, the spinlock is retained a little longer, to find the exact position of the page in the list.
+
+During allocations, the reverse order is used: first acquire the pages list spinlock, get the first page and decrement its free slots counter, then release the spinlock. If the first page does not have any free slots, a page allocation is spawned, without any locks acquired. All threads spin waiting for a page with free slots, either from the newly allocated one or from a free operation that may happen in parallel.
+
+Once a page is acquired, each thread locks its own page to get the first free slot and releases the lock immediately. This is guaranteed to succeed, because when the page was given to that thread its free slots counter was decremented; so there is a free slot for every thread that got that page. All preparative work to return a pointer to the caller is done lock-free. Allocations on different pages proceed in parallel, without any interference between them.
+
+## What to expect
+
+Systems not designed for parallelism achieve their top performance single-threaded. The single-threaded speed is the baseline; adding more threads makes them slower.
+
+The baseline for ARAL is the included stress test running single-threaded:
+
+```
+Running stress test of 1 threads, with 10000 elements each, for 5 seconds...
+2023-01-29 17:04:50: netdata INFO : TH[0] : set name of thread 1314983 to TH[0]
+ARAL executes 12.27 M malloc and 12.26 M free operations/s
+ARAL executes 12.29 M malloc and 12.29 M free operations/s
+ARAL executes 12.30 M malloc and 12.30 M free operations/s
+ARAL executes 12.30 M malloc and 12.29 M free operations/s
+ARAL executes 12.29 M malloc and 12.29 M free operations/s
+Waiting the threads to finish...
+2023-01-29 17:04:55: netdata INFO : MAIN : ARAL: did 61487356 malloc, 61487356 free, using 1 threads, in 5003808 usecs
+```
+
+The same test with 2 threads, both on the same ARAL of course. As you see, performance improved:
+
+```
+Running stress test of 2 threads, with 10000 elements each, for 5 seconds...
+2023-01-29 17:05:25: netdata INFO : TH[0] : set name of thread 1315537 to TH[0] +2023-01-29 17:05:25: netdata INFO : TH[1] : set name of thread 1315538 to TH[1] +ARAL executes 17.75 M malloc and 17.73 M free operations/s +ARAL executes 17.93 M malloc and 17.93 M free operations/s +ARAL executes 18.17 M malloc and 18.18 M free operations/s +ARAL executes 18.33 M malloc and 18.32 M free operations/s +ARAL executes 18.36 M malloc and 18.36 M free operations/s +Waiting the threads to finish... +2023-01-29 17:05:30: netdata INFO : MAIN : ARAL: did 90976190 malloc, 90976190 free, using 2 threads, in 5029462 usecs +``` + +The same test with 4 threads: + +``` +Running stress test of 4 threads, with 10000 elements each, for 5 seconds... +2023-01-29 17:10:12: netdata INFO : TH[0] : set name of thread 1319552 to TH[0] +2023-01-29 17:10:12: netdata INFO : TH[1] : set name of thread 1319553 to TH[1] +2023-01-29 17:10:12: netdata INFO : TH[2] : set name of thread 1319554 to TH[2] +2023-01-29 17:10:12: netdata INFO : TH[3] : set name of thread 1319555 to TH[3] +ARAL executes 19.95 M malloc and 19.91 M free operations/s +ARAL executes 20.08 M malloc and 20.08 M free operations/s +ARAL executes 20.85 M malloc and 20.85 M free operations/s +ARAL executes 20.84 M malloc and 20.84 M free operations/s +ARAL executes 21.37 M malloc and 21.37 M free operations/s +Waiting the threads to finish... +2023-01-29 17:10:17: netdata INFO : MAIN : ARAL: did 103549747 malloc, 103549747 free, using 4 threads, in 5023325 usecs +``` + +The same with 8 threads: + +``` +Running stress test of 8 threads, with 10000 elements each, for 5 seconds... +2023-01-29 17:07:06: netdata INFO : TH[0] : set name of thread 1317608 to TH[0] +2023-01-29 17:07:06: netdata INFO : TH[1] : set name of thread 1317609 to TH[1] +2023-01-29 17:07:06: netdata INFO : TH[2] : set name of thread 1317610 to TH[2] +2023-01-29 17:07:06: netdata INFO : TH[3] : set name of thread 1317611 to TH[3] +2023-01-29 17:07:06: netdata INFO : TH[4] : set name of thread 1317612 to TH[4] +2023-01-29 17:07:06: netdata INFO : TH[5] : set name of thread 1317613 to TH[5] +2023-01-29 17:07:06: netdata INFO : TH[6] : set name of thread 1317614 to TH[6] +2023-01-29 17:07:06: netdata INFO : TH[7] : set name of thread 1317615 to TH[7] +ARAL executes 15.73 M malloc and 15.66 M free operations/s +ARAL executes 13.95 M malloc and 13.94 M free operations/s +ARAL executes 15.59 M malloc and 15.58 M free operations/s +ARAL executes 15.49 M malloc and 15.49 M free operations/s +ARAL executes 16.16 M malloc and 16.16 M free operations/s +Waiting the threads to finish... +2023-01-29 17:07:11: netdata INFO : MAIN : ARAL: did 78427750 malloc, 78427750 free, using 8 threads, in 5088591 usecs +``` + +The same with 16 threads: + +``` +Running stress test of 16 threads, with 10000 elements each, for 5 seconds... 
+2023-01-29 17:08:04: netdata INFO : TH[0] : set name of thread 1318663 to TH[0] +2023-01-29 17:08:04: netdata INFO : TH[1] : set name of thread 1318664 to TH[1] +2023-01-29 17:08:04: netdata INFO : TH[2] : set name of thread 1318665 to TH[2] +2023-01-29 17:08:04: netdata INFO : TH[3] : set name of thread 1318666 to TH[3] +2023-01-29 17:08:04: netdata INFO : TH[4] : set name of thread 1318667 to TH[4] +2023-01-29 17:08:04: netdata INFO : TH[5] : set name of thread 1318668 to TH[5] +2023-01-29 17:08:04: netdata INFO : TH[6] : set name of thread 1318669 to TH[6] +2023-01-29 17:08:04: netdata INFO : TH[7] : set name of thread 1318670 to TH[7] +2023-01-29 17:08:04: netdata INFO : TH[8] : set name of thread 1318671 to TH[8] +2023-01-29 17:08:04: netdata INFO : TH[9] : set name of thread 1318672 to TH[9] +2023-01-29 17:08:04: netdata INFO : TH[10] : set name of thread 1318673 to TH[10] +2023-01-29 17:08:04: netdata INFO : TH[11] : set name of thread 1318674 to TH[11] +2023-01-29 17:08:04: netdata INFO : TH[12] : set name of thread 1318675 to TH[12] +2023-01-29 17:08:04: netdata INFO : TH[13] : set name of thread 1318676 to TH[13] +2023-01-29 17:08:04: netdata INFO : TH[14] : set name of thread 1318677 to TH[14] +2023-01-29 17:08:04: netdata INFO : TH[15] : set name of thread 1318678 to TH[15] +ARAL executes 11.77 M malloc and 11.62 M free operations/s +ARAL executes 12.80 M malloc and 12.81 M free operations/s +ARAL executes 13.26 M malloc and 13.25 M free operations/s +ARAL executes 13.30 M malloc and 13.29 M free operations/s +ARAL executes 13.23 M malloc and 13.25 M free operations/s +Waiting the threads to finish... +2023-01-29 17:08:09: netdata INFO : MAIN : ARAL: did 65302122 malloc, 65302122 free, using 16 threads, in 5066009 usecs +``` + +As you can see, the top performance is with 4 threads, almost double the single thread speed. +16 threads performance is still better than single threaded, despite the intense concurrency. 
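+
+For reference, this is roughly how a caller drives the allocator - a minimal
+sketch based on the API used by the stress test above; `struct my_element`
+and the sizes chosen here are hypothetical:
+
+```c
+struct my_element { uint64_t a, b; };             // hypothetical payload
+
+ARAL *ar = aral_create("my-elements",             // name, for logs and stats
+                       sizeof(struct my_element), // requested element size
+                       0,                         // initial page elements (0 = default)
+                       65536,                     // max page size, in bytes
+                       NULL,                      // stats (NULL = private stats)
+                       NULL, NULL,                // mmap filename / cache dir (unused)
+                       false,                     // mmap disabled - use malloc
+                       false);                    // lockless disabled - thread safe
+
+struct my_element *e = aral_mallocz(ar);          // grab a slot from a page
+aral_freez(ar, e);                                // return the slot to its page
+aral_destroy(ar);                                 // release all pages
+```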
diff --git a/src/libnetdata/aral/aral.c b/src/libnetdata/aral/aral.c new file mode 100644 index 00000000..64b63d8e --- /dev/null +++ b/src/libnetdata/aral/aral.c @@ -0,0 +1,1098 @@ +#include "../libnetdata.h" +#include "aral.h" + +#ifdef NETDATA_TRACE_ALLOCATIONS +#define TRACE_ALLOCATIONS_FUNCTION_DEFINITION_PARAMS , const char *file, const char *function, size_t line +#define TRACE_ALLOCATIONS_FUNCTION_CALL_PARAMS , file, function, line +#else +#define TRACE_ALLOCATIONS_FUNCTION_DEFINITION_PARAMS +#define TRACE_ALLOCATIONS_FUNCTION_CALL_PARAMS +#endif + +#define ARAL_FREE_PAGES_DELTA_TO_REARRANGE_LIST 5 + +// max file size +#define ARAL_MAX_PAGE_SIZE_MMAP (1*1024*1024*1024) + +// max malloc size +// optimal at current versions of libc is up to 256k +// ideal to have the same overhead as libc is 4k +#define ARAL_MAX_PAGE_SIZE_MALLOC (65*1024) + +typedef struct aral_free { + size_t size; + struct aral_free *next; +} ARAL_FREE; + +typedef struct aral_page { + size_t size; // the allocation size of the page + const char *filename; + uint8_t *data; + + uint32_t free_elements_to_move_first; + uint32_t max_elements; // the number of elements that can fit on this page + + struct { + uint32_t used_elements; // the number of used elements on this page + uint32_t free_elements; // the number of free elements on this page + + struct aral_page *prev; // the prev page on the list + struct aral_page *next; // the next page on the list + } aral_lock; + + struct { + SPINLOCK spinlock; + ARAL_FREE *list; + } free; + +} ARAL_PAGE; + +typedef enum { + ARAL_LOCKLESS = (1 << 0), + ARAL_DEFRAGMENT = (1 << 1), + ARAL_ALLOCATED_STATS = (1 << 2), +} ARAL_OPTIONS; + +struct aral { + struct { + char name[ARAL_MAX_NAME + 1]; + + ARAL_OPTIONS options; + + size_t element_size; // calculated to take into account ARAL overheads + size_t max_allocation_size; // calculated in bytes + size_t max_page_elements; // calculated + size_t page_ptr_offset; // calculated + size_t natural_page_size; // calculated + + size_t initial_page_elements; + size_t requested_element_size; + size_t requested_max_page_size; + + struct { + bool enabled; + const char *filename; + char **cache_dir; + } mmap; + } config; + + struct { + SPINLOCK spinlock; + size_t file_number; // for mmap + struct aral_page *pages; // linked list of pages + + size_t user_malloc_operations; + size_t user_free_operations; + size_t defragment_operations; + size_t defragment_linked_list_traversals; + } aral_lock; + + struct { + SPINLOCK spinlock; + size_t allocating_elements; // currently allocating elements + size_t allocation_size; // current / next allocation size + } adders; + + struct { + size_t allocators; // the number of threads currently trying to allocate memory + } atomic; + + struct aral_statistics *stats; +}; + +size_t aral_structures_from_stats(struct aral_statistics *stats) { + return __atomic_load_n(&stats->structures.allocated_bytes, __ATOMIC_RELAXED); +} + +size_t aral_overhead_from_stats(struct aral_statistics *stats) { + return __atomic_load_n(&stats->malloc.allocated_bytes, __ATOMIC_RELAXED) - + __atomic_load_n(&stats->malloc.used_bytes, __ATOMIC_RELAXED); +} + +size_t aral_overhead(ARAL *ar) { + return aral_overhead_from_stats(ar->stats); +} + +size_t aral_structures(ARAL *ar) { + return aral_structures_from_stats(ar->stats); +} + +struct aral_statistics *aral_statistics(ARAL *ar) { + return ar->stats; +} + +#define ARAL_NATURAL_ALIGNMENT (sizeof(uintptr_t) * 2) +static inline size_t natural_alignment(size_t size, size_t alignment) { + 
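+    // round size up to the next multiple of alignment
+    // (left unchanged when it is already aligned)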
if(unlikely(size % alignment)) + size = size + alignment - (size % alignment); + + return size; +} + +static size_t aral_align_alloc_size(ARAL *ar, uint64_t size) { + if(size % ar->config.natural_page_size) + size += ar->config.natural_page_size - (size % ar->config.natural_page_size) ; + + if(size % ar->config.element_size) + size -= size % ar->config.element_size; + + return size; +} + +static inline void aral_lock(ARAL *ar) { + if(likely(!(ar->config.options & ARAL_LOCKLESS))) + spinlock_lock(&ar->aral_lock.spinlock); +} + +static inline void aral_unlock(ARAL *ar) { + if(likely(!(ar->config.options & ARAL_LOCKLESS))) + spinlock_unlock(&ar->aral_lock.spinlock); +} + +static inline void aral_page_free_lock(ARAL *ar, ARAL_PAGE *page) { + if(likely(!(ar->config.options & ARAL_LOCKLESS))) + spinlock_lock(&page->free.spinlock); +} + +static inline void aral_page_free_unlock(ARAL *ar, ARAL_PAGE *page) { + if(likely(!(ar->config.options & ARAL_LOCKLESS))) + spinlock_unlock(&page->free.spinlock); +} + +static inline bool aral_adders_trylock(ARAL *ar) { + if(likely(!(ar->config.options & ARAL_LOCKLESS))) + return spinlock_trylock(&ar->adders.spinlock); + + return true; +} + +static inline void aral_adders_lock(ARAL *ar) { + if(likely(!(ar->config.options & ARAL_LOCKLESS))) + spinlock_lock(&ar->adders.spinlock); +} + +static inline void aral_adders_unlock(ARAL *ar) { + if(likely(!(ar->config.options & ARAL_LOCKLESS))) + spinlock_unlock(&ar->adders.spinlock); +} + +static void aral_delete_leftover_files(const char *name, const char *path, const char *required_prefix) { + DIR *dir = opendir(path); + if(!dir) return; + + char full_path[FILENAME_MAX + 1]; + size_t len = strlen(required_prefix); + + struct dirent *de = NULL; + while((de = readdir(dir))) { + if(de->d_type == DT_DIR) + continue; + + if(strncmp(de->d_name, required_prefix, len) != 0) + continue; + + snprintfz(full_path, FILENAME_MAX, "%s/%s", path, de->d_name); + netdata_log_info("ARAL: '%s' removing left-over file '%s'", name, full_path); + if(unlikely(unlink(full_path) == -1)) + netdata_log_error("ARAL: '%s' cannot delete file '%s'", name, full_path); + } + + closedir(dir); +} + +// ---------------------------------------------------------------------------- +// check a free slot + +#ifdef NETDATA_INTERNAL_CHECKS +static inline void aral_free_validate_internal_check(ARAL *ar, ARAL_FREE *fr) { + if(unlikely(fr->size < ar->config.element_size)) + fatal("ARAL: '%s' free item of size %zu, less than the expected element size %zu", + ar->config.name, fr->size, ar->config.element_size); + + if(unlikely(fr->size % ar->config.element_size)) + fatal("ARAL: '%s' free item of size %zu is not multiple to element size %zu", + ar->config.name, fr->size, ar->config.element_size); +} +#else +#define aral_free_validate_internal_check(ar, fr) debug_dummy() +#endif + +// ---------------------------------------------------------------------------- +// find the page a pointer belongs to + +#ifdef NETDATA_INTERNAL_CHECKS +static inline ARAL_PAGE *find_page_with_allocation_internal_check(ARAL *ar, void *ptr) { + aral_lock(ar); + + uintptr_t seeking = (uintptr_t)ptr; + ARAL_PAGE *page; + + for(page = ar->aral_lock.pages; page ; page = page->aral_lock.next) { + if(unlikely(seeking >= (uintptr_t)page->data && seeking < (uintptr_t)page->data + page->size)) + break; + } + + aral_unlock(ar); + + return page; +} +#endif + +// ---------------------------------------------------------------------------- +// find a page with a free slot (there shouldn't be any) + 
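+// (debug-only helper: the page at the head of the list is the one expected
+// to have free slots, so finding one elsewhere indicates a bookkeeping bug)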
+#ifdef NETDATA_ARAL_INTERNAL_CHECKS +static inline ARAL_PAGE *find_page_with_free_slots_internal_check___with_aral_lock(ARAL *ar) { + ARAL_PAGE *page; + + for(page = ar->aral_lock.pages; page ; page = page->next) { + if(page->aral_lock.free_elements) + break; + + internal_fatal(page->size - page->aral_lock.used_elements * ar->config.element_size >= ar->config.element_size, + "ARAL: '%s' a page is marked full, but it is not!", ar->config.name); + + internal_fatal(page->size < page->aral_lock.used_elements * ar->config.element_size, + "ARAL: '%s' a page has been overflown!", ar->config.name); + } + + return page; +} +#endif + +size_t aral_next_allocation_size___adders_lock_needed(ARAL *ar) { + size_t size = ar->adders.allocation_size; + + if(size > ar->config.max_allocation_size) + size = ar->config.max_allocation_size; + else + ar->adders.allocation_size = aral_align_alloc_size(ar, (uint64_t)ar->adders.allocation_size * 2); + + return size; +} + +static ARAL_PAGE *aral_create_page___no_lock_needed(ARAL *ar, size_t size TRACE_ALLOCATIONS_FUNCTION_DEFINITION_PARAMS) { + ARAL_PAGE *page = callocz(1, sizeof(ARAL_PAGE)); + spinlock_init(&page->free.spinlock); + page->size = size; + page->max_elements = page->size / ar->config.element_size; + page->aral_lock.free_elements = page->max_elements; + page->free_elements_to_move_first = page->max_elements / 4; + if(unlikely(page->free_elements_to_move_first < 1)) + page->free_elements_to_move_first = 1; + + __atomic_add_fetch(&ar->stats->structures.allocations, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&ar->stats->structures.allocated_bytes, sizeof(ARAL_PAGE), __ATOMIC_RELAXED); + + if(unlikely(ar->config.mmap.enabled)) { + ar->aral_lock.file_number++; + char filename[FILENAME_MAX + 1]; + snprintfz(filename, FILENAME_MAX, "%s/array_alloc.mmap/%s.%zu", *ar->config.mmap.cache_dir, ar->config.mmap.filename, ar->aral_lock.file_number); + page->filename = strdupz(filename); + page->data = netdata_mmap(page->filename, page->size, MAP_SHARED, 0, false, NULL); + if (unlikely(!page->data)) + fatal("ARAL: '%s' cannot allocate aral buffer of size %zu on filename '%s'", + ar->config.name, page->size, page->filename); + __atomic_add_fetch(&ar->stats->mmap.allocations, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&ar->stats->mmap.allocated_bytes, page->size, __ATOMIC_RELAXED); + } + else { +#ifdef NETDATA_TRACE_ALLOCATIONS + page->data = mallocz_int(page->size TRACE_ALLOCATIONS_FUNCTION_CALL_PARAMS); +#else + page->data = mallocz(page->size); +#endif + __atomic_add_fetch(&ar->stats->malloc.allocations, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&ar->stats->malloc.allocated_bytes, page->size, __ATOMIC_RELAXED); + } + + // link the free space to its page + ARAL_FREE *fr = (ARAL_FREE *)page->data; + fr->size = page->size; + fr->next = NULL; + page->free.list = fr; + + aral_free_validate_internal_check(ar, fr); + + return page; +} + +void aral_del_page___no_lock_needed(ARAL *ar, ARAL_PAGE *page TRACE_ALLOCATIONS_FUNCTION_DEFINITION_PARAMS) { + + // free it + if (ar->config.mmap.enabled) { + netdata_munmap(page->data, page->size); + + if (unlikely(unlink(page->filename) == 1)) + netdata_log_error("Cannot delete file '%s'", page->filename); + + freez((void *)page->filename); + + __atomic_sub_fetch(&ar->stats->mmap.allocations, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&ar->stats->mmap.allocated_bytes, page->size, __ATOMIC_RELAXED); + } + else { +#ifdef NETDATA_TRACE_ALLOCATIONS + freez_int(page->data TRACE_ALLOCATIONS_FUNCTION_CALL_PARAMS); +#else + freez(page->data); 
+#endif + __atomic_sub_fetch(&ar->stats->malloc.allocations, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&ar->stats->malloc.allocated_bytes, page->size, __ATOMIC_RELAXED); + } + + freez(page); + + __atomic_sub_fetch(&ar->stats->structures.allocations, 1, __ATOMIC_RELAXED); + __atomic_sub_fetch(&ar->stats->structures.allocated_bytes, sizeof(ARAL_PAGE), __ATOMIC_RELAXED); +} + +static inline void aral_insert_not_linked_page_with_free_items_to_proper_position___aral_lock_needed(ARAL *ar, ARAL_PAGE *page) { + ARAL_PAGE *first = ar->aral_lock.pages; + + if (page->aral_lock.free_elements <= page->free_elements_to_move_first || + !first || + !first->aral_lock.free_elements || + page->aral_lock.free_elements <= first->aral_lock.free_elements + ARAL_FREE_PAGES_DELTA_TO_REARRANGE_LIST) { + // first position + DOUBLE_LINKED_LIST_PREPEND_ITEM_UNSAFE(ar->aral_lock.pages, page, aral_lock.prev, aral_lock.next); + } + else { + ARAL_PAGE *second = first->aral_lock.next; + + if (!second || + !second->aral_lock.free_elements || + page->aral_lock.free_elements <= second->aral_lock.free_elements) + // second position + DOUBLE_LINKED_LIST_INSERT_ITEM_AFTER_UNSAFE(ar->aral_lock.pages, first, page, aral_lock.prev, aral_lock.next); + else + // third position + DOUBLE_LINKED_LIST_INSERT_ITEM_AFTER_UNSAFE(ar->aral_lock.pages, second, page, aral_lock.prev, aral_lock.next); + } +} + +static inline ARAL_PAGE *aral_acquire_a_free_slot(ARAL *ar TRACE_ALLOCATIONS_FUNCTION_DEFINITION_PARAMS) { + __atomic_add_fetch(&ar->atomic.allocators, 1, __ATOMIC_RELAXED); + aral_lock(ar); + + ARAL_PAGE *page = ar->aral_lock.pages; + + while(!page || !page->aral_lock.free_elements) { +#ifdef NETDATA_ARAL_INTERNAL_CHECKS + internal_fatal(find_page_with_free_slots_internal_check___with_aral_lock(ar), "ARAL: '%s' found page with free slot!", ar->config.name); +#endif + aral_unlock(ar); + + if(aral_adders_trylock(ar)) { + if(ar->adders.allocating_elements < __atomic_load_n(&ar->atomic.allocators, __ATOMIC_RELAXED)) { + + size_t size = aral_next_allocation_size___adders_lock_needed(ar); + ar->adders.allocating_elements += size / ar->config.element_size; + aral_adders_unlock(ar); + + page = aral_create_page___no_lock_needed(ar, size TRACE_ALLOCATIONS_FUNCTION_CALL_PARAMS); + + aral_lock(ar); + aral_insert_not_linked_page_with_free_items_to_proper_position___aral_lock_needed(ar, page); + + aral_adders_lock(ar); + ar->adders.allocating_elements -= size / ar->config.element_size; + aral_adders_unlock(ar); + + // we have a page that is all empty + // and only aral_lock() is held, so + // break the loop + break; + } + + aral_adders_unlock(ar); + } + + aral_lock(ar); + page = ar->aral_lock.pages; + } + + __atomic_sub_fetch(&ar->atomic.allocators, 1, __ATOMIC_RELAXED); + + // we have a page + // and aral locked + + { + ARAL_PAGE *first = ar->aral_lock.pages; + ARAL_PAGE *second = first->aral_lock.next; + + if (!second || + !second->aral_lock.free_elements || + first->aral_lock.free_elements <= second->aral_lock.free_elements + ARAL_FREE_PAGES_DELTA_TO_REARRANGE_LIST) + page = first; + else { + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(ar->aral_lock.pages, second, aral_lock.prev, aral_lock.next); + DOUBLE_LINKED_LIST_PREPEND_ITEM_UNSAFE(ar->aral_lock.pages, second, aral_lock.prev, aral_lock.next); + page = second; + } + } + + internal_fatal(!page || !page->aral_lock.free_elements, + "ARAL: '%s' selected page does not have a free slot in it", + ar->config.name); + + internal_fatal(page->max_elements != page->aral_lock.used_elements + 
page->aral_lock.free_elements, + "ARAL: '%s' page element counters do not match, " + "page says it can handle %zu elements, " + "but there are %zu used and %zu free items, " + "total %zu items", + ar->config.name, + (size_t)page->max_elements, + (size_t)page->aral_lock.used_elements, (size_t)page->aral_lock.free_elements, + (size_t)page->aral_lock.used_elements + (size_t)page->aral_lock.free_elements + ); + + ar->aral_lock.user_malloc_operations++; + + // acquire a slot for the caller + page->aral_lock.used_elements++; + if(--page->aral_lock.free_elements == 0) { + // we are done with this page + // move the full page last + // so that pages with free items remain first in the list + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(ar->aral_lock.pages, page, aral_lock.prev, aral_lock.next); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(ar->aral_lock.pages, page, aral_lock.prev, aral_lock.next); + } + + aral_unlock(ar); + + return page; +} + +void *aral_callocz_internal(ARAL *ar TRACE_ALLOCATIONS_FUNCTION_DEFINITION_PARAMS) { + void *r = aral_mallocz_internal(ar TRACE_ALLOCATIONS_FUNCTION_CALL_PARAMS); + memset(r, 0, ar->config.requested_element_size); + return r; +} + +void *aral_mallocz_internal(ARAL *ar TRACE_ALLOCATIONS_FUNCTION_DEFINITION_PARAMS) { +#ifdef FSANITIZE_ADDRESS + return mallocz(ar->config.requested_element_size); +#endif + + ARAL_PAGE *page = aral_acquire_a_free_slot(ar TRACE_ALLOCATIONS_FUNCTION_CALL_PARAMS); + + aral_page_free_lock(ar, page); + + internal_fatal(!page->free.list, + "ARAL: '%s' free item to use, cannot be NULL.", ar->config.name); + + internal_fatal(page->free.list->size < ar->config.element_size, + "ARAL: '%s' free item size %zu, cannot be smaller than %zu", + ar->config.name, page->free.list->size, ar->config.element_size); + + ARAL_FREE *found_fr = page->free.list; + + // check if the remaining size (after we use this slot) is not enough for another element + if(unlikely(found_fr->size - ar->config.element_size < ar->config.element_size)) { + // we can use the entire free space entry + + page->free.list = found_fr->next; + } + else { + // we can split the free space entry + + uint8_t *data = (uint8_t *)found_fr; + ARAL_FREE *fr = (ARAL_FREE *)&data[ar->config.element_size]; + fr->size = found_fr->size - ar->config.element_size; + + // link the free slot first in the page + fr->next = found_fr->next; + page->free.list = fr; + + aral_free_validate_internal_check(ar, fr); + } + + aral_page_free_unlock(ar, page); + + // put the page pointer after the element + uint8_t *data = (uint8_t *)found_fr; + ARAL_PAGE **page_ptr = (ARAL_PAGE **)&data[ar->config.page_ptr_offset]; + *page_ptr = page; + + if(unlikely(ar->config.mmap.enabled)) + __atomic_add_fetch(&ar->stats->mmap.used_bytes, ar->config.element_size, __ATOMIC_RELAXED); + else + __atomic_add_fetch(&ar->stats->malloc.used_bytes, ar->config.element_size, __ATOMIC_RELAXED); + + return (void *)found_fr; +} + +static inline ARAL_PAGE *aral_ptr_to_page___must_NOT_have_aral_lock(ARAL *ar, void *ptr) { + // given a data pointer we returned before, + // find the ARAL_PAGE it belongs to + + uint8_t *data = (uint8_t *)ptr; + ARAL_PAGE **page_ptr = (ARAL_PAGE **)&data[ar->config.page_ptr_offset]; + ARAL_PAGE *page = *page_ptr; + +#ifdef NETDATA_INTERNAL_CHECKS + // make it NULL so that we will fail on double free + // do not enable this on production, because the MMAP file + // will need to be saved again! 
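+    // (a second aral_freez() of the same pointer will then read NULL here
+    // and trip the internal_fatal() below, instead of corrupting state)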
+ *page_ptr = NULL; +#endif + +#ifdef NETDATA_ARAL_INTERNAL_CHECKS + { + // find the page ptr belongs + ARAL_PAGE *page2 = find_page_with_allocation_internal_check(ar, ptr); + + internal_fatal(page != page2, + "ARAL: '%s' page pointers do not match!", + ar->name); + + internal_fatal(!page2, + "ARAL: '%s' free of pointer %p is not in ARAL address space.", + ar->name, ptr); + } +#endif + + internal_fatal(!page, + "ARAL: '%s' possible corruption or double free of pointer %p", + ar->config.name, ptr); + + return page; +} + +static void aral_defrag_sorted_page_position___aral_lock_needed(ARAL *ar, ARAL_PAGE *page) { + ARAL_PAGE *tmp; + + int action = 0; (void)action; + size_t move_later = 0, move_earlier = 0; + + for(tmp = page->aral_lock.next ; + tmp && tmp->aral_lock.free_elements && tmp->aral_lock.free_elements < page->aral_lock.free_elements ; + tmp = tmp->aral_lock.next) + move_later++; + + if(!tmp && page->aral_lock.next) { + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(ar->aral_lock.pages, page, aral_lock.prev, aral_lock.next); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(ar->aral_lock.pages, page, aral_lock.prev, aral_lock.next); + action = 1; + } + else if(tmp != page->aral_lock.next) { + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(ar->aral_lock.pages, page, aral_lock.prev, aral_lock.next); + DOUBLE_LINKED_LIST_INSERT_ITEM_BEFORE_UNSAFE(ar->aral_lock.pages, tmp, page, aral_lock.prev, aral_lock.next); + action = 2; + } + else { + for(tmp = (page == ar->aral_lock.pages) ? NULL : page->aral_lock.prev ; + tmp && (!tmp->aral_lock.free_elements || tmp->aral_lock.free_elements > page->aral_lock.free_elements); + tmp = (tmp == ar->aral_lock.pages) ? NULL : tmp->aral_lock.prev) + move_earlier++; + + if(!tmp) { + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(ar->aral_lock.pages, page, aral_lock.prev, aral_lock.next); + DOUBLE_LINKED_LIST_PREPEND_ITEM_UNSAFE(ar->aral_lock.pages, page, aral_lock.prev, aral_lock.next); + action = 3; + } + else if(tmp != page->aral_lock.prev){ + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(ar->aral_lock.pages, page, aral_lock.prev, aral_lock.next); + DOUBLE_LINKED_LIST_INSERT_ITEM_AFTER_UNSAFE(ar->aral_lock.pages, tmp, page, aral_lock.prev, aral_lock.next); + action = 4; + } + } + + ar->aral_lock.defragment_operations++; + ar->aral_lock.defragment_linked_list_traversals += move_earlier + move_later; + + internal_fatal(page->aral_lock.next && page->aral_lock.next->aral_lock.free_elements && page->aral_lock.next->aral_lock.free_elements < page->aral_lock.free_elements, + "ARAL: '%s' item should be later in the list", ar->config.name); + + internal_fatal(page != ar->aral_lock.pages && (!page->aral_lock.prev->aral_lock.free_elements || page->aral_lock.prev->aral_lock.free_elements > page->aral_lock.free_elements), + "ARAL: '%s' item should be earlier in the list", ar->config.name); +} + +static inline void aral_move_page_with_free_list___aral_lock_needed(ARAL *ar, ARAL_PAGE *page) { + if(unlikely(page == ar->aral_lock.pages)) + // we are the first already + return; + + if(likely(!(ar->config.options & ARAL_DEFRAGMENT))) { + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(ar->aral_lock.pages, page, aral_lock.prev, aral_lock.next); + aral_insert_not_linked_page_with_free_items_to_proper_position___aral_lock_needed(ar, page); + } + else + aral_defrag_sorted_page_position___aral_lock_needed(ar, page); +} + +void aral_freez_internal(ARAL *ar, void *ptr TRACE_ALLOCATIONS_FUNCTION_DEFINITION_PARAMS) { +#ifdef FSANITIZE_ADDRESS + freez(ptr); + return; +#endif + + if(unlikely(!ptr)) return; + + // get the page pointer + 
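+    // (stored just past the element by aral_mallocz_internal())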
ARAL_PAGE *page = aral_ptr_to_page___must_NOT_have_aral_lock(ar, ptr); + + if(unlikely(ar->config.mmap.enabled)) + __atomic_sub_fetch(&ar->stats->mmap.used_bytes, ar->config.element_size, __ATOMIC_RELAXED); + else + __atomic_sub_fetch(&ar->stats->malloc.used_bytes, ar->config.element_size, __ATOMIC_RELAXED); + + // make this element available + ARAL_FREE *fr = (ARAL_FREE *)ptr; + fr->size = ar->config.element_size; + + aral_page_free_lock(ar, page); + fr->next = page->free.list; + page->free.list = fr; + aral_page_free_unlock(ar, page); + + aral_lock(ar); + + internal_fatal(!page->aral_lock.used_elements, + "ARAL: '%s' pointer %p is inside a page without any active allocations.", + ar->config.name, ptr); + + internal_fatal(page->max_elements != page->aral_lock.used_elements + page->aral_lock.free_elements, + "ARAL: '%s' page element counters do not match, " + "page says it can handle %zu elements, " + "but there are %zu used and %zu free items, " + "total %zu items", + ar->config.name, + (size_t)page->max_elements, + (size_t)page->aral_lock.used_elements, (size_t)page->aral_lock.free_elements, + (size_t)page->aral_lock.used_elements + (size_t)page->aral_lock.free_elements + ); + + page->aral_lock.used_elements--; + page->aral_lock.free_elements++; + + ar->aral_lock.user_free_operations++; + + // if the page is empty, release it + if(unlikely(!page->aral_lock.used_elements)) { + bool is_this_page_the_last_one = ar->aral_lock.pages == page && !page->aral_lock.next; + + if(!is_this_page_the_last_one) + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(ar->aral_lock.pages, page, aral_lock.prev, aral_lock.next); + + aral_unlock(ar); + + if(!is_this_page_the_last_one) + aral_del_page___no_lock_needed(ar, page TRACE_ALLOCATIONS_FUNCTION_CALL_PARAMS); + } + else { + aral_move_page_with_free_list___aral_lock_needed(ar, page); + aral_unlock(ar); + } +} + +void aral_destroy_internal(ARAL *ar TRACE_ALLOCATIONS_FUNCTION_DEFINITION_PARAMS) { + aral_lock(ar); + + ARAL_PAGE *page; + while((page = ar->aral_lock.pages)) { + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(ar->aral_lock.pages, page, aral_lock.prev, aral_lock.next); + aral_del_page___no_lock_needed(ar, page TRACE_ALLOCATIONS_FUNCTION_CALL_PARAMS); + } + + aral_unlock(ar); + + if(ar->config.options & ARAL_ALLOCATED_STATS) + freez(ar->stats); + + freez(ar); +} + +size_t aral_element_size(ARAL *ar) { + return ar->config.requested_element_size; +} + +ARAL *aral_create(const char *name, size_t element_size, size_t initial_page_elements, size_t max_page_size, + struct aral_statistics *stats, const char *filename, char **cache_dir, bool mmap, bool lockless) { + ARAL *ar = callocz(1, sizeof(ARAL)); + ar->config.options = (lockless) ? 
ARAL_LOCKLESS : 0; + ar->config.requested_element_size = element_size; + ar->config.initial_page_elements = initial_page_elements; + ar->config.requested_max_page_size = max_page_size; + ar->config.mmap.filename = filename; + ar->config.mmap.cache_dir = cache_dir; + ar->config.mmap.enabled = mmap; + strncpyz(ar->config.name, name, ARAL_MAX_NAME); + spinlock_init(&ar->aral_lock.spinlock); + spinlock_init(&ar->adders.spinlock); + + if(stats) { + ar->stats = stats; + ar->config.options &= ~ARAL_ALLOCATED_STATS; + } + else { + ar->stats = callocz(1, sizeof(struct aral_statistics)); + ar->config.options |= ARAL_ALLOCATED_STATS; + } + + long int page_size = sysconf(_SC_PAGE_SIZE); + if (unlikely(page_size == -1)) + ar->config.natural_page_size = 4096; + else + ar->config.natural_page_size = page_size; + + // we need to add a page pointer after the element + // so, first align the element size to the pointer size + ar->config.element_size = natural_alignment(ar->config.requested_element_size, sizeof(uintptr_t)); + + // then add the size of a pointer to it + ar->config.element_size += sizeof(uintptr_t); + + // make sure it is at least what we need for an ARAL_FREE slot + if (ar->config.element_size < sizeof(ARAL_FREE)) + ar->config.element_size = sizeof(ARAL_FREE); + + // and finally align it to the natural alignment + ar->config.element_size = natural_alignment(ar->config.element_size, ARAL_NATURAL_ALIGNMENT); + + ar->config.max_page_elements = ar->config.requested_max_page_size / ar->config.element_size; + + // we write the page pointer just after each element + ar->config.page_ptr_offset = ar->config.element_size - sizeof(uintptr_t); + + if(ar->config.requested_element_size + sizeof(uintptr_t) > ar->config.element_size) + fatal("ARAL: '%s' failed to calculate properly page_ptr_offset: " + "element size %zu, sizeof(uintptr_t) %zu, natural alignment %zu, " + "final element size %zu, page_ptr_offset %zu", + ar->config.name, ar->config.requested_element_size, sizeof(uintptr_t), ARAL_NATURAL_ALIGNMENT, + ar->config.element_size, ar->config.page_ptr_offset); + + //netdata_log_info("ARAL: element size %zu, sizeof(uintptr_t) %zu, natural alignment %zu, final element size %zu, page_ptr_offset %zu", + // ar->element_size, sizeof(uintptr_t), ARAL_NATURAL_ALIGNMENT, ar->internal.element_size, ar->internal.page_ptr_offset); + + + if (ar->config.initial_page_elements < 2) + ar->config.initial_page_elements = 2; + + if(ar->config.mmap.enabled && (!ar->config.mmap.cache_dir || !*ar->config.mmap.cache_dir)) { + netdata_log_error("ARAL: '%s' mmap cache directory is not configured properly, disabling mmap.", ar->config.name); + ar->config.mmap.enabled = false; + internal_fatal(true, "ARAL: '%s' mmap cache directory is not configured properly", ar->config.name); + } + + uint64_t max_alloc_size; + if(!ar->config.max_page_elements) + max_alloc_size = ar->config.mmap.enabled ? 
ARAL_MAX_PAGE_SIZE_MMAP : ARAL_MAX_PAGE_SIZE_MALLOC; + else + max_alloc_size = ar->config.max_page_elements * ar->config.element_size; + + ar->config.max_allocation_size = aral_align_alloc_size(ar, max_alloc_size); + ar->adders.allocation_size = aral_align_alloc_size(ar, (uint64_t)ar->config.element_size * ar->config.initial_page_elements); + ar->aral_lock.pages = NULL; + ar->aral_lock.file_number = 0; + + if(ar->config.mmap.enabled) { + char directory_name[FILENAME_MAX + 1]; + snprintfz(directory_name, FILENAME_MAX, "%s/array_alloc.mmap", *ar->config.mmap.cache_dir); + int r = mkdir(directory_name, 0775); + if (r != 0 && errno != EEXIST) + fatal("Cannot create directory '%s'", directory_name); + + char file[FILENAME_MAX + 1]; + snprintfz(file, FILENAME_MAX, "%s.", ar->config.mmap.filename); + aral_delete_leftover_files(ar->config.name, directory_name, file); + } + + internal_error(true, + "ARAL: '%s' " + "element size %zu (requested %zu bytes), " + "min elements per page %zu (requested %zu), " + "max elements per page %zu, " + "max page size %zu bytes (requested %zu) " + , ar->config.name + , ar->config.element_size, ar->config.requested_element_size + , ar->adders.allocation_size / ar->config.element_size, ar->config.initial_page_elements + , ar->config.max_allocation_size / ar->config.element_size + , ar->config.max_allocation_size, ar->config.requested_max_page_size + ); + + __atomic_add_fetch(&ar->stats->structures.allocations, 1, __ATOMIC_RELAXED); + __atomic_add_fetch(&ar->stats->structures.allocated_bytes, sizeof(ARAL), __ATOMIC_RELAXED); + return ar; +} + +// ---------------------------------------------------------------------------- +// global aral caching + +#define ARAL_BY_SIZE_MAX_SIZE 1024 + +struct aral_by_size { + ARAL *ar; + int32_t refcount; +}; + +struct { + struct aral_statistics shared_statistics; + SPINLOCK spinlock; + struct aral_by_size array[ARAL_BY_SIZE_MAX_SIZE + 1]; +} aral_by_size_globals = {}; + +struct aral_statistics *aral_by_size_statistics(void) { + return &aral_by_size_globals.shared_statistics; +} + +size_t aral_by_size_structures(void) { + return aral_structures_from_stats(&aral_by_size_globals.shared_statistics); +} + +size_t aral_by_size_overhead(void) { + return aral_overhead_from_stats(&aral_by_size_globals.shared_statistics); +} + +ARAL *aral_by_size_acquire(size_t size) { + spinlock_lock(&aral_by_size_globals.spinlock); + + ARAL *ar = NULL; + + if(size <= ARAL_BY_SIZE_MAX_SIZE && aral_by_size_globals.array[size].ar) { + ar = aral_by_size_globals.array[size].ar; + aral_by_size_globals.array[size].refcount++; + + internal_fatal(aral_element_size(ar) != size, "DICTIONARY: aral has size %zu but we want %zu", + aral_element_size(ar), size); + } + + if(!ar) { + char buf[30 + 1]; + snprintf(buf, 30, "size-%zu", size); + ar = aral_create(buf, + size, + 0, + 65536 * ((size / 150) + 1), + &aral_by_size_globals.shared_statistics, + NULL, NULL, false, false); + + if(size <= ARAL_BY_SIZE_MAX_SIZE) { + aral_by_size_globals.array[size].ar = ar; + aral_by_size_globals.array[size].refcount = 1; + } + } + + spinlock_unlock(&aral_by_size_globals.spinlock); + + return ar; +} + +void aral_by_size_release(ARAL *ar) { + size_t size = aral_element_size(ar); + + if(size <= ARAL_BY_SIZE_MAX_SIZE) { + spinlock_lock(&aral_by_size_globals.spinlock); + + internal_fatal(aral_by_size_globals.array[size].ar != ar, + "ARAL BY SIZE: aral pointers do not match"); + + if(aral_by_size_globals.array[size].refcount <= 0) + fatal("ARAL BY SIZE: double release detected"); + + 
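        /* sizes up to ARAL_BY_SIZE_MAX_SIZE are shared and refcounted;
           the last release deliberately keeps the aral alive (see the
           commented-out destroy below), so a later acquire of the same
           size reuses its pages */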
aral_by_size_globals.array[size].refcount--; +// if(!aral_by_size_globals.array[size].refcount) { +// aral_destroy(aral_by_size_globals.array[size].ar); +// aral_by_size_globals.array[size].ar = NULL; +// } + + spinlock_unlock(&aral_by_size_globals.spinlock); + } + else + aral_destroy(ar); +} + +// ---------------------------------------------------------------------------- +// unittest + +struct aral_unittest_config { + bool single_threaded; + bool stop; + ARAL *ar; + size_t elements; + size_t threads; + int errors; +}; + +static void *aral_test_thread(void *ptr) { + struct aral_unittest_config *auc = ptr; + ARAL *ar = auc->ar; + size_t elements = auc->elements; + + void **pointers = callocz(elements, sizeof(void *)); + + do { + for (size_t i = 0; i < elements; i++) { + pointers[i] = aral_mallocz(ar); + } + + for (size_t div = 5; div >= 2; div--) { + for (size_t i = 0; i < elements / div; i++) { + aral_freez(ar, pointers[i]); + pointers[i] = NULL; + } + + for (size_t i = 0; i < elements / div; i++) { + pointers[i] = aral_mallocz(ar); + } + } + + for (size_t step = 50; step >= 10; step -= 10) { + for (size_t i = 0; i < elements; i += step) { + aral_freez(ar, pointers[i]); + pointers[i] = NULL; + } + + for (size_t i = 0; i < elements; i += step) { + pointers[i] = aral_mallocz(ar); + } + } + + for (size_t i = 0; i < elements; i++) { + aral_freez(ar, pointers[i]); + pointers[i] = NULL; + } + + if (auc->single_threaded && ar->aral_lock.pages && ar->aral_lock.pages->aral_lock.used_elements) { + fprintf(stderr, "\n\nARAL leftovers detected (1)\n\n"); + __atomic_add_fetch(&auc->errors, 1, __ATOMIC_RELAXED); + } + + if(!auc->single_threaded && __atomic_load_n(&auc->stop, __ATOMIC_RELAXED)) + break; + + for (size_t i = 0; i < elements; i++) { + pointers[i] = aral_mallocz(ar); + } + + size_t increment = elements / ar->config.max_page_elements; + for (size_t all = increment; all <= elements / 2; all += increment) { + + size_t to_free = (all % ar->config.max_page_elements) + 1; + size_t step = elements / to_free; + if(!step) step = 1; + + // fprintf(stderr, "all %zu, to free %zu, step %zu\n", all, to_free, step); + + size_t free_list[to_free]; + for (size_t i = 0; i < to_free; i++) { + size_t pos = step * i; + aral_freez(ar, pointers[pos]); + pointers[pos] = NULL; + free_list[i] = pos; + } + + for (size_t i = 0; i < to_free; i++) { + size_t pos = free_list[i]; + pointers[pos] = aral_mallocz(ar); + } + } + + for (size_t i = 0; i < elements; i++) { + aral_freez(ar, pointers[i]); + pointers[i] = NULL; + } + + if (auc->single_threaded && ar->aral_lock.pages && ar->aral_lock.pages->aral_lock.used_elements) { + fprintf(stderr, "\n\nARAL leftovers detected (2)\n\n"); + __atomic_add_fetch(&auc->errors, 1, __ATOMIC_RELAXED); + } + + } while(!auc->single_threaded && !__atomic_load_n(&auc->stop, __ATOMIC_RELAXED)); + + freez(pointers); + + return ptr; +} + +int aral_stress_test(size_t threads, size_t elements, size_t seconds) { + fprintf(stderr, "Running stress test of %zu threads, with %zu elements each, for %zu seconds...\n", + threads, elements, seconds); + + struct aral_unittest_config auc = { + .single_threaded = false, + .threads = threads, + .ar = aral_create("aral-stress-test", 20, 0, 8192, NULL, "aral-stress-test", NULL, false, false), + .elements = elements, + .errors = 0, + }; + + usec_t started_ut = now_monotonic_usec(); + ND_THREAD *thread_ptrs[threads]; + + for(size_t i = 0; i < threads ; i++) { + char tag[ND_THREAD_TAG_MAX + 1]; + snprintfz(tag, ND_THREAD_TAG_MAX, "TH[%zu]", i); + thread_ptrs[i] = 
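            /* one joinable worker per thread; they all share 'auc', so the
               stop flag and the error counter are accessed atomically */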
nd_thread_create( + tag, + NETDATA_THREAD_OPTION_JOINABLE | NETDATA_THREAD_OPTION_DONT_LOG, + aral_test_thread, + &auc); + } + + size_t malloc_done = 0; + size_t free_done = 0; + size_t countdown = seconds; + while(countdown-- > 0) { + sleep_usec(1 * USEC_PER_SEC); + aral_lock(auc.ar); + size_t m = auc.ar->aral_lock.user_malloc_operations; + size_t f = auc.ar->aral_lock.user_free_operations; + aral_unlock(auc.ar); + fprintf(stderr, "ARAL executes %0.2f M malloc and %0.2f M free operations/s\n", + (double)(m - malloc_done) / 1000000.0, (double)(f - free_done) / 1000000.0); + malloc_done = m; + free_done = f; + } + + __atomic_store_n(&auc.stop, true, __ATOMIC_RELAXED); + +// fprintf(stderr, "Cancelling the threads...\n"); +// for(size_t i = 0; i < threads ; i++) { +// nd_thread_signal_cancel(thread_ptrs[i]); +// } + + fprintf(stderr, "Waiting the threads to finish...\n"); + for(size_t i = 0; i < threads ; i++) { + nd_thread_join(thread_ptrs[i]); + } + + usec_t ended_ut = now_monotonic_usec(); + + if (auc.ar->aral_lock.pages && auc.ar->aral_lock.pages->aral_lock.used_elements) { + fprintf(stderr, "\n\nARAL leftovers detected (3)\n\n"); + __atomic_add_fetch(&auc.errors, 1, __ATOMIC_RELAXED); + } + + netdata_log_info("ARAL: did %zu malloc, %zu free, " + "using %zu threads, in %"PRIu64" usecs", + auc.ar->aral_lock.user_malloc_operations, + auc.ar->aral_lock.user_free_operations, + threads, + ended_ut - started_ut); + + aral_destroy(auc.ar); + + return auc.errors; +} + +int aral_unittest(size_t elements) { + char *cache_dir = "/tmp/"; + + struct aral_unittest_config auc = { + .single_threaded = true, + .threads = 1, + .ar = aral_create("aral-test", 20, 0, 8192, NULL, "aral-test", &cache_dir, false, false), + .elements = elements, + .errors = 0, + }; + + aral_test_thread(&auc); + + aral_destroy(auc.ar); + + int errors = aral_stress_test(2, elements, 5); + + return auc.errors + errors; +} diff --git a/src/libnetdata/aral/aral.h b/src/libnetdata/aral/aral.h new file mode 100644 index 00000000..2e749bc4 --- /dev/null +++ b/src/libnetdata/aral/aral.h @@ -0,0 +1,73 @@ + +#ifndef ARAL_H +#define ARAL_H 1 + +#include "../libnetdata.h" + +#define ARAL_MAX_NAME 23 + +typedef struct aral ARAL; + +struct aral_statistics { + struct { + size_t allocations; + size_t allocated_bytes; + } structures; + + struct { + size_t allocations; + size_t allocated_bytes; + size_t used_bytes; + } malloc; + + struct { + size_t allocations; + size_t allocated_bytes; + size_t used_bytes; + } mmap; +}; + +ARAL *aral_create(const char *name, size_t element_size, size_t initial_page_elements, size_t max_page_size, + struct aral_statistics *stats, const char *filename, char **cache_dir, bool mmap, bool lockless); +size_t aral_element_size(ARAL *ar); +size_t aral_overhead(ARAL *ar); +size_t aral_structures(ARAL *ar); +struct aral_statistics *aral_statistics(ARAL *ar); +size_t aral_structures_from_stats(struct aral_statistics *stats); +size_t aral_overhead_from_stats(struct aral_statistics *stats); + +ARAL *aral_by_size_acquire(size_t size); +void aral_by_size_release(ARAL *ar); +size_t aral_by_size_structures(void); +size_t aral_by_size_overhead(void); +struct aral_statistics *aral_by_size_statistics(void); + +int aral_unittest(size_t elements); + +#ifdef NETDATA_TRACE_ALLOCATIONS + +#define aral_callocz(ar) aral_callocz_internal(ar, __FILE__, __FUNCTION__, __LINE__) +#define aral_mallocz(ar) aral_mallocz_internal(ar, __FILE__, __FUNCTION__, __LINE__) +#define aral_freez(ar, ptr) aral_freez_internal(ar, ptr, __FILE__, __FUNCTION__, 
__LINE__)
+#define aral_destroy(ar) aral_destroy_internal(ar, __FILE__, __FUNCTION__, __LINE__)
+
+void *aral_callocz_internal(ARAL *ar, const char *file, const char *function, size_t line);
+void *aral_mallocz_internal(ARAL *ar, const char *file, const char *function, size_t line);
+void aral_freez_internal(ARAL *ar, void *ptr, const char *file, const char *function, size_t line);
+void aral_destroy_internal(ARAL *ar, const char *file, const char *function, size_t line);
+
+#else // NETDATA_TRACE_ALLOCATIONS
+
+#define aral_mallocz(ar) aral_mallocz_internal(ar)
+#define aral_callocz(ar) aral_callocz_internal(ar)
+#define aral_freez(ar, ptr) aral_freez_internal(ar, ptr)
+#define aral_destroy(ar) aral_destroy_internal(ar)
+
+void *aral_callocz_internal(ARAL *ar);
+void *aral_mallocz_internal(ARAL *ar);
+void aral_freez_internal(ARAL *ar, void *ptr);
+void aral_destroy_internal(ARAL *ar);
+
+#endif // NETDATA_TRACE_ALLOCATIONS
+
+#endif // ARAL_H
diff --git a/src/libnetdata/avl/README.md b/src/libnetdata/avl/README.md new file mode 100644 index 00000000..eb85f884 --- /dev/null +++ b/src/libnetdata/avl/README.md @@ -0,0 +1,21 @@
+<!--
+title: "AVL"
+custom_edit_url: https://github.com/netdata/netdata/edit/master/src/libnetdata/avl/README.md
+sidebar_label: "AVL"
+learn_status: "Published"
+learn_topic_type: "Tasks"
+learn_rel_path: "Developers/libnetdata"
+-->
+
+# AVL
+
+AVL is a library for indexing objects in AVL trees (self-balancing binary search trees).
+
+`avl_insert()`, `avl_remove()` and `avl_search()` are adaptations
+of the AVL algorithm found in `libavl` v2.0.3, modified so that they do not
+perform any memory allocations and their memory footprint is optimized
+(by eliminating unnecessary data members).
+
+In addition to the above, this version of AVL provides lock-protected
+variants of the above operations, as well as traversal functions.
+
diff --git a/src/libnetdata/avl/avl.c b/src/libnetdata/avl/avl.c new file mode 100644 index 00000000..e1d4064d --- /dev/null +++ b/src/libnetdata/avl/avl.c @@ -0,0 +1,405 @@
+// SPDX-License-Identifier: LGPL-3.0-or-later
+
+#include "../libnetdata.h"
+
+/* ------------------------------------------------------------------------- */
+/*
+ * avl_insert(), avl_remove() and avl_search()
+ * are adaptations (by Costa Tsaousis) of the AVL algorithm found in libavl
+ * v2.0.3, so that they do not use any memory allocations and their memory
+ * footprint is optimized (by eliminating unnecessary data members).
+ *
+ * libavl - library for manipulation of binary trees.
+ * Copyright (C) 1998, 1999, 2000, 2001, 2002, 2004 Free Software
+ * Foundation, Inc.
+*/
+
+
+/* Search |tree| for an item matching |item|, and return it if found.
+   Otherwise return |NULL|. */
+avl_t *avl_search(avl_tree_type *tree, avl_t *item) {
+    avl_t *p;
+
+    // assert (tree != NULL && item != NULL);
+
+    for (p = tree->root; p != NULL; ) {
+        int cmp = tree->compar(item, p);
+
+        if (cmp < 0)
+            p = p->avl_link[0];
+        else if (cmp > 0)
+            p = p->avl_link[1];
+        else /* |cmp == 0| */
+            return p;
+    }
+
+    return NULL;
+}
+
+/* Inserts |item| into |tree| and returns |item|.
+   If a duplicate item is found in the tree,
+   returns a pointer to the duplicate without inserting |item|.
+   */
+avl_t *avl_insert(avl_tree_type *tree, avl_t *item) {
+    avl_t *y, *z; /* Top node to update balance factor, and parent. */
+    avl_t *p, *q; /* Iterator, and parent. */
+    avl_t *n; /* Newly inserted node. */
+    avl_t *w; /* New root of rebalanced subtree. */
+    unsigned char dir; /* Direction to descend. */
+
+    unsigned char da[AVL_MAX_HEIGHT]; /* Cached comparison results.
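+        Each entry records the direction (0 = left, 1 = right) taken while
+        descending from |y|, the deepest node with a non-zero balance factor;
+        after linking the new node, the loop below replays these directions
+        to update the balance factors along that path.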
*/ + int k = 0; /* Number of cached results. */ + + // assert(tree != NULL && item != NULL); + + z = (avl_t *) &tree->root; + y = tree->root; + dir = 0; + for (q = z, p = y; p != NULL; q = p, p = p->avl_link[dir]) { + int cmp = tree->compar(item, p); + if (cmp == 0) + return p; + + if (p->avl_balance != 0) + z = q, y = p, k = 0; + da[k++] = dir = (unsigned char)(cmp > 0); + } + + n = q->avl_link[dir] = item; + + // tree->avl_count++; + n->avl_link[0] = n->avl_link[1] = NULL; + n->avl_balance = 0; + if (y == NULL) return n; + + for (p = y, k = 0; p != n; p = p->avl_link[da[k]], k++) + if (da[k] == 0) + p->avl_balance--; + else + p->avl_balance++; + + if (y->avl_balance == -2) { + avl_t *x = y->avl_link[0]; + if (x->avl_balance == -1) { + w = x; + y->avl_link[0] = x->avl_link[1]; + x->avl_link[1] = y; + x->avl_balance = y->avl_balance = 0; + } + else { + // assert (x->avl_balance == +1); + w = x->avl_link[1]; + x->avl_link[1] = w->avl_link[0]; + w->avl_link[0] = x; + y->avl_link[0] = w->avl_link[1]; + w->avl_link[1] = y; + if (w->avl_balance == -1) + x->avl_balance = 0, y->avl_balance = +1; + else if (w->avl_balance == 0) + x->avl_balance = y->avl_balance = 0; + else /* |w->avl_balance == +1| */ + x->avl_balance = -1, y->avl_balance = 0; + w->avl_balance = 0; + } + } + else if (y->avl_balance == +2) { + avl_t *x = y->avl_link[1]; + if (x->avl_balance == +1) { + w = x; + y->avl_link[1] = x->avl_link[0]; + x->avl_link[0] = y; + x->avl_balance = y->avl_balance = 0; + } + else { + // assert (x->avl_balance == -1); + w = x->avl_link[0]; + x->avl_link[0] = w->avl_link[1]; + w->avl_link[1] = x; + y->avl_link[1] = w->avl_link[0]; + w->avl_link[0] = y; + if (w->avl_balance == +1) + x->avl_balance = 0, y->avl_balance = -1; + else if (w->avl_balance == 0) + x->avl_balance = y->avl_balance = 0; + else /* |w->avl_balance == -1| */ + x->avl_balance = +1, y->avl_balance = 0; + w->avl_balance = 0; + } + } + else return n; + + z->avl_link[y != z->avl_link[0]] = w; + + // tree->avl_generation++; + return n; +} + +/* Deletes from |tree| and returns an item matching |item|. + Returns a null pointer if no matching item found. */ +avl_t *avl_remove(avl_tree_type *tree, avl_t *item) { + /* Stack of nodes. */ + avl_t *pa[AVL_MAX_HEIGHT]; /* Nodes. */ + unsigned char da[AVL_MAX_HEIGHT]; /* |avl_link[]| indexes. */ + int k; /* Stack pointer. */ + + avl_t *p; /* Traverses tree to find node to delete. */ + int cmp; /* Result of comparison between |item| and |p|. 
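+                   The descent below also fills |pa[]| and |da[]| with the
+                   nodes visited and the directions taken, so after unlinking
+                   the code can walk the path bottom-up and rebalance it.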
*/ + + // assert (tree != NULL && item != NULL); + + k = 0; + p = (avl_t *) &tree->root; + for(cmp = -1; cmp != 0; cmp = tree->compar(item, p)) { + unsigned char dir = (unsigned char)(cmp > 0); + + pa[k] = p; + da[k++] = dir; + + p = p->avl_link[dir]; + if(p == NULL) return NULL; + } + + item = p; + + if (p->avl_link[1] == NULL) + pa[k - 1]->avl_link[da[k - 1]] = p->avl_link[0]; + else { + avl_t *r = p->avl_link[1]; + if (r->avl_link[0] == NULL) { + r->avl_link[0] = p->avl_link[0]; + r->avl_balance = p->avl_balance; + pa[k - 1]->avl_link[da[k - 1]] = r; + da[k] = 1; + pa[k++] = r; + } + else { + avl_t *s; + int j = k++; + + for (;;) { + da[k] = 0; + pa[k++] = r; + s = r->avl_link[0]; + if (s->avl_link[0] == NULL) break; + + r = s; + } + + s->avl_link[0] = p->avl_link[0]; + r->avl_link[0] = s->avl_link[1]; + s->avl_link[1] = p->avl_link[1]; + s->avl_balance = p->avl_balance; + + pa[j - 1]->avl_link[da[j - 1]] = s; + da[j] = 1; + pa[j] = s; + } + } + + // assert (k > 0); + while (--k > 0) { + avl_t *y = pa[k]; + + if (da[k] == 0) { + y->avl_balance++; + if (y->avl_balance == +1) break; + else if (y->avl_balance == +2) { + avl_t *x = y->avl_link[1]; + if (x->avl_balance == -1) { + avl_t *w; + // assert (x->avl_balance == -1); + w = x->avl_link[0]; + x->avl_link[0] = w->avl_link[1]; + w->avl_link[1] = x; + y->avl_link[1] = w->avl_link[0]; + w->avl_link[0] = y; + if (w->avl_balance == +1) + x->avl_balance = 0, y->avl_balance = -1; + else if (w->avl_balance == 0) + x->avl_balance = y->avl_balance = 0; + else /* |w->avl_balance == -1| */ + x->avl_balance = +1, y->avl_balance = 0; + w->avl_balance = 0; + pa[k - 1]->avl_link[da[k - 1]] = w; + } + else { + y->avl_link[1] = x->avl_link[0]; + x->avl_link[0] = y; + pa[k - 1]->avl_link[da[k - 1]] = x; + if (x->avl_balance == 0) { + x->avl_balance = -1; + y->avl_balance = +1; + break; + } + else x->avl_balance = y->avl_balance = 0; + } + } + } + else + { + y->avl_balance--; + if (y->avl_balance == -1) break; + else if (y->avl_balance == -2) { + avl_t *x = y->avl_link[0]; + if (x->avl_balance == +1) { + avl_t *w; + // assert (x->avl_balance == +1); + w = x->avl_link[1]; + x->avl_link[1] = w->avl_link[0]; + w->avl_link[0] = x; + y->avl_link[0] = w->avl_link[1]; + w->avl_link[1] = y; + if (w->avl_balance == -1) + x->avl_balance = 0, y->avl_balance = +1; + else if (w->avl_balance == 0) + x->avl_balance = y->avl_balance = 0; + else /* |w->avl_balance == +1| */ + x->avl_balance = -1, y->avl_balance = 0; + w->avl_balance = 0; + pa[k - 1]->avl_link[da[k - 1]] = w; + } + else { + y->avl_link[0] = x->avl_link[1]; + x->avl_link[1] = y; + pa[k - 1]->avl_link[da[k - 1]] = x; + if (x->avl_balance == 0) { + x->avl_balance = +1; + y->avl_balance = -1; + break; + } + else x->avl_balance = y->avl_balance = 0; + } + } + } + } + + // tree->avl_count--; + // tree->avl_generation++; + return item; +} + +/* ------------------------------------------------------------------------- */ +// below are functions by (C) Costa Tsaousis + +// --------------------------- +// traversing + +int avl_walker(avl_t *node, int (*callback)(void * /*entry*/, void * /*data*/), void *data) { + int total = 0, ret = 0; + + if(node->avl_link[0]) { + ret = avl_walker(node->avl_link[0], callback, data); + if(ret < 0) return ret; + total += ret; + } + + ret = callback(node, data); + if(ret < 0) return ret; + total += ret; + + if(node->avl_link[1]) { + ret = avl_walker(node->avl_link[1], callback, data); + if (ret < 0) return ret; + total += ret; + } + + return total; +} + +int avl_traverse(avl_tree_type 
*tree, int (*callback)(void * /*entry*/, void * /*data*/), void *data) { + if(tree->root) + return avl_walker(tree->root, callback, data); + else + return 0; +} + +// --------------------------- +// locks + +static inline void avl_read_lock(avl_tree_lock *t) { +#if defined(AVL_LOCK_WITH_RWLOCK) + netdata_rwlock_rdlock(&t->rwlock); +#else + rw_spinlock_read_lock(&t->rwlock); +#endif +} + +static inline void avl_write_lock(avl_tree_lock *t) { +#if defined(AVL_LOCK_WITH_RWLOCK) + netdata_rwlock_wrlock(&t->rwlock); +#else + rw_spinlock_write_lock(&t->rwlock); +#endif +} + +static inline void avl_read_unlock(avl_tree_lock *t) { +#if defined(AVL_LOCK_WITH_RWLOCK) + netdata_rwlock_rdunlock(&t->rwlock); +#else + rw_spinlock_read_unlock(&t->rwlock); +#endif +} + +static inline void avl_write_unlock(avl_tree_lock *t) { +#if defined(AVL_LOCK_WITH_RWLOCK) + netdata_rwlock_wrunlock(&t->rwlock); +#else + rw_spinlock_write_unlock(&t->rwlock); +#endif +} + +// --------------------------- +// operations with locking + +void avl_init_lock(avl_tree_lock *tree, int (*compar)(void * /*a*/, void * /*b*/)) { + avl_init(&tree->avl_tree, compar); + +#if defined(AVL_LOCK_WITH_RWLOCK) + if(netdata_rwlock_init(&tree->rwlock) != 0) + fatal("Failed to initialize AVL rwlock"); +#else + rw_spinlock_init(&tree->rwlock); +#endif +} + +void avl_destroy_lock(avl_tree_lock *tree __maybe_unused) { +#if defined(AVL_LOCK_WITH_RWLOCK) + if(netdata_rwlock_destroy(&tree->rwlock) != 0) + fatal("Failed to destroy AVL rwlock"); +#endif +} + +avl_t *avl_search_lock(avl_tree_lock *tree, avl_t *item) { + avl_read_lock(tree); + avl_t *ret = avl_search(&tree->avl_tree, item); + avl_read_unlock(tree); + return ret; +} + +avl_t * avl_remove_lock(avl_tree_lock *tree, avl_t *item) { + avl_write_lock(tree); + avl_t *ret = avl_remove(&tree->avl_tree, item); + avl_write_unlock(tree); + return ret; +} + +avl_t *avl_insert_lock(avl_tree_lock *tree, avl_t *item) { + avl_write_lock(tree); + avl_t * ret = avl_insert(&tree->avl_tree, item); + avl_write_unlock(tree); + return ret; +} + +int avl_traverse_lock(avl_tree_lock *tree, int (*callback)(void * /*entry*/, void * /*data*/), void *data) { + avl_read_lock(tree); + int ret = avl_traverse(&tree->avl_tree, callback, data); + avl_read_unlock(tree); + return ret; +} + +void avl_init(avl_tree_type *tree, int (*compar)(void * /*a*/, void * /*b*/)) { + tree->root = NULL; + tree->compar = compar; +} + +// ------------------ diff --git a/src/libnetdata/avl/avl.h b/src/libnetdata/avl/avl.h new file mode 100644 index 00000000..595d6ec6 --- /dev/null +++ b/src/libnetdata/avl/avl.h @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: LGPL-3.0-or-later + +#ifndef _AVL_H +#define _AVL_H 1 + +#include "../libnetdata.h" + +/* Maximum AVL tree height. */ +#ifndef AVL_MAX_HEIGHT +#define AVL_MAX_HEIGHT 92 +#endif + +#if defined(AVL_LOCK_WITH_RWLOCK) +#define AVL_LOCK_INITIALIZER NETDATA_RWLOCK_INITIALIZER +#else +#define AVL_LOCK_INITIALIZER NETDATA_RW_SPINLOCK_INITIALIZER +#endif + +/* Data structures */ + +/* One element of the AVL tree */ +typedef struct avl_element { + struct avl_element *avl_link[2]; /* Subtrees. */ + signed char avl_balance; /* Balance factor. */ +} avl_t; + +typedef struct __attribute__((packed)) avl_element_packed { + struct avl_element *avl_link[2]; /* Subtrees. */ + signed char avl_balance; /* Balance factor. 
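+                                          Height of the right subtree minus
+                                          that of the left one; always -1, 0
+                                          or +1 in a settled tree, transiently
+                                          reaching 2 in magnitude while an
+                                          insert or remove rebalances.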
*/ +} avl_t_packed; + +/* An AVL tree */ +typedef struct avl_tree_type { + avl_t *root; + int (*compar)(void *a, void *b); +} avl_tree_type; + +typedef struct avl_tree_lock { + avl_tree_type avl_tree; + +#if defined(AVL_LOCK_WITH_RWLOCK) + netdata_rwlock_t rwlock; +#else + RW_SPINLOCK rwlock; +#endif +} avl_tree_lock; + +/* Public methods */ + +/* Insert element a into the AVL tree t + * returns the added element a, or a pointer the + * element that is equal to a (as returned by t->compar()) + * a is linked directly to the tree, so it has to + * be properly allocated by the caller. + */ +avl_t *avl_insert_lock(avl_tree_lock *tree, avl_t *item) NEVERNULL WARNUNUSED; +avl_t *avl_insert(avl_tree_type *tree, avl_t *item) NEVERNULL WARNUNUSED; + +/* Remove an element a from the AVL tree t + * returns a pointer to the removed element + * or NULL if an element equal to a is not found + * (equal as returned by t->compar()) + */ +avl_t *avl_remove_lock(avl_tree_lock *tree, avl_t *item) WARNUNUSED; +avl_t *avl_remove(avl_tree_type *tree, avl_t *item) WARNUNUSED; + +/* Find the element into the tree that equal to a + * (equal as returned by t->compar()) + * returns NULL is no element is equal to a + */ +avl_t *avl_search_lock(avl_tree_lock *tree, avl_t *item); +avl_t *avl_search(avl_tree_type *tree, avl_t *item); + +/* Initialize the avl_tree_lock + */ +void avl_init_lock(avl_tree_lock *tree, int (*compar)(void *a, void *b)); +void avl_init(avl_tree_type *tree, int (*compar)(void *a, void *b)); + +/* Destroy the avl_tree_lock locks + */ +void avl_destroy_lock(avl_tree_lock *tree); + +int avl_traverse_lock(avl_tree_lock *tree, int (*callback)(void *entry, void *data), void *data); +int avl_traverse(avl_tree_type *tree, int (*callback)(void *entry, void *data), void *data); + +#endif /* avl.h */ diff --git a/src/libnetdata/bitmap64.h b/src/libnetdata/bitmap64.h new file mode 100644 index 00000000..425f3fd2 --- /dev/null +++ b/src/libnetdata/bitmap64.h @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_BITMAP64_H +#define NETDATA_BITMAP64_H + +#include <stdbool.h> +#include <stdint.h> +#include <assert.h> + +typedef uint64_t bitmap64_t; + +#define BITMAP64_INITIALIZER 0 + +static inline void bitmap64_set(bitmap64_t *bitmap, int position) +{ + assert(position >= 0 && position < 64); + + *bitmap |= (1ULL << position); +} + +static inline void bitmap64_clear(bitmap64_t *bitmap, int position) +{ + assert(position >= 0 && position < 64); + + *bitmap &= ~(1ULL << position); +} + +static inline bool bitmap64_get(const bitmap64_t *bitmap, int position) +{ + assert(position >= 0 && position < 64); + + return (*bitmap & (1ULL << position)); +} + +#endif // NETDATA_BITMAP64_H diff --git a/src/libnetdata/buffer/README.md b/src/libnetdata/buffer/README.md new file mode 100644 index 00000000..a7850df7 --- /dev/null +++ b/src/libnetdata/buffer/README.md @@ -0,0 +1,20 @@ +<!-- +title: "BUFFER" +custom_edit_url: https://github.com/netdata/netdata/edit/master/src/libnetdata/buffer/README.md +sidebar_label: "BUFFER library" +learn_status: "Published" +learn_topic_type: "Tasks" +learn_rel_path: "Developers/libnetdata" +--> + +# BUFFER + +`BUFFER` is a convenience library for working with strings in `C`. +Mainly, `BUFFER`s eliminate the need for tracking the string length, thus providing +a safe alternative for string operations. + +Also, they are super fast in printing and appending data to the string and its `buffer_strlen()` +is just a lookup (it does not traverse the string). 
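+
+A minimal usage sketch (the include path and the `main()` wrapper here are
+illustrative; the calls are the ones declared in `buffer.h`):
+
+```c
+#include "libnetdata/libnetdata.h"
+#include <stdio.h>
+
+int main(void) {
+    // start small; the buffer grows automatically as data is appended
+    BUFFER *wb = buffer_create(1024, NULL);
+
+    buffer_strcat(wb, "hello");
+    buffer_sprintf(wb, ", %s! %d charts", "world", 42);
+
+    // O(1) length lookup and guaranteed NUL termination
+    printf("%zu bytes: %s\n", buffer_strlen(wb), buffer_tostring(wb));
+
+    buffer_free(wb);
+    return 0;
+}
+```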
+ +Netdata uses `BUFFER`s for preparing web responses and buffering data to be sent upstream or +to external databases. diff --git a/src/libnetdata/buffer/buffer.c b/src/libnetdata/buffer/buffer.c new file mode 100644 index 00000000..119216dd --- /dev/null +++ b/src/libnetdata/buffer/buffer.c @@ -0,0 +1,496 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +static inline void buffer_overflow_init(BUFFER *b) +{ + b->buffer[b->size] = '\0'; + strcpy(&b->buffer[b->size + 1], BUFFER_OVERFLOW_EOF); +} + +void buffer_reset(BUFFER *wb) { + buffer_flush(wb); + + wb->content_type = CT_TEXT_PLAIN; + wb->options = 0; + wb->date = 0; + wb->expires = 0; + buffer_no_cacheable(wb); + + buffer_overflow_check(wb); +} + +void buffer_char_replace(BUFFER *wb, char from, char to) { + char *s = wb->buffer, *end = &wb->buffer[wb->len]; + + while(s != end) { + if(*s == from) *s = to; + s++; + } + + buffer_overflow_check(wb); +} + +void buffer_print_sn_flags(BUFFER *wb, SN_FLAGS flags, bool send_anomaly_bit) { + if(unlikely(flags == SN_EMPTY_SLOT)) { + buffer_fast_strcat(wb, "E", 1); + return; + } + + size_t printed = 0; + if(likely(send_anomaly_bit && (flags & SN_FLAG_NOT_ANOMALOUS))) { + buffer_fast_strcat(wb, "A", 1); + printed++; + } + + if(unlikely(flags & SN_FLAG_RESET)) { + buffer_fast_strcat(wb, "R", 1); + printed++; + } + + if(!printed) + buffer_fast_strcat(wb, "''", 2); +} + +void buffer_strcat_htmlescape(BUFFER *wb, const char *txt) +{ + while(*txt) { + switch(*txt) { + case '&': buffer_strcat(wb, "&"); break; + case '<': buffer_strcat(wb, "<"); break; + case '>': buffer_strcat(wb, ">"); break; + case '"': buffer_strcat(wb, """); break; + case '/': buffer_strcat(wb, "/"); break; + case '\'': buffer_strcat(wb, "'"); break; + default: { + buffer_need_bytes(wb, 1); + wb->buffer[wb->len++] = *txt; + } + } + txt++; + } + + buffer_overflow_check(wb); +} + +void buffer_snprintf(BUFFER *wb, size_t len, const char *fmt, ...) +{ + if(unlikely(!fmt || !*fmt)) return; + + buffer_need_bytes(wb, len + 1); + + va_list args; + va_start(args, fmt); + // vsnprintfz() returns the number of bytes actually written - after possible truncation + wb->len += vsnprintfz(&wb->buffer[wb->len], len, fmt, args); + va_end(args); + + buffer_overflow_check(wb); + + // the buffer is \0 terminated by vsnprintfz +} + +inline void buffer_vsprintf(BUFFER *wb, const char *fmt, va_list args) { + if(unlikely(!fmt || !*fmt)) return; + + size_t full_size_bytes = 0, need = 2, space_remaining = 0; + + do { + need += full_size_bytes + 2; + + buffer_need_bytes(wb, need); + + space_remaining = wb->size - wb->len - 1; + + // Use the copy of va_list for vsnprintf + va_list args_copy; + va_copy(args_copy, args); + // vsnprintf() returns the number of bytes required, even if bigger than the buffer provided + full_size_bytes = (size_t) vsnprintf(&wb->buffer[wb->len], space_remaining, fmt, args_copy); + va_end(args_copy); + + } while(full_size_bytes >= space_remaining); + + wb->len += full_size_bytes; + + wb->buffer[wb->len] = '\0'; + buffer_overflow_check(wb); +} + +void buffer_sprintf(BUFFER *wb, const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + buffer_vsprintf(wb, fmt, args); + va_end(args); +} + +// generate a javascript date, the fastest possible way... 
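+// it emits Date(YYYY,MM,DD,HH,MM,SS): the "Date" prefix is written as a
+// single byte-order-aware 32-bit store and every numeric field is built
+// with plain div/mod (suppressing one leading zero on all fields but the
+// year), avoiding printf entirely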
+void buffer_jsdate(BUFFER *wb, int year, int month, int day, int hours, int minutes, int seconds) +{ + // 10 20 30 = 35 + // 01234567890123456789012345678901234 + // Date(2014,04,01,03,28,20) + + buffer_need_bytes(wb, 30); + + char *b = &wb->buffer[wb->len], *p; + unsigned int *q = (unsigned int *)b; + + #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + *q++ = 0x65746144; // "Date" backwards. + #else + *q++ = 0x44617465; // "Date" + #endif + p = (char *)q; + + *p++ = '('; + *p++ = '0' + year / 1000; year %= 1000; + *p++ = '0' + year / 100; year %= 100; + *p++ = '0' + year / 10; + *p++ = '0' + year % 10; + *p++ = ','; + *p = '0' + month / 10; if (*p != '0') p++; + *p++ = '0' + month % 10; + *p++ = ','; + *p = '0' + day / 10; if (*p != '0') p++; + *p++ = '0' + day % 10; + *p++ = ','; + *p = '0' + hours / 10; if (*p != '0') p++; + *p++ = '0' + hours % 10; + *p++ = ','; + *p = '0' + minutes / 10; if (*p != '0') p++; + *p++ = '0' + minutes % 10; + *p++ = ','; + *p = '0' + seconds / 10; if (*p != '0') p++; + *p++ = '0' + seconds % 10; + + unsigned short *r = (unsigned short *)p; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + *r++ = 0x0029; // ")\0" backwards. + #else + *r++ = 0x2900; // ")\0" + #endif + + wb->len += (size_t)((char *)r - b - 1); + + // terminate it + wb->buffer[wb->len] = '\0'; + buffer_overflow_check(wb); +} + +// generate a date, the fastest possible way... +void buffer_date(BUFFER *wb, int year, int month, int day, int hours, int minutes, int seconds) +{ + // 10 20 30 = 35 + // 01234567890123456789012345678901234 + // 2014-04-01 03:28:20 + + buffer_need_bytes(wb, 36); + + char *b = &wb->buffer[wb->len]; + char *p = b; + + *p++ = '0' + year / 1000; year %= 1000; + *p++ = '0' + year / 100; year %= 100; + *p++ = '0' + year / 10; + *p++ = '0' + year % 10; + *p++ = '-'; + *p++ = '0' + month / 10; + *p++ = '0' + month % 10; + *p++ = '-'; + *p++ = '0' + day / 10; + *p++ = '0' + day % 10; + *p++ = ' '; + *p++ = '0' + hours / 10; + *p++ = '0' + hours % 10; + *p++ = ':'; + *p++ = '0' + minutes / 10; + *p++ = '0' + minutes % 10; + *p++ = ':'; + *p++ = '0' + seconds / 10; + *p++ = '0' + seconds % 10; + *p = '\0'; + + wb->len += (size_t)(p - b); + + // terminate it + wb->buffer[wb->len] = '\0'; + buffer_overflow_check(wb); +} + +BUFFER *buffer_create(size_t size, size_t *statistics) +{ + BUFFER *b; + + netdata_log_debug(D_WEB_BUFFER, "Creating new web buffer of size %zu.", size); + + b = callocz(1, sizeof(BUFFER)); + b->buffer = mallocz(size + sizeof(BUFFER_OVERFLOW_EOF) + 2); + b->buffer[0] = '\0'; + b->size = size; + b->content_type = CT_TEXT_PLAIN; + b->statistics = statistics; + buffer_no_cacheable(b); + buffer_overflow_init(b); + buffer_overflow_check(b); + + if(b->statistics) + __atomic_add_fetch(b->statistics, b->size + sizeof(BUFFER) + sizeof(BUFFER_OVERFLOW_EOF) + 2, __ATOMIC_RELAXED); + + return(b); +} + +void buffer_free(BUFFER *b) { + if(unlikely(!b)) return; + + buffer_overflow_check(b); + + netdata_log_debug(D_WEB_BUFFER, "Freeing web buffer of size %zu.", b->size); + + if(b->statistics) + __atomic_sub_fetch(b->statistics, b->size + sizeof(BUFFER) + sizeof(BUFFER_OVERFLOW_EOF) + 2, __ATOMIC_RELAXED); + + freez(b->buffer); + freez(b); +} + +void buffer_increase(BUFFER *b, size_t free_size_required) { + buffer_overflow_check(b); + + size_t remaining = b->size - b->len; + if(remaining >= free_size_required) return; + + size_t increase = free_size_required - remaining; + size_t minimum = 128; + if(minimum > increase) increase = minimum; + + size_t optimal = (b->size > 
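    /* geometric growth: at or below the 5 MiB threshold the buffer at least
       doubles, above it it grows by at least 50%, amortizing reallocations */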
5*1024*1024) ? b->size / 2 : b->size; + if(optimal > increase) increase = optimal; + + netdata_log_debug(D_WEB_BUFFER, "Increasing data buffer from size %zu to %zu.", b->size, b->size + increase); + + b->buffer = reallocz(b->buffer, b->size + increase + sizeof(BUFFER_OVERFLOW_EOF) + 2); + b->size += increase; + + if(b->statistics) + __atomic_add_fetch(b->statistics, increase, __ATOMIC_RELAXED); + + buffer_overflow_init(b); + buffer_overflow_check(b); +} + +// ---------------------------------------------------------------------------- + +void buffer_json_initialize(BUFFER *wb, const char *key_quote, const char *value_quote, int depth, + bool add_anonymous_object, BUFFER_JSON_OPTIONS options) { + strncpyz(wb->json.key_quote, key_quote, BUFFER_QUOTE_MAX_SIZE); + strncpyz(wb->json.value_quote, value_quote, BUFFER_QUOTE_MAX_SIZE); + + wb->json.depth = (int8_t)(depth - 1); + _buffer_json_depth_push(wb, BUFFER_JSON_OBJECT); + + if(add_anonymous_object) + buffer_fast_strcat(wb, "{", 1); + else + options |= BUFFER_JSON_OPTIONS_NON_ANONYMOUS; + + wb->json.options = options; + + wb->content_type = CT_APPLICATION_JSON; + buffer_no_cacheable(wb); +} + +void buffer_json_finalize(BUFFER *wb) { + while(wb->json.depth >= 0) { + switch(wb->json.stack[wb->json.depth].type) { + case BUFFER_JSON_OBJECT: + if (wb->json.depth == 0) + if (!(wb->json.options & BUFFER_JSON_OPTIONS_NON_ANONYMOUS)) + buffer_json_object_close(wb); + else + _buffer_json_depth_pop(wb); + else + buffer_json_object_close(wb); + break; + case BUFFER_JSON_ARRAY: + buffer_json_array_close(wb); + break; + + default: + internal_fatal(true, "BUFFER: unknown json member type in stack"); + break; + } + } + + if(!(wb->json.options & BUFFER_JSON_OPTIONS_MINIFY)) + buffer_fast_strcat(wb, "\n", 1); +} + +// ---------------------------------------------------------------------------- + +const char hex_digits[16] = "0123456789ABCDEF"; +const char base64_digits[64] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; +unsigned char hex_value_from_ascii[256]; +unsigned char base64_value_from_ascii[256]; + +__attribute__((constructor)) void initialize_ascii_maps(void) { + for(size_t i = 0 ; i < 256 ; i++) { + hex_value_from_ascii[i] = 255; + base64_value_from_ascii[i] = 255; + } + + for(size_t i = 0; i < 16 ; i++) { + hex_value_from_ascii[(int)toupper(hex_digits[i])] = i; + hex_value_from_ascii[(int)tolower(hex_digits[i])] = i; + } + + for(size_t i = 0; i < 64 ; i++) + base64_value_from_ascii[(int)base64_digits[i]] = i; +} + +// ---------------------------------------------------------------------------- +// unit test + +static int buffer_expect(BUFFER *wb, const char *expected) { + const char *generated = buffer_tostring(wb); + + if(strcmp(generated, expected) != 0) { + netdata_log_error("BUFFER: mismatch.\nGenerated:\n%s\nExpected:\n%s\n", + generated, expected); + return 1; + } + + return 0; +} + +static int buffer_uint64_roundtrip(BUFFER *wb, NUMBER_ENCODING encoding, uint64_t value, const char *expected) { + int errors = 0; + buffer_flush(wb); + buffer_print_uint64_encoded(wb, encoding, value); + + if(expected) + errors += buffer_expect(wb, expected); + + uint64_t v = str2ull_encoded(buffer_tostring(wb)); + if(v != value) { + netdata_log_error("BUFFER: string '%s' does resolves to %llu, expected %llu", + buffer_tostring(wb), (unsigned long long)v, (unsigned long long)value); + errors++; + } + buffer_flush(wb); + return errors; +} + +static int buffer_int64_roundtrip(BUFFER *wb, NUMBER_ENCODING encoding, int64_t value, const char 
*expected) { + int errors = 0; + buffer_flush(wb); + buffer_print_int64_encoded(wb, encoding, value); + + if(expected) + errors += buffer_expect(wb, expected); + + int64_t v = str2ll_encoded(buffer_tostring(wb)); + if(v != value) { + netdata_log_error("BUFFER: string '%s' does resolves to %lld, expected %lld", + buffer_tostring(wb), (long long)v, (long long)value); + errors++; + } + buffer_flush(wb); + return errors; +} + +static int buffer_double_roundtrip(BUFFER *wb, NUMBER_ENCODING encoding, NETDATA_DOUBLE value, const char *expected) { + int errors = 0; + buffer_flush(wb); + buffer_print_netdata_double_encoded(wb, encoding, value); + + if(expected) + errors += buffer_expect(wb, expected); + + NETDATA_DOUBLE v = str2ndd_encoded(buffer_tostring(wb), NULL); + if(v != value) { + netdata_log_error("BUFFER: string '%s' does resolves to %.12f, expected %.12f", + buffer_tostring(wb), v, value); + errors++; + } + buffer_flush(wb); + return errors; +} + +int buffer_unittest(void) { + int errors = 0; + BUFFER *wb = buffer_create(0, NULL); + + buffer_uint64_roundtrip(wb, NUMBER_ENCODING_DECIMAL, 0, "0"); + buffer_uint64_roundtrip(wb, NUMBER_ENCODING_HEX, 0, "0x0"); + buffer_uint64_roundtrip(wb, NUMBER_ENCODING_BASE64, 0, "#A"); + + buffer_uint64_roundtrip(wb, NUMBER_ENCODING_DECIMAL, 1676071986, "1676071986"); + buffer_uint64_roundtrip(wb, NUMBER_ENCODING_HEX, 1676071986, "0x63E6D432"); + buffer_uint64_roundtrip(wb, NUMBER_ENCODING_BASE64, 1676071986, "#Bj5tQy"); + + buffer_uint64_roundtrip(wb, NUMBER_ENCODING_DECIMAL, 18446744073709551615ULL, "18446744073709551615"); + buffer_uint64_roundtrip(wb, NUMBER_ENCODING_HEX, 18446744073709551615ULL, "0xFFFFFFFFFFFFFFFF"); + buffer_uint64_roundtrip(wb, NUMBER_ENCODING_BASE64, 18446744073709551615ULL, "#P//////////"); + + buffer_int64_roundtrip(wb, NUMBER_ENCODING_DECIMAL, 0, "0"); + buffer_int64_roundtrip(wb, NUMBER_ENCODING_HEX, 0, "0x0"); + buffer_int64_roundtrip(wb, NUMBER_ENCODING_BASE64, 0, "#A"); + + buffer_int64_roundtrip(wb, NUMBER_ENCODING_DECIMAL, -1676071986, "-1676071986"); + buffer_int64_roundtrip(wb, NUMBER_ENCODING_HEX, -1676071986, "-0x63E6D432"); + buffer_int64_roundtrip(wb, NUMBER_ENCODING_BASE64, -1676071986, "-#Bj5tQy"); + + buffer_int64_roundtrip(wb, NUMBER_ENCODING_DECIMAL, (int64_t)-9223372036854775807ULL, "-9223372036854775807"); + buffer_int64_roundtrip(wb, NUMBER_ENCODING_HEX, (int64_t)-9223372036854775807ULL, "-0x7FFFFFFFFFFFFFFF"); + buffer_int64_roundtrip(wb, NUMBER_ENCODING_BASE64, (int64_t)-9223372036854775807ULL, "-#H//////////"); + + buffer_double_roundtrip(wb, NUMBER_ENCODING_DECIMAL, 0, "0"); + buffer_double_roundtrip(wb, NUMBER_ENCODING_HEX, 0, "%0"); + buffer_double_roundtrip(wb, NUMBER_ENCODING_BASE64, 0, "@A"); + + buffer_double_roundtrip(wb, NUMBER_ENCODING_DECIMAL, 1.5, "1.5"); + buffer_double_roundtrip(wb, NUMBER_ENCODING_HEX, 1.5, "%3FF8000000000000"); + buffer_double_roundtrip(wb, NUMBER_ENCODING_BASE64, 1.5, "@D/4AAAAAAAA"); + + buffer_double_roundtrip(wb, NUMBER_ENCODING_DECIMAL, 1.23e+14, "123000000000000"); + buffer_double_roundtrip(wb, NUMBER_ENCODING_HEX, 1.23e+14, "%42DBF78AD3AC0000"); + buffer_double_roundtrip(wb, NUMBER_ENCODING_BASE64, 1.23e+14, "@ELb94rTrAAA"); + + buffer_double_roundtrip(wb, NUMBER_ENCODING_DECIMAL, 9.12345678901234567890123456789e+45, "9.123456789012346128e+45"); + buffer_double_roundtrip(wb, NUMBER_ENCODING_HEX, 9.12345678901234567890123456789e+45, "%497991C25C9E4309"); + buffer_double_roundtrip(wb, NUMBER_ENCODING_BASE64, 9.12345678901234567890123456789e+45, "@El5kcJcnkMJ"); + + 
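    // the expected strings above document the encoding prefixes:
    // "0x" hex and "#" base64 for integers, "%" hex and "@" base64
    // for the raw IEEE-754 bits of a double, with a leading "-" for
    // negative signed values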
buffer_flush(wb); + + { + char buf[1024 + 1]; + for(size_t i = 0; i < 1024 ;i++) + buf[i] = (char)(i % 26) + 'A'; + buf[1024] = '\0'; + + buffer_strcat(wb, buf); + errors += buffer_expect(wb, buf); + } + + buffer_flush(wb); + + buffer_json_initialize(wb, "\"", "\"", 0, true, BUFFER_JSON_OPTIONS_DEFAULT); + buffer_json_finalize(wb); + errors += buffer_expect(wb, "{\n}\n"); + + buffer_flush(wb); + + buffer_json_initialize(wb, "\"", "\"", 0, true, BUFFER_JSON_OPTIONS_DEFAULT); + buffer_json_member_add_string(wb, "hello", "world"); + buffer_json_member_add_string(wb, "alpha", "this: \" is a double quote"); + buffer_json_member_add_object(wb, "object1"); + buffer_json_member_add_string(wb, "hello", "world"); + buffer_json_finalize(wb); + errors += buffer_expect(wb, "{\n \"hello\":\"world\",\n \"alpha\":\"this: \\\" is a double quote\",\n \"object1\":{\n \"hello\":\"world\"\n }\n}\n"); + + buffer_free(wb); + return errors; +} diff --git a/src/libnetdata/buffer/buffer.h b/src/libnetdata/buffer/buffer.h new file mode 100644 index 00000000..92e14afb --- /dev/null +++ b/src/libnetdata/buffer/buffer.h @@ -0,0 +1,1236 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_WEB_BUFFER_H +#define NETDATA_WEB_BUFFER_H 1 + +#include "../string/utf8.h" +#include "../libnetdata.h" + +#define BUFFER_JSON_MAX_DEPTH 32 // max is 255 + +extern const char hex_digits[16]; +extern const char base64_digits[64]; +extern unsigned char hex_value_from_ascii[256]; +extern unsigned char base64_value_from_ascii[256]; + +typedef enum __attribute__ ((__packed__)) { + BUFFER_JSON_EMPTY = 0, + BUFFER_JSON_OBJECT, + BUFFER_JSON_ARRAY, +} BUFFER_JSON_NODE_TYPE; + +typedef struct web_buffer_json_node { + BUFFER_JSON_NODE_TYPE type; + uint32_t count:24; +} BUFFER_JSON_NODE; + +#define BUFFER_QUOTE_MAX_SIZE 7 + +typedef enum __attribute__ ((__packed__)) { + WB_CONTENT_CACHEABLE = (1 << 0), + WB_CONTENT_NO_CACHEABLE = (1 << 1), +} BUFFER_OPTIONS; + +typedef enum __attribute__ ((__packed__)) { + BUFFER_JSON_OPTIONS_DEFAULT = 0, + BUFFER_JSON_OPTIONS_MINIFY = (1 << 0), + BUFFER_JSON_OPTIONS_NEWLINE_ON_ARRAY_ITEMS = (1 << 1), + BUFFER_JSON_OPTIONS_NON_ANONYMOUS = (1 << 2), +} BUFFER_JSON_OPTIONS; + +typedef struct web_buffer { + size_t size; // allocation size of buffer, in bytes + size_t len; // current data length in buffer, in bytes + char *buffer; // the buffer itself + HTTP_CONTENT_TYPE content_type; // the content type of the data in the buffer + BUFFER_OPTIONS options; // options related to the content + time_t date; // the timestamp this content has been generated + time_t expires; // the timestamp this content expires + size_t *statistics; + + struct { + char key_quote[BUFFER_QUOTE_MAX_SIZE + 1]; + char value_quote[BUFFER_QUOTE_MAX_SIZE + 1]; + int8_t depth; + BUFFER_JSON_OPTIONS options; + BUFFER_JSON_NODE stack[BUFFER_JSON_MAX_DEPTH]; + } json; +} BUFFER; + +#define CLEAN_BUFFER _cleanup_(buffer_freep) BUFFER + +#define buffer_cacheable(wb) do { (wb)->options |= WB_CONTENT_CACHEABLE; if((wb)->options & WB_CONTENT_NO_CACHEABLE) (wb)->options &= ~WB_CONTENT_NO_CACHEABLE; } while(0) +#define buffer_no_cacheable(wb) do { (wb)->options |= WB_CONTENT_NO_CACHEABLE; if((wb)->options & WB_CONTENT_CACHEABLE) (wb)->options &= ~WB_CONTENT_CACHEABLE; (wb)->expires = 0; } while(0) + +#define buffer_strlen(wb) ((wb)->len) + +#define BUFFER_OVERFLOW_EOF "EOF" + +#ifdef NETDATA_INTERNAL_CHECKS +#define buffer_overflow_check(b) _buffer_overflow_check(b) +#else +#define buffer_overflow_check(b) +#endif + +static inline void 
_buffer_overflow_check(BUFFER *b __maybe_unused) { + assert(b->len <= b->size && + "BUFFER: length is above buffer size."); + + assert(!(b->buffer && (b->buffer[b->size] != '\0' || strcmp(&b->buffer[b->size + 1], BUFFER_OVERFLOW_EOF) != 0)) && + "BUFFER: detected overflow."); +} + +static inline void buffer_flush(BUFFER *wb) { + wb->len = 0; + + wb->json.depth = 0; + wb->json.stack[0].type = BUFFER_JSON_EMPTY; + wb->json.stack[0].count = 0; + + if(wb->buffer) + wb->buffer[0] = '\0'; +} + +void buffer_reset(BUFFER *wb); + +void buffer_date(BUFFER *wb, int year, int month, int day, int hours, int minutes, int seconds); +void buffer_jsdate(BUFFER *wb, int year, int month, int day, int hours, int minutes, int seconds); + +BUFFER *buffer_create(size_t size, size_t *statistics); +void buffer_free(BUFFER *b); +void buffer_increase(BUFFER *b, size_t free_size_required); + +static inline void buffer_freep(BUFFER **bp) { + if(bp) buffer_free(*bp); +} + +void buffer_snprintf(BUFFER *wb, size_t len, const char *fmt, ...) PRINTFLIKE(3, 4); +void buffer_vsprintf(BUFFER *wb, const char *fmt, va_list args); +void buffer_sprintf(BUFFER *wb, const char *fmt, ...) PRINTFLIKE(2,3); +void buffer_strcat_htmlescape(BUFFER *wb, const char *txt); + +void buffer_char_replace(BUFFER *wb, char from, char to); + +void buffer_print_sn_flags(BUFFER *wb, SN_FLAGS flags, bool send_anomaly_bit); + +static inline void buffer_need_bytes(BUFFER *buffer, size_t needed_free_size) { + if(unlikely(buffer->len + needed_free_size >= buffer->size)) + buffer_increase(buffer, needed_free_size + 1); +} + +void buffer_json_initialize(BUFFER *wb, const char *key_quote, const char *value_quote, int depth, + bool add_anonymous_object, BUFFER_JSON_OPTIONS options); + +void buffer_json_finalize(BUFFER *wb); + +static const char *buffer_tostring(BUFFER *wb) +{ + if(unlikely(!wb)) + return NULL; + + buffer_need_bytes(wb, 1); + wb->buffer[wb->len] = '\0'; + + buffer_overflow_check(wb); + + return(wb->buffer); +} + +static inline void _buffer_json_depth_push(BUFFER *wb, BUFFER_JSON_NODE_TYPE type) { +#ifdef NETDATA_INTERNAL_CHECKS + assert(wb->json.depth <= BUFFER_JSON_MAX_DEPTH && "BUFFER JSON: max nesting reached"); +#endif + wb->json.depth++; +#ifdef NETDATA_INTERNAL_CHECKS + assert(wb->json.depth >= 0 && "Depth wrapped around and is negative"); +#endif + wb->json.stack[wb->json.depth].count = 0; + wb->json.stack[wb->json.depth].type = type; +} + +static inline void _buffer_json_depth_pop(BUFFER *wb) { + wb->json.depth--; +} + +static inline void buffer_fast_charcat(BUFFER *wb, const char c) { + + buffer_need_bytes(wb, 2); + *(&wb->buffer[wb->len]) = c; + wb->len += 1; + wb->buffer[wb->len] = '\0'; + + buffer_overflow_check(wb); +} + +static inline void buffer_fast_rawcat(BUFFER *wb, const char *txt, size_t len) { + if(unlikely(!txt || !*txt || !len)) return; + + buffer_need_bytes(wb, len + 1); + + const char *t = txt; + const char *e = &txt[len]; + + char *d = &wb->buffer[wb->len]; + + while(t != e) + *d++ = *t++; + + wb->len += len; + wb->buffer[wb->len] = '\0'; + + buffer_overflow_check(wb); +} + +static inline void buffer_putc(BUFFER *wb, char c) { + buffer_need_bytes(wb, 2); + wb->buffer[wb->len++] = c; + wb->buffer[wb->len] = '\0'; + buffer_overflow_check(wb); +} + +static inline void buffer_fast_strcat(BUFFER *wb, const char *txt, size_t len) { + if(unlikely(!txt || !*txt || !len)) return; + + buffer_need_bytes(wb, len + 1); + + const char *t = txt; + const char *e = &txt[len]; + + char *d = &wb->buffer[wb->len]; + + while(t != e 
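            /* in debug builds the copy also stops at a premature NUL, so the
               assert below catches callers passing a len longer than the
               source string */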
+#ifdef NETDATA_INTERNAL_CHECKS + && *t +#endif + ) + *d++ = *t++; + +#ifdef NETDATA_INTERNAL_CHECKS + assert(!(t != e && !*t) && "BUFFER: source string is shorter than the length given."); +#endif + + wb->len += len; + wb->buffer[wb->len] = '\0'; + + buffer_overflow_check(wb); +} + +static inline void buffer_strcat(BUFFER *wb, const char *txt) { + if(unlikely(!txt || !*txt)) return; + + const char *t = txt; + while(*t) { + buffer_need_bytes(wb, 100); + char *s = &wb->buffer[wb->len]; + char *d = s; + const char *e = &wb->buffer[wb->size]; + + while(*t && d < e) + *d++ = *t++; + + wb->len += d - s; + } + + buffer_need_bytes(wb, 1); + wb->buffer[wb->len] = '\0'; + + buffer_overflow_check(wb); +} + +static inline void buffer_contents_replace(BUFFER *wb, const char *txt, size_t len) { + wb->len = 0; + buffer_need_bytes(wb, len + 1); + + memcpy(wb->buffer, txt, len); + wb->len = len; + wb->buffer[wb->len] = '\0'; + + buffer_overflow_check(wb); +} + +static inline void buffer_strncat(BUFFER *wb, const char *txt, size_t len) { + if(unlikely(!txt || !*txt)) return; + + buffer_need_bytes(wb, len + 1); + + memcpy(&wb->buffer[wb->len], txt, len); + + wb->len += len; + wb->buffer[wb->len] = '\0'; + + buffer_overflow_check(wb); +} + +static inline void buffer_memcat(BUFFER *wb, const void *mem, size_t bytes) { + if(unlikely(!mem)) return; + + buffer_need_bytes(wb, bytes + 1); + + memcpy(&wb->buffer[wb->len], mem, bytes); + + wb->len += bytes; + wb->buffer[wb->len] = '\0'; + + buffer_overflow_check(wb); +} + +static inline void buffer_json_strcat(BUFFER *wb, const char *txt) +{ + if(unlikely(!txt || !*txt)) return; + + const unsigned char *t = (const unsigned char *)txt; + while(*t) { + buffer_need_bytes(wb, 110); + unsigned char *s = (unsigned char *)&wb->buffer[wb->len]; + unsigned char *d = s; + const unsigned char *e = (unsigned char *)&wb->buffer[wb->size - 10]; // make room for the max escape sequence + + while(*t && d < e) { +#ifdef BUFFER_JSON_ESCAPE_UTF + if(unlikely(IS_UTF8_STARTBYTE(*t) && IS_UTF8_BYTE(t[1]))) { + // UTF-8 multi-byte encoded character + + // find how big this character is (2-4 bytes) + size_t utf_character_size = 2; + while(utf_character_size < 4 && t[utf_character_size] && IS_UTF8_BYTE(t[utf_character_size]) && !IS_UTF8_STARTBYTE(t[utf_character_size])) + utf_character_size++; + + uint32_t code_point = 0; + for (size_t i = 0; i < utf_character_size; i++) { + code_point <<= 6; + code_point |= (t[i] & 0x3F); + } + + t += utf_character_size; + + // encode as \u escape sequence + *d++ = '\\'; + *d++ = 'u'; + *d++ = hex_digits[(code_point >> 12) & 0xf]; + *d++ = hex_digits[(code_point >> 8) & 0xf]; + *d++ = hex_digits[(code_point >> 4) & 0xf]; + *d++ = hex_digits[code_point & 0xf]; + } + else +#endif + if(unlikely(*t < ' ')) { + uint32_t v = *t++; + *d++ = '\\'; + *d++ = 'u'; + *d++ = hex_digits[(v >> 12) & 0xf]; + *d++ = hex_digits[(v >> 8) & 0xf]; + *d++ = hex_digits[(v >> 4) & 0xf]; + *d++ = hex_digits[v & 0xf]; + } + else { + if (unlikely(*t == '\\' || *t == '\"')) + *d++ = '\\'; + + *d++ = *t++; + } + } + + wb->len += d - s; + } + + buffer_need_bytes(wb, 1); + wb->buffer[wb->len] = '\0'; + + buffer_overflow_check(wb); +} + +static inline void buffer_json_quoted_strcat(BUFFER *wb, const char *txt) { + if(unlikely(!txt || !*txt)) return; + + if(*txt == '"') + txt++; + + const char *t = txt; + while(*t) { + buffer_need_bytes(wb, 100); + char *s = &wb->buffer[wb->len]; + char *d = s; + const char *e = &wb->buffer[wb->size - 1]; // remove 1 to make room for the escape character 
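        /* the value arrives pre-quoted: the leading quote was skipped on
           entry, the trailing one is dropped inside the loop, and embedded
           quotes and backslashes get escaped */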
+ + while(*t && d < e) { + if(unlikely(*t == '"' && !t[1])) { + t++; + continue; + } + + if(unlikely(*t == '\\' || *t == '"')) + *d++ = '\\'; + + *d++ = *t++; + } + + wb->len += d - s; + } + + buffer_need_bytes(wb, 1); + wb->buffer[wb->len] = '\0'; + + buffer_overflow_check(wb); +} + +// This trick seems to give an 80% speed increase in 32bit systems +// print_number_llu_r() will just print the digits up to the +// point the remaining value fits in 32 bits, and then calls +// print_number_lu_r() to print the rest with 32 bit arithmetic. + +static inline char *print_uint32_reversed(char *dst, uint32_t value) { + char *d = dst; + do *d++ = (char)('0' + (value % 10)); while((value /= 10)); + return d; +} + +static inline char *print_uint64_reversed(char *dst, uint64_t value) { +#ifdef ENV32BIT + if(value <= (uint64_t)0xffffffff) + return print_uint32_reversed(dst, value); + + char *d = dst; + do *d++ = (char)('0' + (value % 10)); while((value /= 10) && value > (uint64_t)0xffffffff); + if(value) return print_uint32_reversed(d, value); + return d; +#else + char *d = dst; + do *d++ = (char)('0' + (value % 10)); while((value /= 10)); + return d; +#endif +} + +static inline char *print_uint32_hex_reversed(char *dst, uint32_t value) { + static const char *digits = "0123456789ABCDEF"; + char *d = dst; + do *d++ = digits[value & 0xf]; while((value >>= 4)); + return d; +} + +static inline char *print_uint64_hex_reversed(char *dst, uint64_t value) { +#ifdef ENV32BIT + if(value <= (uint64_t)0xffffffff) + return print_uint32_hex_reversed(dst, value); + + char *d = dst; + do *d++ = hex_digits[value & 0xf]; while((value >>= 4) && value > (uint64_t)0xffffffff); + if(value) return print_uint32_hex_reversed(d, value); + return d; +#else + char *d = dst; + do *d++ = hex_digits[value & 0xf]; while((value >>= 4)); + return d; +#endif +} + +static inline char *print_uint64_base64_reversed(char *dst, uint64_t value) { + char *d = dst; + do *d++ = base64_digits[value & 63]; while ((value >>= 6)); + return d; +} + +static inline void char_array_reverse(char *from, char *to) { + // from and to are inclusive + char *begin = from, *end = to, aux; + while (end > begin) aux = *end, *end-- = *begin, *begin++ = aux; +} + +static inline int print_netdata_double(char *dst, NETDATA_DOUBLE value) { + char *s = dst; + + if(unlikely(value < 0)) { + *s++ = '-'; + value = fabsndd(value); + } + + uint64_t fractional_precision = 10000000ULL; // fractional part 7 digits + int fractional_wanted_digits = 7; + int exponent = 0; + if(unlikely(value >= (NETDATA_DOUBLE)(UINT64_MAX / 10))) { + // the number is too big to print using 64bit numbers + // so, let's convert it to exponential notation + exponent = (int)(floorndd(log10ndd(value))); + value /= powndd(10, exponent); + + // the max precision we can support is 18 digits + // (UINT64_MAX is 20, but the first is 1) + fractional_precision = 1000000000000000000ULL; // fractional part 18 digits + fractional_wanted_digits = 18; + } + + char *d = s; + NETDATA_DOUBLE integral_d, fractional_d; + fractional_d = modfndd(value, &integral_d); + + // get the integral and the fractional parts as 64-bit integers + uint64_t integral = (uint64_t)integral_d; + uint64_t fractional = (uint64_t)llrintndd(fractional_d * (NETDATA_DOUBLE)fractional_precision); + if(unlikely(fractional >= fractional_precision)) { + integral++; + fractional -= fractional_precision; + } + + // convert the integral part to string (reversed) + d = print_uint64_reversed(d, integral); + char_array_reverse(s, d - 1); // copy reversed 
the integral string + + if(likely(fractional != 0)) { + *d++ = '.'; // add the dot + + // convert the fractional part to string (reversed) + d = print_uint64_reversed(s = d, fractional); + + while(d - s < fractional_wanted_digits) *d++ = '0'; // prepend zeros to reach precision + char_array_reverse(s, d - 1); // copy reversed the fractional string + + // remove trailing zeros from the fractional part + while(*(d - 1) == '0') d--; + } + + if(unlikely(exponent != 0)) { + *d++ = 'e'; + *d++ = '+'; + d = print_uint32_reversed(s = d, exponent); + char_array_reverse(s, d - 1); + } + + *d = '\0'; + return (int)(d - dst); +} + +static inline void buffer_print_uint64(BUFFER *wb, uint64_t value) { + buffer_need_bytes(wb, 50); + + char *s = &wb->buffer[wb->len]; + char *d = print_uint64_reversed(s, value); + char_array_reverse(s, d - 1); + *d = '\0'; + wb->len += d - s; + + buffer_overflow_check(wb); +} + +static inline void buffer_print_int64(BUFFER *wb, int64_t value) { + buffer_need_bytes(wb, 50); + + if(value < 0) { + buffer_fast_strcat(wb, "-", 1); + value = -value; + } + + buffer_print_uint64(wb, (uint64_t)value); + + buffer_overflow_check(wb); +} + +static inline void buffer_print_uint64_hex(BUFFER *wb, uint64_t value) { + buffer_need_bytes(wb, sizeof(uint64_t) * 2 + 2 + 1); + + buffer_fast_strcat(wb, HEX_PREFIX, sizeof(HEX_PREFIX) - 1); + + char *s = &wb->buffer[wb->len]; + char *d = print_uint64_hex_reversed(s, value); + char_array_reverse(s, d - 1); + *d = '\0'; + wb->len += d - s; + + buffer_overflow_check(wb); +} + +static inline void buffer_print_uint64_base64(BUFFER *wb, uint64_t value) { + buffer_need_bytes(wb, sizeof(uint64_t) * 2 + 2 + 1); + + buffer_fast_strcat(wb, IEEE754_UINT64_B64_PREFIX, sizeof(IEEE754_UINT64_B64_PREFIX) - 1); + + char *s = &wb->buffer[wb->len]; + char *d = print_uint64_base64_reversed(s, value); + char_array_reverse(s, d - 1); + *d = '\0'; + wb->len += d - s; + + buffer_overflow_check(wb); +} + +static inline void buffer_print_int64_hex(BUFFER *wb, int64_t value) { + buffer_need_bytes(wb, 2); + + if(value < 0) { + buffer_fast_strcat(wb, "-", 1); + value = -value; + } + + buffer_print_uint64_hex(wb, (uint64_t)value); + + buffer_overflow_check(wb); +} + +static inline void buffer_print_int64_base64(BUFFER *wb, int64_t value) { + buffer_need_bytes(wb, 2); + + if(value < 0) { + buffer_fast_strcat(wb, "-", 1); + value = -value; + } + + buffer_print_uint64_base64(wb, (uint64_t)value); + + buffer_overflow_check(wb); +} + +static inline void buffer_print_netdata_double(BUFFER *wb, NETDATA_DOUBLE value) { + buffer_need_bytes(wb, 512 + 2); + + if(isnan(value) || isinf(value)) { + buffer_fast_strcat(wb, "null", 4); + return; + } + else + wb->len += print_netdata_double(&wb->buffer[wb->len], value); + + // terminate it + buffer_need_bytes(wb, 1); + wb->buffer[wb->len] = '\0'; + + buffer_overflow_check(wb); +} + +static inline void buffer_print_netdata_double_hex(BUFFER *wb, NETDATA_DOUBLE value) { + buffer_need_bytes(wb, sizeof(uint64_t) * 2 + 2 + 1 + 1); + + uint64_t *ptr = (uint64_t *) (&value); + buffer_fast_strcat(wb, IEEE754_DOUBLE_HEX_PREFIX, sizeof(IEEE754_DOUBLE_HEX_PREFIX) - 1); + + char *s = &wb->buffer[wb->len]; + char *d = print_uint64_hex_reversed(s, *ptr); + char_array_reverse(s, d - 1); + *d = '\0'; + wb->len += d - s; + + buffer_overflow_check(wb); +} + +static inline void buffer_print_netdata_double_base64(BUFFER *wb, NETDATA_DOUBLE value) { + buffer_need_bytes(wb, sizeof(uint64_t) * 2 + 2 + 1 + 1); + + uint64_t *ptr = (uint64_t *) (&value); + 
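    /* reinterpret the double's IEEE-754 bits as a uint64 and emit them
       verbatim after the prefix: lossless, and cheaper than printing and
       reparsing a decimal representation */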
buffer_fast_strcat(wb, IEEE754_DOUBLE_B64_PREFIX, sizeof(IEEE754_DOUBLE_B64_PREFIX) - 1); + + char *s = &wb->buffer[wb->len]; + char *d = print_uint64_base64_reversed(s, *ptr); + char_array_reverse(s, d - 1); + *d = '\0'; + wb->len += d - s; + + buffer_overflow_check(wb); +} + +typedef enum { + NUMBER_ENCODING_DECIMAL, + NUMBER_ENCODING_HEX, + NUMBER_ENCODING_BASE64, +} NUMBER_ENCODING; + +static inline void buffer_print_int64_encoded(BUFFER *wb, NUMBER_ENCODING encoding, int64_t value) { + if(encoding == NUMBER_ENCODING_BASE64) + return buffer_print_int64_base64(wb, value); + + if(encoding == NUMBER_ENCODING_HEX) + return buffer_print_int64_hex(wb, value); + + return buffer_print_int64(wb, value); +} + +static inline void buffer_print_uint64_encoded(BUFFER *wb, NUMBER_ENCODING encoding, uint64_t value) { + if(encoding == NUMBER_ENCODING_BASE64) + return buffer_print_uint64_base64(wb, value); + + if(encoding == NUMBER_ENCODING_HEX) + return buffer_print_uint64_hex(wb, value); + + return buffer_print_uint64(wb, value); +} + +static inline void buffer_print_netdata_double_encoded(BUFFER *wb, NUMBER_ENCODING encoding, NETDATA_DOUBLE value) { + if(encoding == NUMBER_ENCODING_BASE64) + return buffer_print_netdata_double_base64(wb, value); + + if(encoding == NUMBER_ENCODING_HEX) + return buffer_print_netdata_double_hex(wb, value); + + return buffer_print_netdata_double(wb, value); +} + +static inline void buffer_print_spaces(BUFFER *wb, size_t spaces) { + buffer_need_bytes(wb, spaces * 4 + 1); + + char *d = &wb->buffer[wb->len]; + for(size_t i = 0; i < spaces; i++) { + *d++ = ' '; + *d++ = ' '; + *d++ = ' '; + *d++ = ' '; + } + + *d = '\0'; + wb->len += spaces * 4; + + buffer_overflow_check(wb); +} + +static inline void buffer_print_json_comma(BUFFER *wb) { + if(wb->json.stack[wb->json.depth].count) + buffer_fast_strcat(wb, ",", 1); +} + +static inline void buffer_print_json_comma_newline_spacing(BUFFER *wb) { + buffer_print_json_comma(wb); + + if((wb->json.options & BUFFER_JSON_OPTIONS_MINIFY) || + (wb->json.stack[wb->json.depth].type == BUFFER_JSON_ARRAY && !(wb->json.options & BUFFER_JSON_OPTIONS_NEWLINE_ON_ARRAY_ITEMS))) + return; + + buffer_fast_strcat(wb, "\n", 1); + buffer_print_spaces(wb, wb->json.depth + 1); +} + +static inline void buffer_print_json_key(BUFFER *wb, const char *key) { + buffer_strcat(wb, wb->json.key_quote); + buffer_json_strcat(wb, key); + buffer_strcat(wb, wb->json.key_quote); +} + +static inline void buffer_json_add_string_value(BUFFER *wb, const char *value) { + if(value) { + buffer_strcat(wb, wb->json.value_quote); + buffer_json_strcat(wb, value); + buffer_strcat(wb, wb->json.value_quote); + } + else + buffer_fast_strcat(wb, "null", 4); +} + +static inline void buffer_json_add_quoted_string_value(BUFFER *wb, const char *value) { + if(value) { + buffer_strcat(wb, wb->json.value_quote); + buffer_json_quoted_strcat(wb, value); + buffer_strcat(wb, wb->json.value_quote); + } + else + buffer_fast_strcat(wb, "null", 4); +} + +static inline void buffer_json_member_add_object(BUFFER *wb, const char *key) { + buffer_print_json_comma_newline_spacing(wb); + buffer_print_json_key(wb, key); + buffer_fast_strcat(wb, ":{", 2); + wb->json.stack[wb->json.depth].count++; + + _buffer_json_depth_push(wb, BUFFER_JSON_OBJECT); +} + +static inline void buffer_json_object_close(BUFFER *wb) { +#ifdef NETDATA_INTERNAL_CHECKS + assert(wb->json.depth >= 0 && "BUFFER JSON: nothing is open to close it"); + assert(wb->json.stack[wb->json.depth].type == BUFFER_JSON_OBJECT && "BUFFER JSON: an 
object is not open to close it"); +#endif + if(!(wb->json.options & BUFFER_JSON_OPTIONS_MINIFY)) { + buffer_fast_strcat(wb, "\n", 1); + buffer_print_spaces(wb, wb->json.depth); + } + buffer_fast_strcat(wb, "}", 1); + _buffer_json_depth_pop(wb); +} + +static inline void buffer_json_member_add_string(BUFFER *wb, const char *key, const char *value) { + buffer_print_json_comma_newline_spacing(wb); + buffer_print_json_key(wb, key); + buffer_fast_strcat(wb, ":", 1); + buffer_json_add_string_value(wb, value); + + wb->json.stack[wb->json.depth].count++; +} + +static inline void buffer_json_member_add_string_or_omit(BUFFER *wb, const char *key, const char *value) { + if(value && *value) + buffer_json_member_add_string(wb, key, value); +} + +static inline void buffer_json_member_add_string_or_empty(BUFFER *wb, const char *key, const char *value) { + if(!value) + value = ""; + + buffer_json_member_add_string(wb, key, value); +} + +static inline void buffer_json_member_add_quoted_string(BUFFER *wb, const char *key, const char *value) { + buffer_print_json_comma_newline_spacing(wb); + buffer_print_json_key(wb, key); + buffer_fast_strcat(wb, ":", 1); + + if(!value || strcmp(value, "null") == 0) + buffer_fast_strcat(wb, "null", 4); + else + buffer_json_add_quoted_string_value(wb, value); + + wb->json.stack[wb->json.depth].count++; +} + +static inline void buffer_json_member_add_uuid(BUFFER *wb, const char *key, nd_uuid_t *value) { + buffer_print_json_comma_newline_spacing(wb); + buffer_print_json_key(wb, key); + buffer_fast_strcat(wb, ":", 1); + + if(value && !uuid_is_null(*value)) { + char uuid[GUID_LEN + 1]; + uuid_unparse_lower(*value, uuid); + buffer_json_add_string_value(wb, uuid); + } + else + buffer_json_add_string_value(wb, NULL); + + wb->json.stack[wb->json.depth].count++; +} + +static inline void buffer_json_member_add_boolean(BUFFER *wb, const char *key, bool value) { + buffer_print_json_comma_newline_spacing(wb); + buffer_print_json_key(wb, key); + buffer_fast_strcat(wb, ":", 1); + buffer_strcat(wb, value?"true":"false"); + + wb->json.stack[wb->json.depth].count++; +} + +static inline void buffer_json_member_add_array(BUFFER *wb, const char *key) { + buffer_print_json_comma_newline_spacing(wb); + if (key) { + buffer_print_json_key(wb, key); + buffer_fast_strcat(wb, ":[", 2); + } + else + buffer_fast_strcat(wb, "[", 1); + + wb->json.stack[wb->json.depth].count++; + + _buffer_json_depth_push(wb, BUFFER_JSON_ARRAY); +} + +static inline void buffer_json_add_array_item_array(BUFFER *wb) { + if(!(wb->json.options & BUFFER_JSON_OPTIONS_MINIFY) && wb->json.stack[wb->json.depth].type == BUFFER_JSON_ARRAY) { + // an array inside another array + buffer_print_json_comma(wb); + buffer_fast_strcat(wb, "\n", 1); + buffer_print_spaces(wb, wb->json.depth + 1); + } + else + buffer_print_json_comma_newline_spacing(wb); + + buffer_fast_strcat(wb, "[", 1); + wb->json.stack[wb->json.depth].count++; + + _buffer_json_depth_push(wb, BUFFER_JSON_ARRAY); +} + +static inline void buffer_json_add_array_item_string(BUFFER *wb, const char *value) { + buffer_print_json_comma_newline_spacing(wb); + + buffer_json_add_string_value(wb, value); + wb->json.stack[wb->json.depth].count++; +} + +static inline void buffer_json_add_array_item_uuid(BUFFER *wb, nd_uuid_t *value) { + if(value && !uuid_is_null(*value)) { + char uuid[GUID_LEN + 1]; + uuid_unparse_lower(*value, uuid); + buffer_json_add_array_item_string(wb, uuid); + } + else + buffer_json_add_array_item_string(wb, NULL); +} + +static inline void 
buffer_json_add_array_item_uuid_compact(BUFFER *wb, nd_uuid_t *value) { + if(value && !uuid_is_null(*value)) { + char uuid[GUID_LEN + 1]; + uuid_unparse_lower_compact(*value, uuid); + buffer_json_add_array_item_string(wb, uuid); + } + else + buffer_json_add_array_item_string(wb, NULL); +} + +static inline void buffer_json_add_array_item_double(BUFFER *wb, NETDATA_DOUBLE value) { + buffer_print_json_comma_newline_spacing(wb); + + buffer_print_netdata_double(wb, value); + wb->json.stack[wb->json.depth].count++; +} + +static inline void buffer_json_add_array_item_int64(BUFFER *wb, int64_t value) { + buffer_print_json_comma_newline_spacing(wb); + + buffer_print_int64(wb, value); + wb->json.stack[wb->json.depth].count++; +} + +static inline void buffer_json_add_array_item_uint64(BUFFER *wb, uint64_t value) { + buffer_print_json_comma_newline_spacing(wb); + + buffer_print_uint64(wb, value); + wb->json.stack[wb->json.depth].count++; +} + +static inline void buffer_json_add_array_item_boolean(BUFFER *wb, bool value) { + buffer_print_json_comma_newline_spacing(wb); + + buffer_strcat(wb, value ? "true" : "false"); + wb->json.stack[wb->json.depth].count++; +} + +static inline void buffer_json_add_array_item_time_t(BUFFER *wb, time_t value) { + buffer_print_json_comma_newline_spacing(wb); + + buffer_print_int64(wb, value); + wb->json.stack[wb->json.depth].count++; +} + +static inline void buffer_json_add_array_item_time_ms(BUFFER *wb, time_t value) { + buffer_print_json_comma_newline_spacing(wb); + + buffer_print_int64(wb, value); + buffer_fast_strcat(wb, "000", 3); + wb->json.stack[wb->json.depth].count++; +} + +static inline void buffer_json_add_array_item_time_t2ms(BUFFER *wb, time_t value) { + buffer_print_json_comma_newline_spacing(wb); + + buffer_print_int64(wb, value); + buffer_fast_strcat(wb, "000", 3); + wb->json.stack[wb->json.depth].count++; +} + +static inline void buffer_json_add_array_item_object(BUFFER *wb) { + buffer_print_json_comma_newline_spacing(wb); + + buffer_fast_strcat(wb, "{", 1); + wb->json.stack[wb->json.depth].count++; + + _buffer_json_depth_push(wb, BUFFER_JSON_OBJECT); +} + +static inline void buffer_json_member_add_time_t(BUFFER *wb, const char *key, time_t value) { + buffer_print_json_comma_newline_spacing(wb); + buffer_print_json_key(wb, key); + buffer_fast_strcat(wb, ":", 1); + buffer_print_int64(wb, value); + + wb->json.stack[wb->json.depth].count++; +} + +static inline void buffer_json_member_add_time_t2ms(BUFFER *wb, const char *key, time_t value) { + buffer_print_json_comma_newline_spacing(wb); + buffer_print_json_key(wb, key); + buffer_fast_strcat(wb, ":", 1); + buffer_print_int64(wb, value); + buffer_fast_strcat(wb, "000", 3); + + wb->json.stack[wb->json.depth].count++; +} + +static inline void buffer_json_member_add_uint64(BUFFER *wb, const char *key, uint64_t value) { + buffer_print_json_comma_newline_spacing(wb); + buffer_print_json_key(wb, key); + buffer_fast_strcat(wb, ":", 1); + buffer_print_uint64(wb, value); + + wb->json.stack[wb->json.depth].count++; +} + +static inline void buffer_json_member_add_int64(BUFFER *wb, const char *key, int64_t value) { + buffer_print_json_comma_newline_spacing(wb); + buffer_print_json_key(wb, key); + buffer_fast_strcat(wb, ":", 1); + buffer_print_int64(wb, value); + + wb->json.stack[wb->json.depth].count++; +} + +static inline void buffer_json_member_add_double(BUFFER *wb, const char *key, NETDATA_DOUBLE value) { + buffer_print_json_comma_newline_spacing(wb); + buffer_print_json_key(wb, key); + buffer_fast_strcat(wb, ":", 
1); + buffer_print_netdata_double(wb, value); + + wb->json.stack[wb->json.depth].count++; +} + +static inline void buffer_json_array_close(BUFFER *wb) { +#ifdef NETDATA_INTERNAL_CHECKS + assert(wb->json.depth >= 0 && "BUFFER JSON: nothing is open to close it"); + assert(wb->json.stack[wb->json.depth].type == BUFFER_JSON_ARRAY && "BUFFER JSON: an array is not open to close it"); +#endif + if(wb->json.options & BUFFER_JSON_OPTIONS_NEWLINE_ON_ARRAY_ITEMS) { + buffer_fast_strcat(wb, "\n", 1); + buffer_print_spaces(wb, wb->json.depth); + } + + buffer_fast_strcat(wb, "]", 1); + _buffer_json_depth_pop(wb); +} + +typedef enum __attribute__((packed)) { + RRDF_FIELD_OPTS_NONE = 0, + RRDF_FIELD_OPTS_UNIQUE_KEY = (1 << 0), // the field is the unique key of the row + RRDF_FIELD_OPTS_VISIBLE = (1 << 1), // the field should be visible by default + RRDF_FIELD_OPTS_STICKY = (1 << 2), // the field should be sticky + RRDF_FIELD_OPTS_FULL_WIDTH = (1 << 3), // the field should get full width + RRDF_FIELD_OPTS_WRAP = (1 << 4), // the field should wrap + RRDF_FIELD_OPTS_DUMMY = (1 << 5), // not a presentable field + RRDF_FIELD_OPTS_EXPANDED_FILTER = (1 << 6), // show the filter expanded +} RRDF_FIELD_OPTIONS; + +typedef enum __attribute__((packed)) { + RRDF_FIELD_TYPE_NONE, + RRDF_FIELD_TYPE_INTEGER, + RRDF_FIELD_TYPE_BOOLEAN, + RRDF_FIELD_TYPE_STRING, + RRDF_FIELD_TYPE_DETAIL_STRING, + RRDF_FIELD_TYPE_BAR_WITH_INTEGER, + RRDF_FIELD_TYPE_DURATION, + RRDF_FIELD_TYPE_TIMESTAMP, + RRDF_FIELD_TYPE_ARRAY, +} RRDF_FIELD_TYPE; + +static inline const char *rrdf_field_type_to_string(RRDF_FIELD_TYPE type) { + switch(type) { + default: + case RRDF_FIELD_TYPE_NONE: + return "none"; + + case RRDF_FIELD_TYPE_INTEGER: + return "integer"; + + case RRDF_FIELD_TYPE_BOOLEAN: + return "boolean"; + + case RRDF_FIELD_TYPE_STRING: + return "string"; + + case RRDF_FIELD_TYPE_DETAIL_STRING: + return "detail-string"; + + case RRDF_FIELD_TYPE_BAR_WITH_INTEGER: + return "bar-with-integer"; + + case RRDF_FIELD_TYPE_DURATION: + return "duration"; + + case RRDF_FIELD_TYPE_TIMESTAMP: + return "timestamp"; + + case RRDF_FIELD_TYPE_ARRAY: + return "array"; + } +} + +typedef enum __attribute__((packed)) { + RRDF_FIELD_VISUAL_VALUE, // show the value, possibly applying a transformation + RRDF_FIELD_VISUAL_BAR, // show the value and a bar, respecting the max field to fill the bar at 100% + RRDF_FIELD_VISUAL_PILL, // + RRDF_FIELD_VISUAL_RICH, // + RRDR_FIELD_VISUAL_ROW_OPTIONS, // this is a dummy column that is used for row options +} RRDF_FIELD_VISUAL; + +static inline const char *rrdf_field_visual_to_string(RRDF_FIELD_VISUAL visual) { + switch(visual) { + default: + case RRDF_FIELD_VISUAL_VALUE: + return "value"; + + case RRDF_FIELD_VISUAL_BAR: + return "bar"; + + case RRDF_FIELD_VISUAL_PILL: + return "pill"; + + case RRDF_FIELD_VISUAL_RICH: + return "richValue"; + + case RRDR_FIELD_VISUAL_ROW_OPTIONS: + return "rowOptions"; + } +} + +typedef enum __attribute__((packed)) { + RRDF_FIELD_TRANSFORM_NONE, // show the value as-is + RRDF_FIELD_TRANSFORM_NUMBER, // show the value respecting the decimal_points + RRDF_FIELD_TRANSFORM_DURATION_S, // transform as duration in second to a human-readable duration + RRDF_FIELD_TRANSFORM_DATETIME_MS, // UNIX epoch timestamp in ms + RRDF_FIELD_TRANSFORM_DATETIME_USEC, // UNIX epoch timestamp in usec +} RRDF_FIELD_TRANSFORM; + +static inline const char *rrdf_field_transform_to_string(RRDF_FIELD_TRANSFORM transform) { + switch(transform) { + default: + case RRDF_FIELD_TRANSFORM_NONE: + return "none"; + + case 
RRDF_FIELD_TRANSFORM_NUMBER: + return "number"; + + case RRDF_FIELD_TRANSFORM_DURATION_S: + return "duration"; + + case RRDF_FIELD_TRANSFORM_DATETIME_MS: + return "datetime"; + + case RRDF_FIELD_TRANSFORM_DATETIME_USEC: + return "datetime_usec"; + } +} + +typedef enum __attribute__((packed)) { + RRDF_FIELD_SORT_ASCENDING = (1 << 0), + RRDF_FIELD_SORT_DESCENDING = (1 << 1), + + RRDF_FIELD_SORT_FIXED = (1 << 7), +} RRDF_FIELD_SORT; + +static inline const char *rrdf_field_sort_to_string(RRDF_FIELD_SORT sort) { + if(sort & RRDF_FIELD_SORT_DESCENDING) + return "descending"; + + else + return "ascending"; +} + +typedef enum __attribute__((packed)) { + RRDF_FIELD_SUMMARY_UNIQUECOUNT, // Finds the number of unique values of a group of rows + RRDF_FIELD_SUMMARY_SUM, // Sums the values of a group of rows + RRDF_FIELD_SUMMARY_MIN, // Finds the minimum value of a group of rows + RRDF_FIELD_SUMMARY_MAX, // Finds the maximum value of a group of rows + // RRDF_FIELD_SUMMARY_EXTENT, // Finds the minimum and maximum values of a group of rows + RRDF_FIELD_SUMMARY_MEAN, // Finds the mean/average value of a group of rows + RRDF_FIELD_SUMMARY_MEDIAN, // Finds the median value of a group of rows + // RRDF_FIELD_SUMMARY_UNIQUE, // Finds the unique values of a group of rows + RRDF_FIELD_SUMMARY_COUNT, // Calculates the number of rows in a group +} RRDF_FIELD_SUMMARY; + +static inline const char *rrdf_field_summary_to_string(RRDF_FIELD_SUMMARY summary) { + switch(summary) { + default: + case RRDF_FIELD_SUMMARY_COUNT: + return "count"; + + case RRDF_FIELD_SUMMARY_UNIQUECOUNT: + return "uniqueCount"; + + case RRDF_FIELD_SUMMARY_SUM: + return "sum"; + + case RRDF_FIELD_SUMMARY_MIN: + return "min"; + + case RRDF_FIELD_SUMMARY_MEAN: + return "mean"; + + case RRDF_FIELD_SUMMARY_MEDIAN: + return "median"; + + case RRDF_FIELD_SUMMARY_MAX: + return "max"; + } +} + +typedef enum __attribute__((packed)) { + RRDF_FIELD_FILTER_NONE = 0, + RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_FILTER_MULTISELECT, + RRDF_FIELD_FILTER_FACET, +} RRDF_FIELD_FILTER; + +static inline const char *rrdf_field_filter_to_string(RRDF_FIELD_FILTER filter) { + switch(filter) { + case RRDF_FIELD_FILTER_RANGE: + return "range"; + + case RRDF_FIELD_FILTER_MULTISELECT: + return "multiselect"; + + case RRDF_FIELD_FILTER_FACET: + return "facet"; + + default: + case RRDF_FIELD_FILTER_NONE: + return "none"; + } +} + +static inline void +buffer_rrdf_table_add_field(BUFFER *wb, size_t field_id, const char *key, const char *name, RRDF_FIELD_TYPE type, + RRDF_FIELD_VISUAL visual, RRDF_FIELD_TRANSFORM transform, size_t decimal_points, + const char *units, NETDATA_DOUBLE max, RRDF_FIELD_SORT sort, const char *pointer_to, + RRDF_FIELD_SUMMARY summary, RRDF_FIELD_FILTER filter, RRDF_FIELD_OPTIONS options, + const char *default_value) { + + buffer_json_member_add_object(wb, key); + { + buffer_json_member_add_uint64(wb, "index", field_id); + buffer_json_member_add_boolean(wb, "unique_key", options & RRDF_FIELD_OPTS_UNIQUE_KEY); + buffer_json_member_add_string(wb, "name", name); + buffer_json_member_add_boolean(wb, "visible", options & RRDF_FIELD_OPTS_VISIBLE); + buffer_json_member_add_string(wb, "type", rrdf_field_type_to_string(type)); + buffer_json_member_add_string_or_omit(wb, "units", units); + buffer_json_member_add_string(wb, "visualization", rrdf_field_visual_to_string(visual)); + + buffer_json_member_add_object(wb, "value_options"); + { + buffer_json_member_add_string_or_omit(wb, "units", units); + buffer_json_member_add_string(wb, "transform", 
rrdf_field_transform_to_string(transform)); + buffer_json_member_add_uint64(wb, "decimal_points", decimal_points); + buffer_json_member_add_string(wb, "default_value", default_value); + } + buffer_json_object_close(wb); + + if (!isnan((NETDATA_DOUBLE) (max))) + buffer_json_member_add_double(wb, "max", (NETDATA_DOUBLE) (max)); + + buffer_json_member_add_string_or_omit(wb, "pointer_to", pointer_to); + buffer_json_member_add_string(wb, "sort", rrdf_field_sort_to_string(sort)); + buffer_json_member_add_boolean(wb, "sortable", !(sort & RRDF_FIELD_SORT_FIXED)); + buffer_json_member_add_boolean(wb, "sticky", options & RRDF_FIELD_OPTS_STICKY); + buffer_json_member_add_string(wb, "summary", rrdf_field_summary_to_string(summary)); + buffer_json_member_add_string(wb, "filter", rrdf_field_filter_to_string(filter)); + + buffer_json_member_add_boolean(wb, "full_width", options & RRDF_FIELD_OPTS_FULL_WIDTH); + buffer_json_member_add_boolean(wb, "wrap", options & RRDF_FIELD_OPTS_WRAP); + buffer_json_member_add_boolean(wb, "default_expanded_filter", options & RRDF_FIELD_OPTS_EXPANDED_FILTER); + + if(options & RRDF_FIELD_OPTS_DUMMY) + buffer_json_member_add_boolean(wb, "dummy", true); + } + buffer_json_object_close(wb); +} + +static inline void buffer_copy(BUFFER *dst, BUFFER *src) { + if(!src || !dst) + return; + + buffer_contents_replace(dst, buffer_tostring(src), buffer_strlen(src)); + + dst->content_type = src->content_type; + dst->options = src->options; + dst->date = src->date; + dst->expires = src->expires; + dst->json = src->json; +} + +static inline BUFFER *buffer_dup(BUFFER *src) { + if(!src) + return NULL; + + BUFFER *dst = buffer_create(buffer_strlen(src) + 1, src->statistics); + buffer_copy(dst, src); + return dst; +} + +#endif /* NETDATA_WEB_BUFFER_H */ diff --git a/src/libnetdata/buffered_reader/README.md b/src/libnetdata/buffered_reader/README.md new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/src/libnetdata/buffered_reader/README.md diff --git a/src/libnetdata/buffered_reader/buffered_reader.c b/src/libnetdata/buffered_reader/buffered_reader.c new file mode 100644 index 00000000..7cd17abf --- /dev/null +++ b/src/libnetdata/buffered_reader/buffered_reader.c @@ -0,0 +1,3 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" diff --git a/src/libnetdata/buffered_reader/buffered_reader.h b/src/libnetdata/buffered_reader/buffered_reader.h new file mode 100644 index 00000000..1ec1d762 --- /dev/null +++ b/src/libnetdata/buffered_reader/buffered_reader.h @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +#ifndef NETDATA_BUFFERED_READER_H +#define NETDATA_BUFFERED_READER_H + +struct buffered_reader { + ssize_t read_len; + ssize_t pos; + char read_buffer[PLUGINSD_LINE_MAX + 1]; +}; + +static inline void buffered_reader_init(struct buffered_reader *reader) { + reader->read_buffer[0] = '\0'; + reader->read_len = 0; + reader->pos = 0; +} + +typedef enum { + BUFFERED_READER_READ_OK = 0, + BUFFERED_READER_READ_FAILED = -1, + BUFFERED_READER_READ_BUFFER_FULL = -2, + BUFFERED_READER_READ_POLLERR = -3, + BUFFERED_READER_READ_POLLHUP = -4, + BUFFERED_READER_READ_POLLNVAL = -5, + BUFFERED_READER_READ_POLL_UNKNOWN = -6, + BUFFERED_READER_READ_POLL_TIMEOUT = -7, + BUFFERED_READER_READ_POLL_CANCELLED = -8, +} buffered_reader_ret_t; + + +static inline buffered_reader_ret_t buffered_reader_read(struct buffered_reader *reader, int fd) { +#ifdef NETDATA_INTERNAL_CHECKS + if(reader->read_buffer[reader->read_len] != '\0') + 
fatal("read_buffer is not NULL terminated");
+#endif
+
+    char *read_at = reader->read_buffer + reader->read_len;
+    ssize_t remaining = sizeof(reader->read_buffer) - reader->read_len - 1;
+
+    if(unlikely(remaining <= 0))
+        return BUFFERED_READER_READ_BUFFER_FULL;
+
+    ssize_t bytes_read = read(fd, read_at, remaining);
+    if(unlikely(bytes_read <= 0))
+        return BUFFERED_READER_READ_FAILED;
+
+    reader->read_len += bytes_read;
+    reader->read_buffer[reader->read_len] = '\0';
+
+    return BUFFERED_READER_READ_OK;
+}
+
+static inline buffered_reader_ret_t buffered_reader_read_timeout(struct buffered_reader *reader, int fd, int timeout_ms, bool log_error) {
+    short int revents = 0;
+    switch(wait_on_socket_or_cancel_with_timeout(
+#ifdef ENABLE_HTTPS
+        NULL,
+#endif
+        fd, timeout_ms, POLLIN, &revents)) {
+
+        case 0: // data are waiting
+            return buffered_reader_read(reader, fd);
+
+        case 1: // timeout reached
+            if(log_error)
+                netdata_log_error("PARSER: timeout while waiting for data.");
+            return BUFFERED_READER_READ_POLL_TIMEOUT;
+
+        case -1: // thread cancelled
+            netdata_log_error("PARSER: thread cancelled while waiting for data.");
+            return BUFFERED_READER_READ_POLL_CANCELLED;
+
+        default:
+        case 2: // error on socket
+            if(revents & POLLERR) {
+                if(log_error)
+                    netdata_log_error("PARSER: read failed: POLLERR.");
+                return BUFFERED_READER_READ_POLLERR;
+            }
+            if(revents & POLLHUP) {
+                if(log_error)
+                    netdata_log_error("PARSER: read failed: POLLHUP.");
+                return BUFFERED_READER_READ_POLLHUP;
+            }
+            if(revents & POLLNVAL) {
+                if(log_error)
+                    netdata_log_error("PARSER: read failed: POLLNVAL.");
+                return BUFFERED_READER_READ_POLLNVAL;
+            }
+    }
+
+    if(log_error)
+        netdata_log_error("PARSER: poll() returned positive number, but POLLIN|POLLERR|POLLHUP|POLLNVAL are not set.");
+    return BUFFERED_READER_READ_POLL_UNKNOWN;
+}
+
+/* Produce a full line if one exists; statefully remember where to resume on the next call.
+ * When we hit the end of the buffer with a partial line, the partial data is copied to the
+ * destination buffer and the read buffer is reset for the next fill.
+ */
+static inline bool buffered_reader_next_line(struct buffered_reader *reader, BUFFER *dst) {
+    buffer_need_bytes(dst, reader->read_len - reader->pos + 2);
+
+    size_t start = reader->pos;
+
+    char *ss = &reader->read_buffer[start];
+    char *se = &reader->read_buffer[reader->read_len];
+    char *ds = &dst->buffer[dst->len];
+    char *de = &ds[dst->size - dst->len - 2];
+
+    if(ss >= se) {
+        *ds = '\0';
+        reader->pos = 0;
+        reader->read_len = 0;
+        reader->read_buffer[reader->read_len] = '\0';
+        return false;
+    }
+
+    // copy all bytes to buffer
+    while(ss < se && ds < de && *ss != '\n') {
+        *ds++ = *ss++;
+        dst->len++;
+    }
+
+    // if we have a newline, return the buffer
+    if(ss < se && ds < de && *ss == '\n') {
+        // newline found in the reader->read_buffer
+
+        *ds++ = *ss++; // copy the newline too
+        dst->len++;
+
+        *ds = '\0';
+
+        reader->pos = ss - reader->read_buffer;
+        return true;
+    }
+
+    reader->pos = 0;
+    reader->read_len = 0;
+    reader->read_buffer[reader->read_len] = '\0';
+    return false;
+}
+
+#endif //NETDATA_BUFFERED_READER_H
diff --git a/src/libnetdata/circular_buffer/README.md b/src/libnetdata/circular_buffer/README.md
new file mode 100644
index 00000000..b2d580cb
--- /dev/null
+++ b/src/libnetdata/circular_buffer/README.md
@@ -0,0 +1,14 @@
+<!--
+title: "Circular Buffer"
+custom_edit_url: https://github.com/netdata/netdata/edit/master/src/libnetdata/circular_buffer/README.md
+sidebar_label: "Circular Buffer"
+learn_status: "Published"
+learn_topic_type: "Tasks"
+learn_rel_path: "Developers/libnetdata"
+-->
+
+# Circular Buffer
+
+`struct circular_buffer` is an adaptive circular buffer. It will start at an initial size
+and grow up to a maximum size as it fills. Two indices within the structure track the current
+`read` and `write` positions for the data.
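+
+## Example
+
+A minimal usage sketch (not part of the library; it only exercises the API declared in
+`circular_buffer.h` below, assumes the usual `libnetdata.h` umbrella include and
+single-threaded access, since the `_unsafe` calls expect the caller to do its own
+locking; the function name `circular_buffer_example` is made up for illustration):
+
+```c
+#include "libnetdata/libnetdata.h"
+
+void circular_buffer_example(void) {
+    // start at 16 bytes and grow up to 1024 bytes as it fills
+    struct circular_buffer *cb = cbuffer_new(16, 1024, NULL);
+
+    // cbuffer_add_unsafe() returns non-zero when the data cannot fit,
+    // even after growing the buffer to its maximum size
+    if(cbuffer_add_unsafe(cb, "hello\n", 6) != 0) {
+        cbuffer_free(cb);
+        return;
+    }
+
+    // get the largest contiguous chunk that can be read
+    char *start;
+    size_t len = cbuffer_next_unsafe(cb, &start);
+
+    // ... consume len bytes starting at start ...
+
+    // advance the read index past the consumed bytes
+    cbuffer_remove_unsafe(cb, len);
+
+    cbuffer_free(cb);
+}
+```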
diff --git a/src/libnetdata/circular_buffer/circular_buffer.c b/src/libnetdata/circular_buffer/circular_buffer.c new file mode 100644 index 00000000..7ffe6b8b --- /dev/null +++ b/src/libnetdata/circular_buffer/circular_buffer.c @@ -0,0 +1,113 @@ +#include "../libnetdata.h" + +struct circular_buffer *cbuffer_new(size_t initial, size_t max, size_t *statistics) { + struct circular_buffer *buf = mallocz(sizeof(struct circular_buffer)); + buf->size = initial; + buf->data = mallocz(initial); + buf->write = 0; + buf->read = 0; + buf->max_size = max; + buf->statistics = statistics; + + if(buf->statistics) + __atomic_add_fetch(buf->statistics, sizeof(struct circular_buffer) + buf->size, __ATOMIC_RELAXED); + + return buf; +} + +void cbuffer_free(struct circular_buffer *buf) { + if (unlikely(!buf)) + return; + + if(buf->statistics) + __atomic_sub_fetch(buf->statistics, sizeof(struct circular_buffer) + buf->size, __ATOMIC_RELAXED); + + freez(buf->data); + freez(buf); +} + +static int cbuffer_realloc_unsafe(struct circular_buffer *buf) { + // Check that we can grow + if (buf->size >= buf->max_size) + return 1; + + size_t old_size = buf->size; + size_t new_size = buf->size * 2; + if (new_size > buf->max_size) + new_size = buf->max_size; + + // We know that: size < new_size <= max_size + // For simplicity align the current data at the bottom of the new buffer + char *new_data = mallocz(new_size); + if (buf->read == buf->write) + buf->write = 0; // buffer is empty + else if (buf->read < buf->write) { + memcpy(new_data, buf->data + buf->read, buf->write - buf->read); + buf->write -= buf->read; + } else { + size_t top_part = buf->size - buf->read; + memcpy(new_data, buf->data + buf->read, top_part); + memcpy(new_data + top_part, buf->data, buf->write); + buf->write = top_part + buf->write; + } + buf->read = 0; + + // Switch buffers + freez(buf->data); + buf->data = new_data; + buf->size = new_size; + + if(buf->statistics) + __atomic_add_fetch(buf->statistics, new_size - old_size, __ATOMIC_RELAXED); + + return 0; +} + +size_t cbuffer_available_size_unsafe(struct circular_buffer *buf) { + size_t len = (buf->write >= buf->read) ? (buf->write - buf->read) : (buf->size - buf->read + buf->write); + return buf->max_size - len; +} + +int cbuffer_add_unsafe(struct circular_buffer *buf, const char *d, size_t d_len) { + size_t len = (buf->write >= buf->read) ? (buf->write - buf->read) : (buf->size - buf->read + buf->write); + while (d_len + len >= buf->size) { + if (cbuffer_realloc_unsafe(buf)) { + return 1; + } + } + // Guarantee: write + d_len cannot hit read + if (buf->write + d_len < buf->size) { + memcpy(buf->data + buf->write, d, d_len); + buf->write += d_len; + } + else { + size_t top_part = buf->size - buf->write; + memcpy(buf->data + buf->write, d, top_part); + memcpy(buf->data, d + top_part, d_len - top_part); + buf->write = d_len - top_part; + } + return 0; +} + +// Assume caller does not remove too many bytes (i.e. read will jump over write) +void cbuffer_remove_unsafe(struct circular_buffer *buf, size_t num) { + buf->read += num; + // Assume num < size (i.e. 
caller cannot remove more bytes than are in the buffer) + if (buf->read >= buf->size) + buf->read -= buf->size; +} + +size_t cbuffer_next_unsafe(struct circular_buffer *buf, char **start) { + if (start != NULL) + *start = buf->data + buf->read; + + if (buf->read <= buf->write) { + return buf->write - buf->read; // Includes empty case + } + return buf->size - buf->read; +} + +void cbuffer_flush(struct circular_buffer*buf) { + buf->write = 0; + buf->read = 0; +}
\ No newline at end of file diff --git a/src/libnetdata/circular_buffer/circular_buffer.h b/src/libnetdata/circular_buffer/circular_buffer.h new file mode 100644 index 00000000..9d29a84d --- /dev/null +++ b/src/libnetdata/circular_buffer/circular_buffer.h @@ -0,0 +1,20 @@ +#ifndef CIRCULAR_BUFFER_H +#define CIRCULAR_BUFFER_H 1 + +#include <string.h> + +struct circular_buffer { + size_t size, write, read, max_size; + size_t *statistics; + char *data; +}; + +struct circular_buffer *cbuffer_new(size_t initial, size_t max, size_t *statistics); +void cbuffer_free(struct circular_buffer *buf); +int cbuffer_add_unsafe(struct circular_buffer *buf, const char *d, size_t d_len); +void cbuffer_remove_unsafe(struct circular_buffer *buf, size_t num); +size_t cbuffer_next_unsafe(struct circular_buffer *buf, char **start); +size_t cbuffer_available_size_unsafe(struct circular_buffer *buf); +void cbuffer_flush(struct circular_buffer*buf); + +#endif diff --git a/src/libnetdata/clocks/README.md b/src/libnetdata/clocks/README.md new file mode 100644 index 00000000..0ede05e8 --- /dev/null +++ b/src/libnetdata/clocks/README.md @@ -0,0 +1,10 @@ +<!-- +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/libnetdata/clocks/README.md" +title: "Clocks" +sidebar_label: "Clocks" +learn_status: "Published" +learn_topic_type: "References" +learn_rel_path: "Developers/libnetdata" +--> + +# Clocks
\ No newline at end of file diff --git a/src/libnetdata/clocks/clocks.c b/src/libnetdata/clocks/clocks.c new file mode 100644 index 00000000..e1a3e64c --- /dev/null +++ b/src/libnetdata/clocks/clocks.c @@ -0,0 +1,478 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +// defaults are for compatibility +// call clocks_init() once, to optimize these default settings +static clockid_t clock_boottime_to_use = CLOCK_MONOTONIC; +static clockid_t clock_monotonic_to_use = CLOCK_MONOTONIC; + +// the default clock resolution is 1ms +#define DEFAULT_CLOCK_RESOLUTION_UT ((usec_t)0 * USEC_PER_SEC + (usec_t)1 * USEC_PER_MS) + +// the max clock resolution is 10ms +#define MAX_CLOCK_RESOLUTION_UT ((usec_t)0 * USEC_PER_SEC + (usec_t)10 * USEC_PER_MS) + +usec_t clock_monotonic_resolution = DEFAULT_CLOCK_RESOLUTION_UT; +usec_t clock_realtime_resolution = DEFAULT_CLOCK_RESOLUTION_UT; + +#ifndef HAVE_CLOCK_GETTIME +inline int clock_gettime(clockid_t clk_id __maybe_unused, struct timespec *ts) { + struct timeval tv; + if(unlikely(gettimeofday(&tv, NULL) == -1)) { + netdata_log_error("gettimeofday() failed."); + return -1; + } + ts->tv_sec = tv.tv_sec; + ts->tv_nsec = (long)((tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC); + return 0; +} +#endif + +// Similar to CLOCK_MONOTONIC, but provides access to a raw hardware-based time that is not subject to NTP adjustments +// or the incremental adjustments performed by adjtime(3). This clock does not count time that the system is suspended + +static void test_clock_monotonic_raw(void) { +#ifdef CLOCK_MONOTONIC_RAW + struct timespec ts; + if(clock_gettime(CLOCK_MONOTONIC_RAW, &ts) == -1 && errno == EINVAL) + clock_monotonic_to_use = CLOCK_MONOTONIC; + else + clock_monotonic_to_use = CLOCK_MONOTONIC_RAW; +#else + clock_monotonic_to_use = CLOCK_MONOTONIC; +#endif +} + +// When running a binary with CLOCK_BOOTTIME defined on a system with a linux kernel older than Linux 2.6.39 the +// clock_gettime(2) system call fails with EINVAL. In that case it must fall-back to CLOCK_MONOTONIC. 
+
+static void test_clock_boottime(void) {
+    struct timespec ts;
+    if(clock_gettime(CLOCK_BOOTTIME, &ts) == -1 && errno == EINVAL)
+        clock_boottime_to_use = clock_monotonic_to_use;
+    else
+        clock_boottime_to_use = CLOCK_BOOTTIME;
+}
+
+static usec_t get_clock_resolution(clockid_t clock) {
+    struct timespec ts = { 0 };
+
+    if(clock_getres(clock, &ts) == 0) {
+        usec_t ret = (usec_t)ts.tv_sec * USEC_PER_SEC + (usec_t)ts.tv_nsec / NSEC_PER_USEC;
+        if(!ret && ts.tv_nsec > 0 && ts.tv_nsec < (long int)NSEC_PER_USEC)
+            return (usec_t)1;
+
+        else if(ret > MAX_CLOCK_RESOLUTION_UT) {
+            nd_log(NDLS_DAEMON, NDLP_ERR, "clock_getres(%d) returned %"PRIu64" usec, which is out of range - using the defaults for clock resolution.", (int)clock, ret);
+            return DEFAULT_CLOCK_RESOLUTION_UT;
+        }
+
+        return ret;
+    }
+    else {
+        nd_log(NDLS_DAEMON, NDLP_ERR, "clock_getres(%d) failed, using defaults for clock resolution.", (int)clock);
+        return DEFAULT_CLOCK_RESOLUTION_UT;
+    }
+}
+
+// perform any initializations required for clocks
+
+void clocks_init(void) {
+    // monotonic raw has to be tested before boottime
+    test_clock_monotonic_raw();
+
+    // boottime has to be tested after monotonic raw
+    test_clock_boottime();
+
+    clock_monotonic_resolution = get_clock_resolution(clock_monotonic_to_use);
+    clock_realtime_resolution = get_clock_resolution(CLOCK_REALTIME);
+}
+
+inline time_t now_sec(clockid_t clk_id) {
+    struct timespec ts;
+    if(unlikely(clock_gettime(clk_id, &ts) == -1)) {
+        netdata_log_error("clock_gettime(%ld, &timespec) failed.", (long int)clk_id);
+        return 0;
+    }
+    return ts.tv_sec;
+}
+
+inline usec_t now_usec(clockid_t clk_id) {
+    struct timespec ts;
+    if(unlikely(clock_gettime(clk_id, &ts) == -1)) {
+        netdata_log_error("clock_gettime(%ld, &timespec) failed.", (long int)clk_id);
+        return 0;
+    }
+    return (usec_t)ts.tv_sec * USEC_PER_SEC + (usec_t)(ts.tv_nsec % NSEC_PER_SEC) / NSEC_PER_USEC;
+}
+
+inline int now_timeval(clockid_t clk_id, struct timeval *tv) {
+    struct timespec ts;
+
+    if(unlikely(clock_gettime(clk_id, &ts) == -1)) {
+        netdata_log_error("clock_gettime(%ld, &timespec) failed.", (long int)clk_id);
+        tv->tv_sec = 0;
+        tv->tv_usec = 0;
+        return -1;
+    }
+
+    tv->tv_sec = ts.tv_sec;
+    tv->tv_usec = (suseconds_t)((ts.tv_nsec % NSEC_PER_SEC) / NSEC_PER_USEC);
+    return 0;
+}
+
+inline time_t now_realtime_sec(void) {
+    return now_sec(CLOCK_REALTIME);
+}
+
+inline msec_t now_realtime_msec(void) {
+    return now_usec(CLOCK_REALTIME) / USEC_PER_MS;
+}
+
+inline usec_t now_realtime_usec(void) {
+    return now_usec(CLOCK_REALTIME);
+}
+
+inline int now_realtime_timeval(struct timeval *tv) {
+    return now_timeval(CLOCK_REALTIME, tv);
+}
+
+inline time_t now_monotonic_sec(void) {
+    return now_sec(clock_monotonic_to_use);
+}
+
+inline usec_t now_monotonic_usec(void) {
+    return now_usec(clock_monotonic_to_use);
+}
+
+inline int now_monotonic_timeval(struct timeval *tv) {
+    return now_timeval(clock_monotonic_to_use, tv);
+}
+
+inline time_t now_monotonic_high_precision_sec(void) {
+    return now_sec(CLOCK_MONOTONIC);
+}
+
+inline usec_t now_monotonic_high_precision_usec(void) {
+    return now_usec(CLOCK_MONOTONIC);
+}
+
+inline int now_monotonic_high_precision_timeval(struct timeval *tv) {
+    return now_timeval(CLOCK_MONOTONIC, tv);
+}
+
+inline time_t now_boottime_sec(void) {
+    return now_sec(clock_boottime_to_use);
+}
+
+inline usec_t now_boottime_usec(void) {
+    return now_usec(clock_boottime_to_use);
+}
+
+inline int now_boottime_timeval(struct timeval *tv) {
+    return now_timeval(clock_boottime_to_use, tv);
+}
+
+inline usec_t
timeval_usec(struct timeval *tv) { + return (usec_t)tv->tv_sec * USEC_PER_SEC + (tv->tv_usec % USEC_PER_SEC); +} + +inline msec_t timeval_msec(struct timeval *tv) { + return (msec_t)tv->tv_sec * MSEC_PER_SEC + ((tv->tv_usec % USEC_PER_SEC) / MSEC_PER_SEC); +} + +inline susec_t dt_usec_signed(struct timeval *now, struct timeval *old) { + usec_t ts1 = timeval_usec(now); + usec_t ts2 = timeval_usec(old); + + if(likely(ts1 >= ts2)) return (susec_t)(ts1 - ts2); + return -((susec_t)(ts2 - ts1)); +} + +inline usec_t dt_usec(struct timeval *now, struct timeval *old) { + usec_t ts1 = timeval_usec(now); + usec_t ts2 = timeval_usec(old); + return (ts1 > ts2) ? (ts1 - ts2) : (ts2 - ts1); +} + +#ifdef __linux__ +void sleep_to_absolute_time(usec_t usec) { + static int einval_printed = 0, enotsup_printed = 0, eunknown_printed = 0; + clockid_t clock = CLOCK_REALTIME; + + struct timespec req = { + .tv_sec = (time_t)(usec / USEC_PER_SEC), + .tv_nsec = (suseconds_t)((usec % USEC_PER_SEC) * NSEC_PER_USEC) + }; + + errno = 0; + int ret = 0; + while( (ret = clock_nanosleep(clock, TIMER_ABSTIME, &req, NULL)) != 0 ) { + if(ret == EINTR) { + errno = 0; + continue; + } + else { + if (ret == EINVAL) { + if (!einval_printed) { + einval_printed++; + netdata_log_error("Invalid time given to clock_nanosleep(): clockid = %d, tv_sec = %lld, tv_nsec = %ld", + clock, + (long long)req.tv_sec, + req.tv_nsec); + } + } else if (ret == ENOTSUP) { + if (!enotsup_printed) { + enotsup_printed++; + netdata_log_error("Invalid clock id given to clock_nanosleep(): clockid = %d, tv_sec = %lld, tv_nsec = %ld", + clock, + (long long)req.tv_sec, + req.tv_nsec); + } + } else { + if (!eunknown_printed) { + eunknown_printed++; + netdata_log_error("Unknown return value %d from clock_nanosleep(): clockid = %d, tv_sec = %lld, tv_nsec = %ld", + ret, + clock, + (long long)req.tv_sec, + req.tv_nsec); + } + } + sleep_usec(usec); + } + } +} +#endif + +#define HEARTBEAT_ALIGNMENT_STATISTICS_SIZE 10 +netdata_mutex_t heartbeat_alignment_mutex = NETDATA_MUTEX_INITIALIZER; +static size_t heartbeat_alignment_id = 0; + +struct heartbeat_thread_statistics { + size_t sequence; + usec_t dt; +}; +static struct heartbeat_thread_statistics heartbeat_alignment_values[HEARTBEAT_ALIGNMENT_STATISTICS_SIZE] = { 0 }; + +void heartbeat_statistics(usec_t *min_ptr, usec_t *max_ptr, usec_t *average_ptr, size_t *count_ptr) { + struct heartbeat_thread_statistics current[HEARTBEAT_ALIGNMENT_STATISTICS_SIZE]; + static struct heartbeat_thread_statistics old[HEARTBEAT_ALIGNMENT_STATISTICS_SIZE] = { 0 }; + + memcpy(current, heartbeat_alignment_values, sizeof(struct heartbeat_thread_statistics) * HEARTBEAT_ALIGNMENT_STATISTICS_SIZE); + + usec_t min = 0, max = 0, total = 0, average = 0; + size_t i, count = 0; + for(i = 0; i < HEARTBEAT_ALIGNMENT_STATISTICS_SIZE ;i++) { + if(current[i].sequence == old[i].sequence) continue; + usec_t value = current[i].dt - old[i].dt; + + if(!count) { + min = max = total = value; + count = 1; + } + else { + total += value; + if(value < min) min = value; + if(value > max) max = value; + count++; + } + } + if(count) + average = total / count; + + if(min_ptr) *min_ptr = min; + if(max_ptr) *max_ptr = max; + if(average_ptr) *average_ptr = average; + if(count_ptr) *count_ptr = count; + + memcpy(old, current, sizeof(struct heartbeat_thread_statistics) * HEARTBEAT_ALIGNMENT_STATISTICS_SIZE); +} + +inline void heartbeat_init(heartbeat_t *hb) { + hb->realtime = 0ULL; + hb->randomness = (usec_t)250 * USEC_PER_MS + ((usec_t)(now_realtime_usec() * 
clock_realtime_resolution) % (250 * USEC_PER_MS));
+    hb->randomness -= (hb->randomness % clock_realtime_resolution);
+
+    netdata_mutex_lock(&heartbeat_alignment_mutex);
+    hb->statistics_id = heartbeat_alignment_id;
+    heartbeat_alignment_id++;
+    netdata_mutex_unlock(&heartbeat_alignment_mutex);
+
+    if(hb->statistics_id < HEARTBEAT_ALIGNMENT_STATISTICS_SIZE) {
+        heartbeat_alignment_values[hb->statistics_id].dt = 0;
+        heartbeat_alignment_values[hb->statistics_id].sequence = 0;
+    }
+}
+
+// waits for the next heartbeat
+// it waits using the monotonic clock
+// it returns the dt using the realtime clock
+
+usec_t heartbeat_next(heartbeat_t *hb, usec_t tick) {
+    if(unlikely(hb->randomness > tick / 2)) {
+        // TODO: The heartbeat tick should be specified at the heartbeat_init() function
+        usec_t tmp = (now_realtime_usec() * clock_realtime_resolution) % (tick / 2);
+
+        nd_log_limit_static_global_var(erl, 10, 0);
+        nd_log_limit(&erl, NDLS_DAEMON, NDLP_NOTICE,
+                     "heartbeat randomness of %"PRIu64" is too big for a tick of %"PRIu64" - setting it to %"PRIu64"",
+                     hb->randomness, tick, tmp);
+        hb->randomness = tmp;
+    }
+
+    usec_t dt;
+    usec_t now = now_realtime_usec();
+    usec_t next = now - (now % tick) + tick + hb->randomness;
+
+    // align the next time we want to the clock resolution
+    if(next % clock_realtime_resolution)
+        next = next - (next % clock_realtime_resolution) + clock_realtime_resolution;
+
+    // sleep_usec() has a loop to guarantee we will sleep for at least the requested time.
+    // According to the specs, when we sleep for a relative time, clock adjustments should
+    // not affect the duration we sleep.
+    sleep_usec_with_now(next - now, now);
+    now = now_realtime_usec();
+    dt = now - hb->realtime;
+
+    if(hb->statistics_id < HEARTBEAT_ALIGNMENT_STATISTICS_SIZE) {
+        heartbeat_alignment_values[hb->statistics_id].dt += now - next;
+        heartbeat_alignment_values[hb->statistics_id].sequence++;
+    }
+
+    if(unlikely(now < next)) {
+        errno = 0;
+        nd_log_limit_static_global_var(erl, 10, 0);
+        nd_log_limit(&erl, NDLS_DAEMON, NDLP_NOTICE,
+                     "heartbeat clock: woke up %"PRIu64" microseconds earlier than expected "
+                     "(can be due to the CLOCK_REALTIME set to the past).",
+                     next - now);
+    }
+    else if(unlikely(now - next > tick / 2)) {
+        errno = 0;
+        nd_log_limit_static_global_var(erl, 10, 0);
+        nd_log_limit(&erl, NDLS_DAEMON, NDLP_NOTICE,
+                     "heartbeat clock: woke up %"PRIu64" microseconds later than expected "
+                     "(can be due to system load or the CLOCK_REALTIME set to the future).",
+                     now - next);
+    }
+
+    if(unlikely(!hb->realtime)) {
+        // the first time return zero
+        dt = 0;
+    }
+
+    hb->realtime = now;
+    return dt;
+}
+
+void sleep_usec_with_now(usec_t usec, usec_t started_ut) {
+    // we expect microseconds (1.000.000 per second)
+    // but timespec is nanoseconds (1.000.000.000 per second)
+    struct timespec rem = { 0, 0 }, req = {
+            .tv_sec = (time_t) (usec / USEC_PER_SEC),
+            .tv_nsec = (suseconds_t) ((usec % USEC_PER_SEC) * NSEC_PER_USEC)
+    };
+
+    // make sure errno is not EINTR
+    errno = 0;
+
+    if(!started_ut)
+        started_ut = now_realtime_usec();
+
+    usec_t end_ut = started_ut + usec;
+
+    while (nanosleep(&req, &rem) != 0) {
+        if (likely(errno == EINTR && (rem.tv_sec || rem.tv_nsec))) {
+            req = rem;
+            rem = (struct timespec){ 0, 0 };
+
+            // break an infinite loop
+            errno = 0;
+
+            usec_t now_ut = now_realtime_usec();
+            if(now_ut >= end_ut)
+                break;
+
+            usec_t remaining_ut = (usec_t)req.tv_sec * USEC_PER_SEC + (usec_t)req.tv_nsec / NSEC_PER_USEC;
+            usec_t check_ut = now_ut - started_ut;
+            if(remaining_ut >
check_ut) { + req = (struct timespec){ + .tv_sec = (time_t) ( check_ut / USEC_PER_SEC), + .tv_nsec = (suseconds_t) ((check_ut % USEC_PER_SEC) * NSEC_PER_USEC) + }; + } + } + else { + netdata_log_error("Cannot nanosleep() for %"PRIu64" microseconds.", usec); + break; + } + } +} + +static inline collected_number uptime_from_boottime(void) { +#ifdef CLOCK_BOOTTIME_IS_AVAILABLE + return (collected_number)(now_boottime_usec() / USEC_PER_MS); +#else + netdata_log_error("uptime cannot be read from CLOCK_BOOTTIME on this system."); + return 0; +#endif +} + +static procfile *read_proc_uptime_ff = NULL; +static inline collected_number read_proc_uptime(char *filename) { + if(unlikely(!read_proc_uptime_ff)) { + read_proc_uptime_ff = procfile_open(filename, " \t", PROCFILE_FLAG_DEFAULT); + if(unlikely(!read_proc_uptime_ff)) return 0; + } + + read_proc_uptime_ff = procfile_readall(read_proc_uptime_ff); + if(unlikely(!read_proc_uptime_ff)) return 0; + + if(unlikely(procfile_lines(read_proc_uptime_ff) < 1)) { + netdata_log_error("/proc/uptime has no lines."); + return 0; + } + if(unlikely(procfile_linewords(read_proc_uptime_ff, 0) < 1)) { + netdata_log_error("/proc/uptime has less than 1 word in it."); + return 0; + } + + return (collected_number)(strtondd(procfile_lineword(read_proc_uptime_ff, 0, 0), NULL) * 1000.0); +} + +inline collected_number uptime_msec(char *filename){ + static int use_boottime = -1; + + if(unlikely(use_boottime == -1)) { + collected_number uptime_boottime = uptime_from_boottime(); + collected_number uptime_proc = read_proc_uptime(filename); + + long long delta = (long long)uptime_boottime - (long long)uptime_proc; + if(delta < 0) delta = -delta; + + if(delta <= 1000 && uptime_boottime != 0) { + procfile_close(read_proc_uptime_ff); + netdata_log_info("Using now_boottime_usec() for uptime (dt is %lld ms)", delta); + use_boottime = 1; + } + else if(uptime_proc != 0) { + netdata_log_info("Using /proc/uptime for uptime (dt is %lld ms)", delta); + use_boottime = 0; + } + else { + netdata_log_error("Cannot find any way to read uptime on this system."); + return 1; + } + } + + collected_number uptime; + if(use_boottime) + uptime = uptime_from_boottime(); + else + uptime = read_proc_uptime(filename); + + return uptime; +} diff --git a/src/libnetdata/clocks/clocks.h b/src/libnetdata/clocks/clocks.h new file mode 100644 index 00000000..f989fd6b --- /dev/null +++ b/src/libnetdata/clocks/clocks.h @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_CLOCKS_H +#define NETDATA_CLOCKS_H 1 + +#include "../libnetdata.h" + +#ifndef HAVE_CLOCK_GETTIME +struct timespec { + time_t tv_sec; /* seconds */ + long tv_nsec; /* nanoseconds */ +}; +#endif + +#ifndef HAVE_CLOCK_GETTIME +#endif + +typedef uint64_t nsec_t; +typedef uint64_t msec_t; +typedef uint64_t usec_t; +typedef int64_t susec_t; + +typedef struct heartbeat { + usec_t realtime; + usec_t randomness; + size_t statistics_id; +} heartbeat_t; + +/* Linux value is as good as any other */ +#ifndef CLOCK_REALTIME +#define CLOCK_REALTIME 0 +#endif + +#ifndef CLOCK_MONOTONIC +/* fallback to CLOCK_REALTIME if not available */ +#define CLOCK_MONOTONIC CLOCK_REALTIME +#endif + +#ifndef CLOCK_BOOTTIME + +#ifdef CLOCK_UPTIME +/* CLOCK_BOOTTIME falls back to CLOCK_UPTIME on FreeBSD */ +#define CLOCK_BOOTTIME CLOCK_UPTIME +#else // CLOCK_UPTIME +/* CLOCK_BOOTTIME falls back to CLOCK_REALTIME */ +#define CLOCK_BOOTTIME CLOCK_REALTIME +#endif // CLOCK_UPTIME + +#else // CLOCK_BOOTTIME + +#ifdef HAVE_CLOCK_GETTIME +#define 
CLOCK_BOOTTIME_IS_AVAILABLE 1 // required for /proc/uptime +#endif // HAVE_CLOCK_GETTIME + +#endif // CLOCK_BOOTTIME + +#ifndef NSEC_PER_MSEC +#define NSEC_PER_MSEC 1000000ULL +#endif + +#ifndef NSEC_PER_SEC +#define NSEC_PER_SEC 1000000000ULL +#endif +#ifndef NSEC_PER_USEC +#define NSEC_PER_USEC 1000ULL +#endif + +#ifndef USEC_PER_SEC +#define USEC_PER_SEC 1000000ULL +#endif +#ifndef MSEC_PER_SEC +#define MSEC_PER_SEC 1000ULL +#endif + +#define USEC_PER_MS 1000ULL + +#ifndef HAVE_CLOCK_GETTIME +/* Fallback function for POSIX.1-2001 clock_gettime() function. + * + * We use a realtime clock from gettimeofday(), this will + * make systems without clock_gettime() support sensitive + * to time jumps or hibernation/suspend side effects. + */ +int clock_gettime(clockid_t clk_id, struct timespec *ts); +#endif + +/* + * Three clocks are available (cf. man 3 clock_gettime): + * + * REALTIME clock (i.e. wall-clock): + * This clock is affected by discontinuous jumps in the system time + * (e.g., if the system administrator manually changes the clock), and by the incremental adjustments performed by adjtime(3) and NTP. + * + * MONOTONIC clock + * Clock that cannot be set and represents monotonic time since some unspecified starting point. + * This clock is not affected by discontinuous jumps in the system time + * (e.g., if the system administrator manually changes the clock), but is affected by the incremental adjustments performed by adjtime(3) and NTP. + * If not available on the system, this clock falls back to REALTIME clock. + * + * BOOTTIME clock + * Identical to CLOCK_MONOTONIC, except it also includes any time that the system is suspended. + * This allows applications to get a suspend-aware monotonic clock without having to deal with the complications of CLOCK_REALTIME, + * which may have discontinuities if the time is changed using settimeofday(2). + * If not available on the system, this clock falls back to MONOTONIC clock. + * + * All now_*_timeval() functions fill the `struct timeval` with the time from the appropriate clock. + * Those functions return 0 on success, -1 else with errno set appropriately. + * + * All now_*_sec() functions return the time in seconds from the appropriate clock, or 0 on error. + * All now_*_usec() functions return the time in microseconds from the appropriate clock, or 0 on error. + * + */ +int now_realtime_timeval(struct timeval *tv); +time_t now_realtime_sec(void); +usec_t now_realtime_usec(void); + +int now_monotonic_timeval(struct timeval *tv); +time_t now_monotonic_sec(void); +msec_t now_realtime_msec(void); +usec_t now_monotonic_usec(void); +int now_monotonic_high_precision_timeval(struct timeval *tv); +time_t now_monotonic_high_precision_sec(void); +usec_t now_monotonic_high_precision_usec(void); + +int now_boottime_timeval(struct timeval *tv); +time_t now_boottime_sec(void); +usec_t now_boottime_usec(void); + +usec_t timeval_usec(struct timeval *tv); +msec_t timeval_msec(struct timeval *tv); + +usec_t dt_usec(struct timeval *now, struct timeval *old); +susec_t dt_usec_signed(struct timeval *now, struct timeval *old); + +void heartbeat_init(heartbeat_t *hb); + +/* Sleeps until next multiple of tick using monotonic clock. 
+ * Returns elapsed time in microseconds since previous heartbeat + */ +usec_t heartbeat_next(heartbeat_t *hb, usec_t tick); + +void heartbeat_statistics(usec_t *min_ptr, usec_t *max_ptr, usec_t *average_ptr, size_t *count_ptr); + +void sleep_usec_with_now(usec_t usec, usec_t started_ut); +#define sleep_usec(usec) sleep_usec_with_now(usec, 0) + +void clocks_init(void); + +// lower level functions - avoid using directly +time_t now_sec(clockid_t clk_id); +usec_t now_usec(clockid_t clk_id); +int now_timeval(clockid_t clk_id, struct timeval *tv); + +collected_number uptime_msec(char *filename); + +extern usec_t clock_monotonic_resolution; +extern usec_t clock_realtime_resolution; + +void sleep_to_absolute_time(usec_t usec); + +#endif /* NETDATA_CLOCKS_H */ diff --git a/src/libnetdata/completion/completion.c b/src/libnetdata/completion/completion.c new file mode 100644 index 00000000..11342383 --- /dev/null +++ b/src/libnetdata/completion/completion.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "completion.h" + +void completion_init(struct completion *p) +{ + p->completed = 0; + p->completed_jobs = 0; + fatal_assert(0 == uv_cond_init(&p->cond)); + fatal_assert(0 == uv_mutex_init(&p->mutex)); +} + +void completion_destroy(struct completion *p) +{ + uv_cond_destroy(&p->cond); + uv_mutex_destroy(&p->mutex); +} + +void completion_wait_for(struct completion *p) +{ + uv_mutex_lock(&p->mutex); + while (0 == p->completed) { + uv_cond_wait(&p->cond, &p->mutex); + } + fatal_assert(1 == p->completed); + uv_mutex_unlock(&p->mutex); +} + +bool completion_timedwait_for(struct completion *p, uint64_t timeout) +{ + timeout *= NSEC_PER_SEC; + + uint64_t start_time = uv_hrtime(); + bool result = true; + + uv_mutex_lock(&p->mutex); + while (!p->completed) { + int rc = uv_cond_timedwait(&p->cond, &p->mutex, timeout); + + if (rc == 0) { + result = true; + break; + } else if (rc == UV_ETIMEDOUT) { + result = false; + break; + } + + /* + * handle spurious wakeups + */ + + uint64_t elapsed = uv_hrtime() - start_time; + if (elapsed >= timeout) { + result = false; + break; + } + timeout -= elapsed; + } + uv_mutex_unlock(&p->mutex); + + return result; +} + +void completion_mark_complete(struct completion *p) +{ + uv_mutex_lock(&p->mutex); + p->completed = 1; + uv_cond_broadcast(&p->cond); + uv_mutex_unlock(&p->mutex); +} + +unsigned completion_wait_for_a_job(struct completion *p, unsigned completed_jobs) +{ + uv_mutex_lock(&p->mutex); + while (0 == p->completed && p->completed_jobs <= completed_jobs) { + uv_cond_wait(&p->cond, &p->mutex); + } + completed_jobs = p->completed_jobs; + uv_mutex_unlock(&p->mutex); + + return completed_jobs; +} + +void completion_mark_complete_a_job(struct completion *p) +{ + uv_mutex_lock(&p->mutex); + p->completed_jobs++; + uv_cond_broadcast(&p->cond); + uv_mutex_unlock(&p->mutex); +} + +bool completion_is_done(struct completion *p) +{ + bool ret; + uv_mutex_lock(&p->mutex); + ret = p->completed; + uv_mutex_unlock(&p->mutex); + return ret; +} diff --git a/src/libnetdata/completion/completion.h b/src/libnetdata/completion/completion.h new file mode 100644 index 00000000..908ccfaf --- /dev/null +++ b/src/libnetdata/completion/completion.h @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_COMPLETION_H +#define NETDATA_COMPLETION_H + +#include "../libnetdata.h" + +struct completion { + uv_mutex_t mutex; + uv_cond_t cond; + volatile unsigned completed; + volatile unsigned completed_jobs; +}; + +void completion_init(struct completion 
*p);
+
+void completion_destroy(struct completion *p);
+
+void completion_wait_for(struct completion *p);
+
+// Wait for at most `timeout` seconds. Return true on success, false on
+// error or timeout.
+bool completion_timedwait_for(struct completion *p, uint64_t timeout);
+
+void completion_mark_complete(struct completion *p);
+
+unsigned completion_wait_for_a_job(struct completion *p, unsigned completed_jobs);
+void completion_mark_complete_a_job(struct completion *p);
+bool completion_is_done(struct completion *p);
+
+#endif /* NETDATA_COMPLETION_H */
diff --git a/src/libnetdata/config/README.md b/src/libnetdata/config/README.md
new file mode 100644
index 00000000..665a7196
--- /dev/null
+++ b/src/libnetdata/config/README.md
@@ -0,0 +1,58 @@
+<!--
+title: "Netdata ini config files"
+custom_edit_url: https://github.com/netdata/netdata/edit/master/src/libnetdata/config/README.md
+sidebar_label: "Netdata ini config files"
+learn_status: "Published"
+learn_topic_type: "Tasks"
+learn_rel_path: "Developers/libnetdata"
+-->
+
+# Netdata ini config files
+
+Configuration files `netdata.conf` and `stream.conf` are Netdata ini files.
+
+## Motivation
+
+The whole idea came up when we were evaluating the documentation involved
+in maintaining a complex configuration system. Our intention was to give
+configuration options for everything imaginable. But then, documenting all
+these options would require a tremendous amount of time, and users would
+have to search through endless pages for the option they need, etc.
+
+We concluded that **configuring software like that is a waste of time
+and effort**. Of course there must be plenty of configuration options, but
+the implementation itself should require a lot less effort for both the
+developers and the users.
+
+So, we did this:
+
+1. No configuration is required to run Netdata
+2. There are plenty of options to tweak
+3. There is minimal documentation (or none at all)
+
+## Why this works
+
+The configuration file is a `name = value` dictionary with `[sections]`.
+Write whatever you like there, as long as it follows this simple format.
+
+Netdata loads this dictionary and then, when the code needs a value from
+it, it just looks up the `name` in the dictionary at the proper `section`.
+In all places in the code, both the `names` and their `default values` are
+given, so if something is not found in the configuration file, the default
+is used. The lookup is made using AVL trees and hashes (no string
+comparisons), so it is super fast. Also, the `names` of the settings can be
+`my super duper setting that once set to yes, will turn the world upside down = no`,
+so goodbye to most of the documentation involved.
+
+Next, Netdata can generate a valid configuration for the user to edit.
+No need to remember anything or copy and paste settings. Just get the
+configuration from the server (`/netdata.conf` on your Netdata server),
+edit it and save it.
+
+Last, what about options you believe you have set but misspelled?
+When you get the configuration file from the server, there will be a
+comment above every `name = value` pair the server does not use,
+so you know that whatever you wrote there is not used.
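+
+Putting it all together, a fragment in this format looks like this (a
+hypothetical illustration of the syntax only; the option names and values
+are made up and are not meant as real defaults):
+
+```
+[global]
+    # a simple name = value pair; whitespace around '=' is ignored
+    update every = 1
+
+[plugin:apps]
+    # any name is a valid option name, even a long descriptive one
+    my super duper setting that once set to yes, will turn the world upside down = no
+```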
+ + diff --git a/src/libnetdata/config/appconfig.c b/src/libnetdata/config/appconfig.c new file mode 100644 index 00000000..81946b59 --- /dev/null +++ b/src/libnetdata/config/appconfig.c @@ -0,0 +1,961 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +/* + * @Input: + * Connector / instance to add to an internal structure + * @Return + * The current head of the linked list of connector_instance + * + */ + +_CONNECTOR_INSTANCE *add_connector_instance(struct section *connector, struct section *instance) +{ + static struct _connector_instance *global_connector_instance = NULL; + struct _connector_instance *local_ci, *local_ci_tmp; + + if (unlikely(!connector)) { + if (unlikely(!instance)) + return global_connector_instance; + + local_ci = global_connector_instance; + while (local_ci) { + local_ci_tmp = local_ci->next; + freez(local_ci); + local_ci = local_ci_tmp; + } + global_connector_instance = NULL; + return NULL; + } + + local_ci = callocz(1, sizeof(struct _connector_instance)); + local_ci->instance = instance; + local_ci->connector = connector; + strncpyz(local_ci->instance_name, instance->name, CONFIG_MAX_NAME); + strncpyz(local_ci->connector_name, connector->name, CONFIG_MAX_NAME); + local_ci->next = global_connector_instance; + global_connector_instance = local_ci; + + return global_connector_instance; +} + +int is_valid_connector(char *type, int check_reserved) +{ + int rc = 1; + + if (unlikely(!type)) + return 0; + + if (!check_reserved) { + if (unlikely(is_valid_connector(type,1))) { + return 0; + } + //if (unlikely(*type == ':') + // return 0; + char *separator = strrchr(type, ':'); + if (likely(separator)) { + *separator = '\0'; + rc = separator - type; + } else + return 0; + } +// else { +// if (unlikely(is_valid_connector(type,1))) { +// netdata_log_error("Section %s invalid -- reserved name", type); +// return 0; +// } +// } + + if (!strcmp(type, "graphite") || !strcmp(type, "graphite:plaintext")) { + return rc; + } else if (!strcmp(type, "graphite:http") || !strcmp(type, "graphite:https")) { + return rc; + } else if (!strcmp(type, "json") || !strcmp(type, "json:plaintext")) { + return rc; + } else if (!strcmp(type, "json:http") || !strcmp(type, "json:https")) { + return rc; + } else if (!strcmp(type, "opentsdb") || !strcmp(type, "opentsdb:telnet")) { + return rc; + } else if (!strcmp(type, "opentsdb:http") || !strcmp(type, "opentsdb:https")) { + return rc; + } else if (!strcmp(type, "prometheus_remote_write")) { + return rc; + } else if (!strcmp(type, "prometheus_remote_write:http") || !strcmp(type, "prometheus_remote_write:https")) { + return rc; + } else if (!strcmp(type, "kinesis") || !strcmp(type, "kinesis:plaintext")) { + return rc; + } else if (!strcmp(type, "pubsub") || !strcmp(type, "pubsub:plaintext")) { + return rc; + } else if (!strcmp(type, "mongodb") || !strcmp(type, "mongodb:plaintext")) { + return rc; + } + + return 0; +} + +// ---------------------------------------------------------------------------- +// locking + +inline void appconfig_wrlock(struct config *root) { + netdata_mutex_lock(&root->mutex); +} + +inline void appconfig_unlock(struct config *root) { + netdata_mutex_unlock(&root->mutex); +} + +inline void config_section_wrlock(struct section *co) { + netdata_mutex_lock(&co->mutex); +} + +inline void config_section_unlock(struct section *co) { + netdata_mutex_unlock(&co->mutex); +} + + +// ---------------------------------------------------------------------------- +// config name-value index + +static int 
appconfig_option_compare(void *a, void *b) { + if(((struct config_option *)a)->hash < ((struct config_option *)b)->hash) return -1; + else if(((struct config_option *)a)->hash > ((struct config_option *)b)->hash) return 1; + else return strcmp(((struct config_option *)a)->name, ((struct config_option *)b)->name); +} + +#define appconfig_option_index_add(co, cv) (struct config_option *)avl_insert_lock(&((co)->values_index), (avl_t *)(cv)) +#define appconfig_option_index_del(co, cv) (struct config_option *)avl_remove_lock(&((co)->values_index), (avl_t *)(cv)) + +static struct config_option *appconfig_option_index_find(struct section *co, const char *name, uint32_t hash) { + struct config_option tmp; + tmp.hash = (hash)?hash:simple_hash(name); + tmp.name = (char *)name; + + return (struct config_option *)avl_search_lock(&(co->values_index), (avl_t *) &tmp); +} + + +// ---------------------------------------------------------------------------- +// config sections index + +int appconfig_section_compare(void *a, void *b) { + if(((struct section *)a)->hash < ((struct section *)b)->hash) return -1; + else if(((struct section *)a)->hash > ((struct section *)b)->hash) return 1; + else return strcmp(((struct section *)a)->name, ((struct section *)b)->name); +} + +#define appconfig_index_add(root, cfg) (struct section *)avl_insert_lock(&(root)->index, (avl_t *)(cfg)) +#define appconfig_index_del(root, cfg) (struct section *)avl_remove_lock(&(root)->index, (avl_t *)(cfg)) + +static struct section *appconfig_index_find(struct config *root, const char *name, uint32_t hash) { + struct section tmp; + tmp.hash = (hash)?hash:simple_hash(name); + tmp.name = (char *)name; + + return (struct section *)avl_search_lock(&root->index, (avl_t *) &tmp); +} + + +// ---------------------------------------------------------------------------- +// config section methods + +static inline struct section *appconfig_section_find(struct config *root, const char *section) { + return appconfig_index_find(root, section, 0); +} + +static inline struct section *appconfig_section_create(struct config *root, const char *section) { + netdata_log_debug(D_CONFIG, "Creating section '%s'.", section); + + struct section *co = callocz(1, sizeof(struct section)); + co->name = strdupz(section); + co->hash = simple_hash(co->name); + netdata_mutex_init(&co->mutex); + + avl_init_lock(&co->values_index, appconfig_option_compare); + + if(unlikely(appconfig_index_add(root, co) != co)) + netdata_log_error("INTERNAL ERROR: indexing of section '%s', already exists.", co->name); + + appconfig_wrlock(root); + struct section *co2 = root->last_section; + if(co2) { + co2->next = co; + } else { + root->first_section = co; + } + root->last_section = co; + appconfig_unlock(root); + + return co; +} + +void appconfig_section_destroy_non_loaded(struct config *root, const char *section) +{ + struct section *co; + struct config_option *cv, *cv_next; + + netdata_log_debug(D_CONFIG, "Destroying section '%s'.", section); + + co = appconfig_section_find(root, section); + if(!co) { + netdata_log_error("Could not destroy section '%s'. Not found.", section); + return; + } + + config_section_wrlock(co); + for(cv = co->values; cv ; cv = cv->next) { + if (cv->flags & CONFIG_VALUE_LOADED) { + /* Do not destroy values that were loaded from the configuration files. 
*/ + config_section_unlock(co); + return; + } + } + for(cv = co->values ; cv ; cv = cv_next) { + cv_next = cv->next; + if(unlikely(!appconfig_option_index_del(co, cv))) + netdata_log_error("Cannot remove config option '%s' from section '%s'.", cv->name, co->name); + freez(cv->value); + freez(cv->name); + freez(cv); + } + co->values = NULL; + config_section_unlock(co); + + if (unlikely(!appconfig_index_del(root, co))) { + netdata_log_error("Cannot remove section '%s' from config.", section); + return; + } + + appconfig_wrlock(root); + + if (root->first_section == co) { + root->first_section = co->next; + + if (root->last_section == co) + root->last_section = root->first_section; + } else { + struct section *co_cur = root->first_section, *co_prev = NULL; + + while(co_cur && co_cur != co) { + co_prev = co_cur; + co_cur = co_cur->next; + } + + if (co_cur) { + co_prev->next = co_cur->next; + + if (root->last_section == co_cur) + root->last_section = co_prev; + } + } + + appconfig_unlock(root); + + avl_destroy_lock(&co->values_index); + freez(co->name); + pthread_mutex_destroy(&co->mutex); + freez(co); +} + +void appconfig_section_option_destroy_non_loaded(struct config *root, const char *section, const char *name) +{ + netdata_log_debug(D_CONFIG, "Destroying section option '%s -> %s'.", section, name); + + struct section *co; + co = appconfig_section_find(root, section); + if (!co) { + netdata_log_error("Could not destroy section option '%s -> %s'. The section not found.", section, name); + return; + } + + config_section_wrlock(co); + + struct config_option *cv; + + cv = appconfig_option_index_find(co, name, simple_hash(name)); + + if (cv && cv->flags & CONFIG_VALUE_LOADED) { + config_section_unlock(co); + return; + } + + if (unlikely(!(cv && appconfig_option_index_del(co, cv)))) { + config_section_unlock(co); + netdata_log_error("Could not destroy section option '%s -> %s'. 
The option not found.", section, name); + return; + } + + if (co->values == cv) { + co->values = co->values->next; + } else { + struct config_option *cv_cur = co->values, *cv_prev = NULL; + while (cv_cur && cv_cur != cv) { + cv_prev = cv_cur; + cv_cur = cv_cur->next; + } + if (cv_cur) { + cv_prev->next = cv_cur->next; + } + } + + freez(cv->value); + freez(cv->name); + freez(cv); + + config_section_unlock(co); + return; +} + +// ---------------------------------------------------------------------------- +// config name-value methods + +static inline struct config_option *appconfig_value_create(struct section *co, const char *name, const char *value) { + netdata_log_debug(D_CONFIG, "Creating config entry for name '%s', value '%s', in section '%s'.", name, value, co->name); + + struct config_option *cv = callocz(1, sizeof(struct config_option)); + cv->name = strdupz(name); + cv->hash = simple_hash(cv->name); + cv->value = strdupz(value); + + struct config_option *found = appconfig_option_index_add(co, cv); + if(found != cv) { + netdata_log_error("indexing of config '%s' in section '%s': already exists - using the existing one.", cv->name, co->name); + freez(cv->value); + freez(cv->name); + freez(cv); + return found; + } + + config_section_wrlock(co); + struct config_option *cv2 = co->values; + if(cv2) { + while (cv2->next) cv2 = cv2->next; + cv2->next = cv; + } + else co->values = cv; + config_section_unlock(co); + + return cv; +} + +int appconfig_exists(struct config *root, const char *section, const char *name) { + struct config_option *cv; + + netdata_log_debug(D_CONFIG, "request to get config in section '%s', name '%s'", section, name); + + struct section *co = appconfig_section_find(root, section); + if(!co) return 0; + + cv = appconfig_option_index_find(co, name, 0); + if(!cv) return 0; + + return 1; +} + +int appconfig_move(struct config *root, const char *section_old, const char *name_old, const char *section_new, const char *name_new) { + struct config_option *cv_old, *cv_new; + int ret = -1; + + netdata_log_debug(D_CONFIG, "request to rename config in section '%s', old name '%s', to section '%s', new name '%s'", section_old, name_old, section_new, name_new); + + struct section *co_old = appconfig_section_find(root, section_old); + if(!co_old) return ret; + + struct section *co_new = appconfig_section_find(root, section_new); + if(!co_new) co_new = appconfig_section_create(root, section_new); + + config_section_wrlock(co_old); + if(co_old != co_new) + config_section_wrlock(co_new); + + cv_old = appconfig_option_index_find(co_old, name_old, 0); + if(!cv_old) goto cleanup; + + cv_new = appconfig_option_index_find(co_new, name_new, 0); + if(cv_new) goto cleanup; + + if(unlikely(appconfig_option_index_del(co_old, cv_old) != cv_old)) + netdata_log_error("INTERNAL ERROR: deletion of config '%s' from section '%s', deleted the wrong config entry.", cv_old->name, co_old->name); + + if(co_old->values == cv_old) { + co_old->values = cv_old->next; + } + else { + struct config_option *t; + for(t = co_old->values; t && t->next != cv_old ;t = t->next) ; + if(!t || t->next != cv_old) + netdata_log_error("INTERNAL ERROR: cannot find variable '%s' in section '%s' of the config - but it should be there.", cv_old->name, co_old->name); + else + t->next = cv_old->next; + } + + freez(cv_old->name); + cv_old->name = strdupz(name_new); + cv_old->hash = simple_hash(cv_old->name); + + cv_new = cv_old; + cv_new->next = co_new->values; + co_new->values = cv_new; + + 
if(unlikely(appconfig_option_index_add(co_new, cv_old) != cv_old)) + netdata_log_error("INTERNAL ERROR: re-indexing of config '%s' in section '%s', already exists.", cv_old->name, co_new->name); + + ret = 0; + +cleanup: + if(co_old != co_new) + config_section_unlock(co_new); + config_section_unlock(co_old); + return ret; +} + +char *appconfig_get_by_section(struct section *co, const char *name, const char *default_value) +{ + struct config_option *cv; + + // Only calls internal to this file check for a NULL result and they do not supply a NULL arg. + // External caller should treat NULL as an error case. + cv = appconfig_option_index_find(co, name, 0); + if (!cv) { + if (!default_value) return NULL; + cv = appconfig_value_create(co, name, default_value); + if (!cv) return NULL; + } + cv->flags |= CONFIG_VALUE_USED; + + if((cv->flags & CONFIG_VALUE_LOADED) || (cv->flags & CONFIG_VALUE_CHANGED)) { + // this is a loaded value from the config file + // if it is different than the default, mark it + if(!(cv->flags & CONFIG_VALUE_CHECKED)) { + if(default_value && strcmp(cv->value, default_value) != 0) cv->flags |= CONFIG_VALUE_CHANGED; + cv->flags |= CONFIG_VALUE_CHECKED; + } + } + + return(cv->value); +} + + +char *appconfig_get(struct config *root, const char *section, const char *name, const char *default_value) +{ + if (default_value == NULL) + netdata_log_debug(D_CONFIG, "request to get config in section '%s', name '%s' or fail", section, name); + else + netdata_log_debug(D_CONFIG, "request to get config in section '%s', name '%s', default_value '%s'", section, name, default_value); + + struct section *co = appconfig_section_find(root, section); + if (!co && !default_value) + return NULL; + if(!co) co = appconfig_section_create(root, section); + + return appconfig_get_by_section(co, name, default_value); +} + +long long appconfig_get_number(struct config *root, const char *section, const char *name, long long value) +{ + char buffer[100], *s; + sprintf(buffer, "%lld", value); + + s = appconfig_get(root, section, name, buffer); + if(!s) return value; + + return strtoll(s, NULL, 0); +} + +NETDATA_DOUBLE appconfig_get_float(struct config *root, const char *section, const char *name, NETDATA_DOUBLE value) +{ + char buffer[100], *s; + sprintf(buffer, "%0.5" NETDATA_DOUBLE_MODIFIER, value); + + s = appconfig_get(root, section, name, buffer); + if(!s) return value; + + return str2ndd(s, NULL); +} + +inline int appconfig_test_boolean_value(char *s) { + if(!strcasecmp(s, "yes") || !strcasecmp(s, "true") || !strcasecmp(s, "on") + || !strcasecmp(s, "auto") || !strcasecmp(s, "on demand")) + return 1; + + return 0; +} + +int appconfig_get_boolean_by_section(struct section *co, const char *name, int value) { + char *s; + + s = appconfig_get_by_section(co, name, (!value)?"no":"yes"); + if(!s) return value; + + return appconfig_test_boolean_value(s); +} + +int appconfig_get_boolean(struct config *root, const char *section, const char *name, int value) +{ + char *s; + if(value) s = "yes"; + else s = "no"; + + s = appconfig_get(root, section, name, s); + if(!s) return value; + + return appconfig_test_boolean_value(s); +} + +int appconfig_get_boolean_ondemand(struct config *root, const char *section, const char *name, int value) +{ + char *s; + + if(value == CONFIG_BOOLEAN_AUTO) + s = "auto"; + + else if(value == CONFIG_BOOLEAN_NO) + s = "no"; + + else + s = "yes"; + + s = appconfig_get(root, section, name, s); + if(!s) return value; + + if(!strcmp(s, "yes") || !strcmp(s, "true") || !strcmp(s, "on")) + return 
CONFIG_BOOLEAN_YES;
+    else if(!strcmp(s, "no") || !strcmp(s, "false") || !strcmp(s, "off"))
+        return CONFIG_BOOLEAN_NO;
+    else if(!strcmp(s, "auto") || !strcmp(s, "on demand"))
+        return CONFIG_BOOLEAN_AUTO;
+
+    return value;
+}
+
+const char *appconfig_set_default(struct config *root, const char *section, const char *name, const char *value)
+{
+    struct config_option *cv;
+
+    netdata_log_debug(D_CONFIG, "request to set default config in section '%s', name '%s', value '%s'", section, name, value);
+
+    struct section *co = appconfig_section_find(root, section);
+    if(!co) return appconfig_set(root, section, name, value);
+
+    cv = appconfig_option_index_find(co, name, 0);
+    if(!cv) return appconfig_set(root, section, name, value);
+
+    cv->flags |= CONFIG_VALUE_USED;
+
+    if(cv->flags & CONFIG_VALUE_LOADED)
+        return cv->value;
+
+    if(strcmp(cv->value, value) != 0) {
+        cv->flags |= CONFIG_VALUE_CHANGED;
+
+        freez(cv->value);
+        cv->value = strdupz(value);
+    }
+
+    return cv->value;
+}
+
+const char *appconfig_set(struct config *root, const char *section, const char *name, const char *value)
+{
+    struct config_option *cv;
+
+    netdata_log_debug(D_CONFIG, "request to set config in section '%s', name '%s', value '%s'", section, name, value);
+
+    struct section *co = appconfig_section_find(root, section);
+    if(!co) co = appconfig_section_create(root, section);
+
+    cv = appconfig_option_index_find(co, name, 0);
+    if(!cv) cv = appconfig_value_create(co, name, value);
+    cv->flags |= CONFIG_VALUE_USED;
+
+    if(strcmp(cv->value, value) != 0) {
+        cv->flags |= CONFIG_VALUE_CHANGED;
+
+        freez(cv->value);
+        cv->value = strdupz(value);
+    }
+
+    return value;
+}
+
+long long appconfig_set_number(struct config *root, const char *section, const char *name, long long value)
+{
+    char buffer[100];
+    sprintf(buffer, "%lld", value);
+
+    appconfig_set(root, section, name, buffer);
+
+    return value;
+}
+
+NETDATA_DOUBLE appconfig_set_float(struct config *root, const char *section, const char *name, NETDATA_DOUBLE value)
+{
+    char buffer[100];
+    sprintf(buffer, "%0.5" NETDATA_DOUBLE_MODIFIER, value);
+
+    appconfig_set(root, section, name, buffer);
+
+    return value;
+}
+
+int appconfig_set_boolean(struct config *root, const char *section, const char *name, int value)
+{
+    char *s;
+    if(value) s = "yes";
+    else s = "no";
+
+    appconfig_set(root, section, name, s);
+
+    return value;
+}
+
+int appconfig_get_duration(struct config *root, const char *section, const char *name, const char *value)
+{
+    int result = 0;
+    const char *s;
+
+    s = appconfig_get(root, section, name, value);
+    if(!s) goto fallback;
+
+    if(!config_parse_duration(s, &result)) {
+        netdata_log_error("config option '[%s].%s = %s' is configured with an invalid duration", section, name, s);
+        goto fallback;
+    }
+
+    return result;
+
+fallback:
+    if(!config_parse_duration(value, &result))
+        netdata_log_error("INTERNAL ERROR: default duration supplied for option '[%s].%s = %s' is not a valid duration", section, name, value);
+
+    return result;
+}
+
+// ----------------------------------------------------------------------------
+// config load/save
+
+int appconfig_load(struct config *root, char *filename, int overwrite_used, const char *section_name)
+{
+    int line = 0;
+    struct section *co = NULL;
+    int is_exporter_config = 0;
+    int _connectors = 0; // number of exporting connector sections we have
+    char working_instance[CONFIG_MAX_NAME + 1];
+    char working_connector[CONFIG_MAX_NAME + 1];
+    struct section *working_connector_section = NULL;
+    int
global_exporting_section = 0; + + char buffer[CONFIG_FILE_LINE_MAX + 1], *s; + + if(!filename) filename = CONFIG_DIR "/" CONFIG_FILENAME; + + netdata_log_debug(D_CONFIG, "CONFIG: opening config file '%s'", filename); + + FILE *fp = fopen(filename, "r"); + if(!fp) { + // netdata_log_info("CONFIG: cannot open file '%s'. Using internal defaults.", filename); + return 0; + } + + uint32_t section_hash = 0; + if(section_name) { + section_hash = simple_hash(section_name); + } + is_exporter_config = (strstr(filename, EXPORTING_CONF) != NULL); + + while(fgets(buffer, CONFIG_FILE_LINE_MAX, fp) != NULL) { + buffer[CONFIG_FILE_LINE_MAX] = '\0'; + line++; + + s = trim(buffer); + if(!s || *s == '#') { + netdata_log_debug(D_CONFIG, "CONFIG: ignoring line %d of file '%s', it is empty.", line, filename); + continue; + } + + int len = (int) strlen(s); + if(*s == '[' && s[len - 1] == ']') { + // new section + s[len - 1] = '\0'; + s++; + + if (is_exporter_config) { + global_exporting_section = + !(strcmp(s, CONFIG_SECTION_EXPORTING)) || !(strcmp(s, CONFIG_SECTION_PROMETHEUS)); + if (unlikely(!global_exporting_section)) { + int rc; + rc = is_valid_connector(s, 0); + if (likely(rc)) { + strncpyz(working_connector, s, CONFIG_MAX_NAME); + s = s + rc + 1; + if (unlikely(!(*s))) { + _connectors++; + sprintf(buffer, "instance_%d", _connectors); + s = buffer; + } + strncpyz(working_instance, s, CONFIG_MAX_NAME); + working_connector_section = NULL; + if (unlikely(appconfig_section_find(root, working_instance))) { + netdata_log_error("Instance (%s) already exists", working_instance); + co = NULL; + continue; + } + } else { + co = NULL; + netdata_log_error("Section (%s) does not specify a valid connector", s); + continue; + } + } + } + + co = appconfig_section_find(root, s); + if(!co) co = appconfig_section_create(root, s); + + if(co && section_name && overwrite_used && section_hash == co->hash && !strcmp(section_name, co->name)) { + config_section_wrlock(co); + struct config_option *cv2 = co->values; + while (cv2) { + struct config_option *save = cv2->next; + struct config_option *found = appconfig_option_index_del(co, cv2); + if(found != cv2) + netdata_log_error("INTERNAL ERROR: Cannot remove '%s' from section '%s', it was not inserted before.", + cv2->name, co->name); + + freez(cv2->name); + freez(cv2->value); + freez(cv2); + cv2 = save; + } + co->values = NULL; + config_section_unlock(co); + } + + continue; + } + + if(!co) { + // line outside a section + netdata_log_error("CONFIG: ignoring line %d ('%s') of file '%s', it is outside all sections.", line, s, filename); + continue; + } + + if(section_name && overwrite_used && section_hash != co->hash && strcmp(section_name, co->name)) { + continue; + } + + char *name = s; + char *value = strchr(s, '='); + if(!value) { + netdata_log_error("CONFIG: ignoring line %d ('%s') of file '%s', there is no = in it.", line, s, filename); + continue; + } + *value = '\0'; + value++; + + name = trim(name); + value = trim(value); + + if(!name || *name == '#') { + netdata_log_error("CONFIG: ignoring line %d of file '%s', name is empty.", line, filename); + continue; + } + + if(!value) value = ""; + + struct config_option *cv = appconfig_option_index_find(co, name, 0); + + if (!cv) { + cv = appconfig_value_create(co, name, value); + if (likely(is_exporter_config) && unlikely(!global_exporting_section)) { + if (unlikely(!working_connector_section)) { + working_connector_section = appconfig_section_find(root, working_connector); + if (!working_connector_section) + 
working_connector_section = appconfig_section_create(root, working_connector); + if (likely(working_connector_section)) { + add_connector_instance(working_connector_section, co); + } + } + } + } else { + if (((cv->flags & CONFIG_VALUE_USED) && overwrite_used) || !(cv->flags & CONFIG_VALUE_USED)) { + netdata_log_debug( + D_CONFIG, "CONFIG: line %d of file '%s', overwriting '%s/%s'.", line, filename, co->name, cv->name); + freez(cv->value); + cv->value = strdupz(value); + } else + netdata_log_debug( + D_CONFIG, + "CONFIG: ignoring line %d of file '%s', '%s/%s' is already present and used.", + line, + filename, + co->name, + cv->name); + } + cv->flags |= CONFIG_VALUE_LOADED; + } + + fclose(fp); + + return 1; +} + +void appconfig_generate(struct config *root, BUFFER *wb, int only_changed) +{ + int i, pri; + struct section *co; + struct config_option *cv; + + { + int found_host_labels = 0; + for (co = root->first_section; co; co = co->next) + if(!strcmp(co->name, CONFIG_SECTION_HOST_LABEL)) + found_host_labels = 1; + + if(!found_host_labels) { + appconfig_section_create(root, CONFIG_SECTION_HOST_LABEL); + appconfig_get(root, CONFIG_SECTION_HOST_LABEL, "name", "value"); + } + } + + buffer_strcat(wb, + "# netdata configuration\n" + "#\n" + "# You can download the latest version of this file, using:\n" + "#\n" + "# wget -O /etc/netdata/netdata.conf http://localhost:19999/netdata.conf\n" + "# or\n" + "# curl -o /etc/netdata/netdata.conf http://localhost:19999/netdata.conf\n" + "#\n" + "# You can uncomment and change any of the options below.\n" + "# The value shown in the commented settings, is the default value.\n" + "#\n" + "\n# global netdata configuration\n"); + + for(i = 0; i <= 17 ;i++) { + appconfig_wrlock(root); + for(co = root->first_section; co ; co = co->next) { + if(!strcmp(co->name, CONFIG_SECTION_GLOBAL)) pri = 0; + else if(!strcmp(co->name, CONFIG_SECTION_DB)) pri = 1; + else if(!strcmp(co->name, CONFIG_SECTION_DIRECTORIES)) pri = 2; + else if(!strcmp(co->name, CONFIG_SECTION_LOGS)) pri = 3; + else if(!strcmp(co->name, CONFIG_SECTION_ENV_VARS)) pri = 4; + else if(!strcmp(co->name, CONFIG_SECTION_HOST_LABEL)) pri = 5; + else if(!strcmp(co->name, CONFIG_SECTION_SQLITE)) pri = 6; + else if(!strcmp(co->name, CONFIG_SECTION_CLOUD)) pri = 7; + else if(!strcmp(co->name, CONFIG_SECTION_ML)) pri = 8; + else if(!strcmp(co->name, CONFIG_SECTION_HEALTH)) pri = 9; + else if(!strcmp(co->name, CONFIG_SECTION_WEB)) pri = 10; + else if(!strcmp(co->name, CONFIG_SECTION_WEBRTC)) pri = 11; + // by default, new sections will get pri = 12 (set at the end, below) + else if(!strcmp(co->name, CONFIG_SECTION_REGISTRY)) pri = 13; + else if(!strcmp(co->name, CONFIG_SECTION_GLOBAL_STATISTICS)) pri = 14; + else if(!strcmp(co->name, CONFIG_SECTION_PLUGINS)) pri = 15; + else if(!strcmp(co->name, CONFIG_SECTION_STATSD)) pri = 16; + else if(!strncmp(co->name, "plugin:", 7)) pri = 17; // << change the loop too if you change this + else pri = 12; // this is used for any new (currently unknown) sections + + if(i == pri) { + int loaded = 0; + int used = 0; + int changed = 0; + int count = 0; + + config_section_wrlock(co); + for(cv = co->values; cv ; cv = cv->next) { + used += (cv->flags & CONFIG_VALUE_USED)?1:0; + loaded += (cv->flags & CONFIG_VALUE_LOADED)?1:0; + changed += (cv->flags & CONFIG_VALUE_CHANGED)?1:0; + count++; + } + config_section_unlock(co); + + if(!count) continue; + if(only_changed && !changed && !loaded) continue; + + if(!used) { + buffer_sprintf(wb, "\n# section '%s' is not used.", co->name); + } + + 
buffer_sprintf(wb, "\n[%s]\n", co->name); + + config_section_wrlock(co); + for(cv = co->values; cv ; cv = cv->next) { + + if(used && !(cv->flags & CONFIG_VALUE_USED)) { + buffer_sprintf(wb, "\n\t# option '%s' is not used.\n", cv->name); + } + buffer_sprintf(wb, "\t%s%s = %s\n", ((!(cv->flags & CONFIG_VALUE_LOADED)) && (!(cv->flags & CONFIG_VALUE_CHANGED)) && (cv->flags & CONFIG_VALUE_USED))?"# ":"", cv->name, cv->value); + } + config_section_unlock(co); + } + } + appconfig_unlock(root); + } +} + +/** + * Parse Duration + * + * Parse the string setting the result + * + * @param string the timestamp string + * @param result the output variable + * + * @return It returns 1 on success and 0 otherwise + */ +int config_parse_duration(const char* string, int* result) { + while(*string && isspace((uint8_t)*string)) string++; + + if(unlikely(!*string)) goto fallback; + + if(*string == 'n' && !strcmp(string, "never")) { + // this is a valid option + *result = 0; + return 1; + } + + // make sure it is a number + if(!(isdigit((uint8_t)*string) || *string == '+' || *string == '-')) goto fallback; + + char *e = NULL; + NETDATA_DOUBLE n = str2ndd(string, &e); + if(e && *e) { + switch (*e) { + case 'Y': + *result = (int) (n * 31536000); + break; + case 'M': + *result = (int) (n * 2592000); + break; + case 'w': + *result = (int) (n * 604800); + break; + case 'd': + *result = (int) (n * 86400); + break; + case 'h': + *result = (int) (n * 3600); + break; + case 'm': + *result = (int) (n * 60); + break; + case 's': + default: + *result = (int) (n); + break; + } + } + else + *result = (int)(n); + + return 1; + + fallback: + *result = 0; + return 0; +} + +struct section *appconfig_get_section(struct config *root, const char *name) +{ + return appconfig_section_find(root, name); +} diff --git a/src/libnetdata/config/appconfig.h b/src/libnetdata/config/appconfig.h new file mode 100644 index 00000000..214a15ed --- /dev/null +++ b/src/libnetdata/config/appconfig.h @@ -0,0 +1,220 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +/* + * This section manages ini config files, like netdata.conf and stream.conf + * + * It is organized like this: + * + * struct config (i.e. netdata.conf or stream.conf) + * .sections = a linked list of struct section + * .mutex = a mutex to protect the above linked list due to multi-threading + * .index = an AVL tree of struct section + * + * struct section (i.e. [global] or [health] of netdata.conf) + * .value = a linked list of struct config_option + * .mutex = a mutex to protect the above linked list due to multi-threading + * .value_index = an AVL tree of struct config_option + * + * struct config_option (ie. a name-value pair for each ini file option) + * + * The following operations on name-value options are supported: + * SET to set the value of an option + * SET DEFAULT to set the value and the default value of an option + * GET to get the value of an option + * EXISTS to check if an option exists + * MOVE to move an option from a section to another section, and/or rename it + * + * GET and SET operations are provided for the following data types: + * STRING + * NUMBER (long long) + * FLOAT (long double) + * BOOLEAN (false, true) + * BOOLEAN ONDEMAND (false, true, auto) + * + * GET and SET operations create struct config_option, if it is not already present. + * This allows netdata to run even without netdata.conf and stream.conf. 
The internal
+ * defaults are used to create the structure that should exist in the ini file, and the config
+ * file can be downloaded from the server.
+ *
+ * Also, 2 operations are supported for the whole config file:
+ *
+ *    LOAD      To load the ini file from disk
+ *    GENERATE  To generate the ini file (this is used to download the ini file from the server)
+ *
+ * For each option (name-value pair), the system maintains 4 flags:
+ *    LOADED    to indicate that the value has been loaded from the file
+ *    USED      to indicate that netdata used the value
+ *    CHANGED   to indicate that the value has been changed from the loaded value or the internal default value
+ *    CHECKED   is used internally for optimization (to avoid an strcmp() every time GET is called).
+ *
+ * TODO:
+ * 1. The linked lists and the mutexes can be removed and the AVL trees can become DICTIONARY.
+ *    This part of the code was written before we added traversal to AVL.
+ *
+ * 2. High-level data types could be supported, to simplify the rest of the code:
+ *    MULTIPLE CHOICE   to let the user select one of the supported keywords
+ *                      this would allow users to see in comments the available options
+ *
+ *    SIMPLE PATTERN    to let the user define netdata SIMPLE PATTERNS
+ *
+ * 3. Sorting of options should be supported.
+ *    Today, when the ini file is downloaded from the server, the options are shown in the order
+ *    they appear in the linked list (the order they were added, listing changed options first).
+ *    If we remove the linked list, the order they appear in the AVL tree will be used (which is
+ *    random due to simple_hash()).
+ *    Ideally, we would support sorting of options when generating the ini file.
+ *
+ * 4. There is no free() operation. So, memory is freed only on netdata exit.
+ *
+ * 5. Avoid memory fragmentation.
+ *    Since entries are created from multiple threads and a lot of allocations are required
+ *    for each config_option, fragmentation can be a problem for IoT.
+ *
+ * 6. Although this way of managing options is quite flexible and dynamic, it wastes memory
+ *    for the names of the options. Since most of the option names are static, we could provide
+ *    a method to allocate only the dynamic option names.
+ */ + +#ifndef NETDATA_CONFIG_H +#define NETDATA_CONFIG_H 1 + +#include "../libnetdata.h" + +#define CONFIG_FILENAME "netdata.conf" + +#define CONFIG_SECTION_GLOBAL "global" +#define CONFIG_SECTION_DIRECTORIES "directories" +#define CONFIG_SECTION_LOGS "logs" +#define CONFIG_SECTION_ENV_VARS "environment variables" +#define CONFIG_SECTION_SQLITE "sqlite" +#define CONFIG_SECTION_WEB "web" +#define CONFIG_SECTION_WEBRTC "webrtc" +#define CONFIG_SECTION_STATSD "statsd" +#define CONFIG_SECTION_PLUGINS "plugins" +#define CONFIG_SECTION_CLOUD "cloud" +#define CONFIG_SECTION_REGISTRY "registry" +#define CONFIG_SECTION_HEALTH "health" +#define CONFIG_SECTION_STREAM "stream" +#define CONFIG_SECTION_ML "ml" +#define CONFIG_SECTION_EXPORTING "exporting:global" +#define CONFIG_SECTION_PROMETHEUS "prometheus:exporter" +#define CONFIG_SECTION_HOST_LABEL "host labels" +#define EXPORTING_CONF "exporting.conf" +#define CONFIG_SECTION_GLOBAL_STATISTICS "global statistics" +#define CONFIG_SECTION_DB "db" + + +// these are used to limit the configuration names and values lengths +// they are not enforced by config.c functions (they will strdup() all strings, no matter of their length) +#define CONFIG_MAX_NAME 1024 +#define CONFIG_MAX_VALUE 2048 + +// ---------------------------------------------------------------------------- +// Config definitions +#define CONFIG_FILE_LINE_MAX ((CONFIG_MAX_NAME + CONFIG_MAX_VALUE + 1024) * 2) + +#define CONFIG_VALUE_LOADED 0x01 // has been loaded from the config +#define CONFIG_VALUE_USED 0x02 // has been accessed from the program +#define CONFIG_VALUE_CHANGED 0x04 // has been changed from the loaded value or the internal default value +#define CONFIG_VALUE_CHECKED 0x08 // has been checked if the value is different from the default + +struct config_option { + avl_t avl_node; // the index entry of this entry - this has to be first! + + uint8_t flags; + uint32_t hash; // a simple hash to speed up searching + // we first compare hashes, and only if the hashes are equal we do string comparisons + + char *name; + char *value; + + struct config_option *next; // config->mutex protects just this +}; + +struct section { + avl_t avl_node; // the index entry of this section - this has to be first! 
+ + uint32_t hash; // a simple hash to speed up searching + // we first compare hashes, and only if the hashes are equal we do string comparisons + + char *name; + + struct section *next; // global config_mutex protects just this + + struct config_option *values; + avl_tree_lock values_index; + + netdata_mutex_t mutex; // this locks only the writers, to ensure atomic updates + // readers are protected using the rwlock in avl_tree_lock +}; + +struct config { + struct section *first_section; + struct section *last_section; // optimize inserting at the end + netdata_mutex_t mutex; + avl_tree_lock index; +}; + +#define CONFIG_BOOLEAN_INVALID 100 // an invalid value to check for validity (used as default initialization when needed) + +#define CONFIG_BOOLEAN_NO 0 // disabled +#define CONFIG_BOOLEAN_YES 1 // enabled + +#ifndef CONFIG_BOOLEAN_AUTO +#define CONFIG_BOOLEAN_AUTO 2 // enabled if it has useful info when enabled +#endif + +int appconfig_load(struct config *root, char *filename, int overwrite_used, const char *section_name); +void config_section_wrlock(struct section *co); +void config_section_unlock(struct section *co); + +char *appconfig_get_by_section(struct section *co, const char *name, const char *default_value); +char *appconfig_get(struct config *root, const char *section, const char *name, const char *default_value); +long long appconfig_get_number(struct config *root, const char *section, const char *name, long long value); +NETDATA_DOUBLE appconfig_get_float(struct config *root, const char *section, const char *name, NETDATA_DOUBLE value); +int appconfig_get_boolean_by_section(struct section *co, const char *name, int value); +int appconfig_get_boolean(struct config *root, const char *section, const char *name, int value); +int appconfig_get_boolean_ondemand(struct config *root, const char *section, const char *name, int value); +int appconfig_get_duration(struct config *root, const char *section, const char *name, const char *value); + +const char *appconfig_set(struct config *root, const char *section, const char *name, const char *value); +const char *appconfig_set_default(struct config *root, const char *section, const char *name, const char *value); +long long appconfig_set_number(struct config *root, const char *section, const char *name, long long value); +NETDATA_DOUBLE appconfig_set_float(struct config *root, const char *section, const char *name, NETDATA_DOUBLE value); +int appconfig_set_boolean(struct config *root, const char *section, const char *name, int value); + +int appconfig_exists(struct config *root, const char *section, const char *name); +int appconfig_move(struct config *root, const char *section_old, const char *name_old, const char *section_new, const char *name_new); + +void appconfig_generate(struct config *root, BUFFER *wb, int only_changed); + +int appconfig_section_compare(void *a, void *b); + +void appconfig_section_destroy_non_loaded(struct config *root, const char *section); +void appconfig_section_option_destroy_non_loaded(struct config *root, const char *section, const char *name); + +int config_parse_duration(const char* string, int* result); + +struct section *appconfig_get_section(struct config *root, const char *name); + +void appconfig_wrlock(struct config *root); +void appconfig_unlock(struct config *root); + +int appconfig_test_boolean_value(char *s); + +struct connector_instance { + char instance_name[CONFIG_MAX_NAME + 1]; + char connector_name[CONFIG_MAX_NAME + 1]; +}; + +typedef struct _connector_instance { + struct section 
*connector; // actual connector + struct section *instance; // This instance + char instance_name[CONFIG_MAX_NAME + 1]; + char connector_name[CONFIG_MAX_NAME + 1]; + struct _connector_instance *next; // Next instance +} _CONNECTOR_INSTANCE; + +_CONNECTOR_INSTANCE *add_connector_instance(struct section *connector, struct section *instance); + +#endif /* NETDATA_CONFIG_H */
\ No newline at end of file diff --git a/src/libnetdata/config/dyncfg.c b/src/libnetdata/config/dyncfg.c new file mode 100644 index 00000000..244864c6 --- /dev/null +++ b/src/libnetdata/config/dyncfg.c @@ -0,0 +1,316 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../../libnetdata/libnetdata.h" + +// ---------------------------------------------------------------------------- + +static struct { + DYNCFG_TYPE type; + const char *name; +} dyncfg_types[] = { + { .type = DYNCFG_TYPE_SINGLE, .name = "single" }, + { .type = DYNCFG_TYPE_TEMPLATE, .name = "template" }, + { .type = DYNCFG_TYPE_JOB, .name = "job" }, +}; + +DYNCFG_TYPE dyncfg_type2id(const char *type) { + if(!type || !*type) + return DYNCFG_TYPE_SINGLE; + + size_t entries = sizeof(dyncfg_types) / sizeof(dyncfg_types[0]); + for(size_t i = 0; i < entries ;i++) { + if(strcmp(dyncfg_types[i].name, type) == 0) + return dyncfg_types[i].type; + } + + return DYNCFG_TYPE_SINGLE; +} + +const char *dyncfg_id2type(DYNCFG_TYPE type) { + size_t entries = sizeof(dyncfg_types) / sizeof(dyncfg_types[0]); + for(size_t i = 0; i < entries ;i++) { + if(type == dyncfg_types[i].type) + return dyncfg_types[i].name; + } + + return "single"; +} + +// ---------------------------------------------------------------------------- + +static struct { + DYNCFG_SOURCE_TYPE source_type; + const char *name; +} dyncfg_source_types[] = { + { .source_type = DYNCFG_SOURCE_TYPE_INTERNAL, .name = "internal" }, + { .source_type = DYNCFG_SOURCE_TYPE_STOCK, .name = "stock" }, + { .source_type = DYNCFG_SOURCE_TYPE_USER, .name = "user" }, + { .source_type = DYNCFG_SOURCE_TYPE_DYNCFG, .name = "dyncfg" }, + { .source_type = DYNCFG_SOURCE_TYPE_DISCOVERED, .name = "discovered" }, +}; + +DYNCFG_SOURCE_TYPE dyncfg_source_type2id(const char *source_type) { + if(!source_type || !*source_type) + return DYNCFG_SOURCE_TYPE_INTERNAL; + + size_t entries = sizeof(dyncfg_source_types) / sizeof(dyncfg_source_types[0]); + for(size_t i = 0; i < entries ;i++) { + if(strcmp(dyncfg_source_types[i].name, source_type) == 0) + return dyncfg_source_types[i].source_type; + } + + return DYNCFG_SOURCE_TYPE_INTERNAL; +} + +const char *dyncfg_id2source_type(DYNCFG_SOURCE_TYPE source_type) { + size_t entries = sizeof(dyncfg_source_types) / sizeof(dyncfg_source_types[0]); + for(size_t i = 0; i < entries ;i++) { + if(source_type == dyncfg_source_types[i].source_type) + return dyncfg_source_types[i].name; + } + + return "internal"; +} + +// ---------------------------------------------------------------------------- + +static struct { + DYNCFG_STATUS status; + const char *name; +} dyncfg_statuses[] = { + { .status = DYNCFG_STATUS_NONE, .name = "none" }, + { .status = DYNCFG_STATUS_ACCEPTED, .name = "accepted" }, + { .status = DYNCFG_STATUS_RUNNING, .name = "running" }, + { .status = DYNCFG_STATUS_FAILED, .name = "failed" }, + { .status = DYNCFG_STATUS_DISABLED, .name = "disabled" }, + { .status = DYNCFG_STATUS_ORPHAN, .name = "orphan" }, + { .status = DYNCFG_STATUS_INCOMPLETE, .name = "incomplete" }, +}; + +DYNCFG_STATUS dyncfg_status2id(const char *status) { + if(!status || !*status) + return DYNCFG_STATUS_NONE; + + size_t entries = sizeof(dyncfg_statuses) / sizeof(dyncfg_statuses[0]); + for(size_t i = 0; i < entries ;i++) { + if(strcmp(dyncfg_statuses[i].name, status) == 0) + return dyncfg_statuses[i].status; + } + + return DYNCFG_STATUS_NONE; +} + +const char *dyncfg_id2status(DYNCFG_STATUS status) { + size_t entries = sizeof(dyncfg_statuses) / sizeof(dyncfg_statuses[0]); + for(size_t i = 0; i < 
entries ;i++) { + if(status == dyncfg_statuses[i].status) + return dyncfg_statuses[i].name; + } + + return "none"; +} + +// ---------------------------------------------------------------------------- + +static struct { + DYNCFG_CMDS cmd; + const char *name; +} cmd_map[] = { + { .cmd = DYNCFG_CMD_GET, .name = "get" }, + { .cmd = DYNCFG_CMD_SCHEMA, .name = "schema" }, + { .cmd = DYNCFG_CMD_UPDATE, .name = "update" }, + { .cmd = DYNCFG_CMD_ADD, .name = "add" }, + { .cmd = DYNCFG_CMD_TEST, .name = "test" }, + { .cmd = DYNCFG_CMD_REMOVE, .name = "remove" }, + { .cmd = DYNCFG_CMD_ENABLE, .name = "enable" }, + { .cmd = DYNCFG_CMD_DISABLE, .name = "disable" }, + { .cmd = DYNCFG_CMD_RESTART, .name = "restart" }, + { .cmd = DYNCFG_CMD_USERCONFIG, .name = "userconfig" }, +}; + +const char *dyncfg_id2cmd_one(DYNCFG_CMDS cmd) { + for (size_t i = 0; i < sizeof(cmd_map) / sizeof(cmd_map[0]); i++) { + if(cmd == cmd_map[i].cmd) + return cmd_map[i].name; + } + + return NULL; +} + +DYNCFG_CMDS dyncfg_cmds2id(const char *cmds) { + if(!cmds || !*cmds) + return DYNCFG_CMD_NONE; + + DYNCFG_CMDS result = DYNCFG_CMD_NONE; + const char *p = cmds; + size_t len, i; + + while (*p) { + // Skip any leading spaces + while (*p == ' ') p++; + + // Find the end of the current word + const char *end = p; + while (*end && *end != ' ') end++; + len = end - p; + + // Compare with known commands + for (i = 0; i < sizeof(cmd_map) / sizeof(cmd_map[0]); i++) { + if (strncmp(p, cmd_map[i].name, len) == 0 && cmd_map[i].name[len] == '\0') { + result |= cmd_map[i].cmd; + break; + } + } + + // Move to the next word + p = end; + } + + return result; +} + +void dyncfg_cmds2fp(DYNCFG_CMDS cmds, FILE *fp) { + for (size_t i = 0; i < sizeof(cmd_map) / sizeof(cmd_map[0]); i++) { + if(cmds & cmd_map[i].cmd) + fprintf(fp, "%s ", cmd_map[i].name); + } +} + +void dyncfg_cmds2json_array(DYNCFG_CMDS cmds, const char *key, BUFFER *wb) { + buffer_json_member_add_array(wb, key); + for (size_t i = 0; i < sizeof(cmd_map) / sizeof(cmd_map[0]); i++) { + if(cmds & cmd_map[i].cmd) + buffer_json_add_array_item_string(wb, cmd_map[i].name); + } + buffer_json_array_close(wb); +} + +void dyncfg_cmds2buffer(DYNCFG_CMDS cmds, BUFFER *wb) { + size_t added = 0; + for (size_t i = 0; i < sizeof(cmd_map) / sizeof(cmd_map[0]); i++) { + if(cmds & cmd_map[i].cmd) { + if(added) + buffer_fast_strcat(wb, " ", 1); + + buffer_strcat(wb, cmd_map[i].name); + added++; + } + } +} + +// ---------------------------------------------------------------------------- + +bool dyncfg_is_valid_id(const char *id) { + const char *s = id; + + while(*s) { + if(isspace((uint8_t)*s) || *s == '\'') return false; + s++; + } + + return true; +} + +static inline bool is_forbidden_char(char c) { + if(isspace((uint8_t)c) || !isprint((uint8_t)c)) + return true; + + switch(c) { + case '`': // good not to have this in filenames + case '$': // good not to have this in filenames + case '/': // unix does not support this + case ':': // windows does not support this + case '|': // windows does not support this + return true; + + default: + return false; + } +} + +char *dyncfg_escape_id_for_filename(const char *id) { + if (id == NULL) return NULL; + + // Allocate memory for the worst case, where every character is escaped. 
+ char *escaped = mallocz(strlen(id) * 3 + 1); // Each char can become '%XX', plus '\0' + if (!escaped) return NULL; + + const char *src = id; + char *dest = escaped; + + while (*src) { + if (is_forbidden_char(*src)) { + sprintf(dest, "%%%02X", (unsigned char)*src); + dest += 3; + } else { + *dest++ = *src; + } + src++; + } + + *dest = '\0'; + return escaped; +} + +// ---------------------------------------------------------------------------- + +int dyncfg_default_response(BUFFER *wb, int code, const char *msg) { + buffer_flush(wb); + wb->content_type = CT_APPLICATION_JSON; + wb->expires = now_realtime_sec(); + + buffer_json_initialize(wb, "\"", "\"", 0, true, BUFFER_JSON_OPTIONS_MINIFY); + buffer_json_member_add_uint64(wb, "status", code); + buffer_json_member_add_string(wb, "message", msg); + buffer_json_finalize(wb); + + return code; +} + +int dyncfg_node_find_and_call(DICTIONARY *dyncfg_nodes, const char *transaction, const char *function, + usec_t *stop_monotonic_ut, bool *cancelled, + BUFFER *payload, HTTP_ACCESS access, const char *source, BUFFER *result) { + if(!function || !*function) + return dyncfg_default_response(result, HTTP_RESP_BAD_REQUEST, "command received is empty"); + + char buf[strlen(function) + 1]; + memcpy(buf, function, sizeof(buf)); + + char *words[MAX_FUNCTION_PARAMETERS]; // an array of pointers for the words in this line + size_t num_words = quoted_strings_splitter_pluginsd(buf, words, MAX_FUNCTION_PARAMETERS); + + const char *id = get_word(words, num_words, 1); + const char *action = get_word(words, num_words, 2); + const char *add_name = get_word(words, num_words, 3); + + if(!id || !*id) + return dyncfg_default_response(result, HTTP_RESP_BAD_REQUEST, "dyncfg node: id is missing from the request"); + + if(!action || !*action) + return dyncfg_default_response(result, HTTP_RESP_BAD_REQUEST, "dyncfg node: action is missing from the request"); + + DYNCFG_CMDS cmd = dyncfg_cmds2id(action); + if(cmd == DYNCFG_CMD_NONE) + return dyncfg_default_response(result, HTTP_RESP_BAD_REQUEST, "dyncfg node: action given in request is unknown"); + + const DICTIONARY_ITEM *item = dictionary_get_and_acquire_item(dyncfg_nodes, id); + if(!item) + return dyncfg_default_response(result, HTTP_RESP_NOT_FOUND, "dyncfg node: id is not found"); + + struct dyncfg_node *df = dictionary_acquired_item_value(item); + + buffer_flush(result); + result->content_type = CT_APPLICATION_JSON; + + int code = df->cb(transaction, id, cmd, add_name, payload, stop_monotonic_ut, cancelled, result, access, source, df->data); + + if(!result->expires) + result->expires = now_realtime_sec(); + + if(!buffer_tostring(result)) + dyncfg_default_response(result, code, ""); + + dictionary_acquired_item_release(dyncfg_nodes, item); + + return code; +} diff --git a/src/libnetdata/config/dyncfg.h b/src/libnetdata/config/dyncfg.h new file mode 100644 index 00000000..e34dc548 --- /dev/null +++ b/src/libnetdata/config/dyncfg.h @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef LIBNETDATA_DYNCFG_H +#define LIBNETDATA_DYNCFG_H + +#define DYNCFG_VERSION (size_t)1 + +#define DYNCFG_RESP_SUCCESS(code) (code >= 200 && code <= 299) +#define DYNCFG_RESP_RUNNING 200 // accepted and running +#define DYNCFG_RESP_ACCEPTED 202 // accepted, but not running yet +#define DYNCFG_RESP_ACCEPTED_DISABLED 298 // accepted, but is disabled +#define DYNCFG_RESP_ACCEPTED_RESTART_REQUIRED 299 // accepted, but restart is required to apply it + +typedef enum __attribute__((packed)) { + DYNCFG_TYPE_SINGLE = 0, + 
DYNCFG_TYPE_TEMPLATE,
+    DYNCFG_TYPE_JOB,
+} DYNCFG_TYPE;
+DYNCFG_TYPE dyncfg_type2id(const char *type);
+const char *dyncfg_id2type(DYNCFG_TYPE type);
+
+typedef enum __attribute__((packed)) {
+    DYNCFG_SOURCE_TYPE_INTERNAL = 0,
+    DYNCFG_SOURCE_TYPE_STOCK,
+    DYNCFG_SOURCE_TYPE_USER,
+    DYNCFG_SOURCE_TYPE_DYNCFG,
+    DYNCFG_SOURCE_TYPE_DISCOVERED,
+} DYNCFG_SOURCE_TYPE;
+DYNCFG_SOURCE_TYPE dyncfg_source_type2id(const char *source_type);
+const char *dyncfg_id2source_type(DYNCFG_SOURCE_TYPE source_type);
+
+typedef enum __attribute__((packed)) {
+    DYNCFG_STATUS_NONE = 0,
+    DYNCFG_STATUS_ACCEPTED,     // the plugin has accepted the configuration
+    DYNCFG_STATUS_RUNNING,      // the plugin runs the accepted configuration
+    DYNCFG_STATUS_FAILED,       // the plugin fails to run the accepted configuration
+    DYNCFG_STATUS_DISABLED,     // the configuration is disabled by a user
+    DYNCFG_STATUS_ORPHAN,       // no plugin has claimed this configuration
+    DYNCFG_STATUS_INCOMPLETE,   // a special kind of failed configuration
+} DYNCFG_STATUS;
+DYNCFG_STATUS dyncfg_status2id(const char *status);
+const char *dyncfg_id2status(DYNCFG_STATUS status);
+
+typedef enum __attribute__((packed)) {
+    DYNCFG_CMD_NONE       = 0,
+    DYNCFG_CMD_GET        = (1 << 0),
+    DYNCFG_CMD_SCHEMA     = (1 << 1),
+    DYNCFG_CMD_UPDATE     = (1 << 2),
+    DYNCFG_CMD_ADD        = (1 << 3),
+    DYNCFG_CMD_TEST       = (1 << 4),
+    DYNCFG_CMD_REMOVE     = (1 << 5),
+    DYNCFG_CMD_ENABLE     = (1 << 6),
+    DYNCFG_CMD_DISABLE    = (1 << 7),
+    DYNCFG_CMD_RESTART    = (1 << 8),
+    DYNCFG_CMD_USERCONFIG = (1 << 9),
+} DYNCFG_CMDS;
+
+DYNCFG_CMDS dyncfg_cmds2id(const char *cmds);
+void dyncfg_cmds2buffer(DYNCFG_CMDS cmds, struct web_buffer *wb);
+void dyncfg_cmds2json_array(DYNCFG_CMDS cmds, const char *key, struct web_buffer *wb);
+void dyncfg_cmds2fp(DYNCFG_CMDS cmds, FILE *fp);
+const char *dyncfg_id2cmd_one(DYNCFG_CMDS cmd);
+
+bool dyncfg_is_valid_id(const char *id);
+char *dyncfg_escape_id_for_filename(const char *id);
+
+#include "../clocks/clocks.h"
+#include "../buffer/buffer.h"
+#include "../dictionary/dictionary.h"
+
+typedef int (*dyncfg_cb_t)(const char *transaction, const char *id, DYNCFG_CMDS cmd, const char *add_name,
+                           BUFFER *payload, usec_t *stop_monotonic_ut, bool *cancelled, BUFFER *result,
+                           HTTP_ACCESS access, const char *source, void *data);
+
+struct dyncfg_node {
+    DYNCFG_TYPE type;
+    DYNCFG_CMDS cmds;
+    dyncfg_cb_t cb;
+    void *data;
+};
+
+#define dyncfg_nodes_dictionary_create() dictionary_create_advanced(DICT_OPTION_FIXED_SIZE, NULL, sizeof(struct dyncfg_node))
+
+int dyncfg_default_response(BUFFER *wb, int code, const char *msg);
+
+int dyncfg_node_find_and_call(DICTIONARY *dyncfg_nodes, const char *transaction, const char *function,
+                              usec_t *stop_monotonic_ut, bool *cancelled,
+                              BUFFER *payload, HTTP_ACCESS access, const char *source, BUFFER *result);
+
+#endif //LIBNETDATA_DYNCFG_H
diff --git a/src/libnetdata/datetime/README.md b/src/libnetdata/datetime/README.md
new file mode 100644
index 00000000..303ba8bf
--- /dev/null
+++ b/src/libnetdata/datetime/README.md
@@ -0,0 +1,11 @@
+<!--
+title: "Datetime"
+custom_edit_url: https://github.com/netdata/netdata/edit/master/src/libnetdata/datetime/README.md
+sidebar_label: "Datetime"
+learn_topic_type: "Tasks"
+learn_rel_path: "Developers/libnetdata"
+-->
+
+# Datetime
+
+Formatting dates and timestamps.
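+
+As a quick illustration, the helpers in this directory can be used like this
+(a minimal sketch; the sample timestamps in the comments are hypothetical, and
+the buffer size comes from the `*_MAX_LENGTH` macros declared in the headers below):
+
+```c
+char buf[ISO8601_MAX_LENGTH];  // 64 bytes, large enough for all three formats
+usec_t now_ut = now_realtime_usec();
+
+// e.g. "2024-05-05T11:19:16.123456+02:00" - local time with microseconds
+iso8601_datetime_ut(buf, sizeof(buf), now_ut, ISO8601_LOCAL_TIMEZONE | ISO8601_MICROSECONDS);
+
+// e.g. "2024-05-05T09:19:16.123Z" - UTC with 3 fractional digits
+rfc3339_datetime_ut(buf, sizeof(buf), now_ut, 3, true);
+
+// e.g. "Sun, 05 May 2024 09:19:16 GMT" - the format used in HTTP date headers
+rfc7231_datetime_ut(buf, sizeof(buf), now_ut);
+```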
diff --git a/src/libnetdata/datetime/iso8601.c b/src/libnetdata/datetime/iso8601.c new file mode 100644 index 00000000..8e3f4e02 --- /dev/null +++ b/src/libnetdata/datetime/iso8601.c @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +size_t iso8601_datetime_ut(char *buffer, size_t len, usec_t now_ut, ISO8601_OPTIONS options) { + if(unlikely(!buffer || len == 0)) + return 0; + + time_t t = (time_t)(now_ut / USEC_PER_SEC); + struct tm *tmp, tmbuf; + + if(options & ISO8601_UTC) + // Use gmtime_r for UTC time conversion. + tmp = gmtime_r(&t, &tmbuf); + else + // Use localtime_r for local time conversion. + tmp = localtime_r(&t, &tmbuf); + + if (unlikely(!tmp)) { + buffer[0] = '\0'; + return 0; + } + + // Format the date and time according to the ISO 8601 format. + size_t used_length = strftime(buffer, len, "%Y-%m-%dT%H:%M:%S", tmp); + if (unlikely(used_length == 0)) { + buffer[0] = '\0'; + return 0; + } + + if(options & ISO8601_MILLISECONDS) { + // Calculate the remaining microseconds + int milliseconds = (int) ((now_ut % USEC_PER_SEC) / USEC_PER_MS); + if(milliseconds && len - used_length > 4) + used_length += snprintfz(buffer + used_length, len - used_length, ".%03d", milliseconds); + } + else if(options & ISO8601_MICROSECONDS) { + // Calculate the remaining microseconds + int microseconds = (int) (now_ut % USEC_PER_SEC); + if(microseconds && len - used_length > 7) + used_length += snprintfz(buffer + used_length, len - used_length, ".%06d", microseconds); + } + + if(options & ISO8601_UTC) { + if(used_length + 1 < len) { + buffer[used_length++] = 'Z'; + buffer[used_length] = '\0'; // null-terminate the string. + } + } + else { + // Calculate the timezone offset in hours and minutes from UTC. + long offset = tmbuf.tm_gmtoff; + int hours = (int) (offset / 3600); // Convert offset seconds to hours. + int minutes = (int) ((offset % 3600) / 60); // Convert remainder to minutes (keep the sign for minutes). + + // Check if timezone is UTC. + if(hours == 0 && minutes == 0) { + // For UTC, append 'Z' to the timestamp. + if(used_length + 1 < len) { + buffer[used_length++] = 'Z'; + buffer[used_length] = '\0'; // null-terminate the string. + } + } + else { + // For non-UTC, format the timezone offset. Omit minutes if they are zero. + if(minutes == 0) { + // Check enough space is available for the timezone offset string. + if(used_length + 3 < len) // "+hh\0" + used_length += snprintfz(buffer + used_length, len - used_length, "%+03d", hours); + } + else { + // Check enough space is available for the timezone offset string. 
+ if(used_length + 6 < len) // "+hh:mm\0" + used_length += snprintfz(buffer + used_length, len - used_length, + "%+03d:%02d", hours, abs(minutes)); + } + } + } + + return used_length; +} diff --git a/src/libnetdata/datetime/iso8601.h b/src/libnetdata/datetime/iso8601.h new file mode 100644 index 00000000..ce480096 --- /dev/null +++ b/src/libnetdata/datetime/iso8601.h @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +#ifndef NETDATA_ISO8601_H +#define NETDATA_ISO8601_H + +typedef enum __attribute__((__packed__)) { + ISO8601_UTC = (1 << 0), + ISO8601_LOCAL_TIMEZONE = (1 << 1), + ISO8601_MILLISECONDS = (1 << 2), + ISO8601_MICROSECONDS = (1 << 3), +} ISO8601_OPTIONS; + +#define ISO8601_MAX_LENGTH 64 +size_t iso8601_datetime_ut(char *buffer, size_t len, usec_t now_ut, ISO8601_OPTIONS options); + +#endif //NETDATA_ISO8601_H diff --git a/src/libnetdata/datetime/rfc3339.c b/src/libnetdata/datetime/rfc3339.c new file mode 100644 index 00000000..5c4e990d --- /dev/null +++ b/src/libnetdata/datetime/rfc3339.c @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +#include "rfc3339.h" + +size_t rfc3339_datetime_ut(char *buffer, size_t len, usec_t now_ut, size_t fractional_digits, bool utc) { + if (!buffer || len == 0) + return 0; + + time_t t = (time_t)(now_ut / USEC_PER_SEC); + struct tm *tmp, tmbuf; + + if (utc) + tmp = gmtime_r(&t, &tmbuf); + else + tmp = localtime_r(&t, &tmbuf); + + if (!tmp) { + buffer[0] = '\0'; + return 0; + } + + size_t used_length = strftime(buffer, len, "%Y-%m-%dT%H:%M:%S", tmp); + if (used_length == 0) { + buffer[0] = '\0'; + return 0; + } + + if (fractional_digits >= 1 && fractional_digits <= 9) { + int fractional_part = (int)(now_ut % USEC_PER_SEC); + if (fractional_part && len - used_length > fractional_digits + 1) { + char format[] = ".%01d"; + format[3] = (char)('0' + fractional_digits); + + // Adjust fractional part + fractional_part /= (int)pow(10, 6 - fractional_digits); + + used_length += snprintf(buffer + used_length, len - used_length, + format, fractional_part); + } + } + + if (utc) { + if (used_length + 1 < len) { + buffer[used_length++] = 'Z'; + buffer[used_length] = '\0'; + } + } + else { + long offset = tmbuf.tm_gmtoff; + int hours = (int)(offset / 3600); + int minutes = abs((int)((offset % 3600) / 60)); + + if (used_length + 7 < len) { // Space for "+HH:MM\0" + used_length += snprintf(buffer + used_length, len - used_length, "%+03d:%02d", hours, minutes); + } + } + + return used_length; +} + +usec_t rfc3339_parse_ut(const char *rfc3339, char **endptr) { + struct tm tm = { 0 }; + int tz_hours = 0, tz_mins = 0; + char *s; + usec_t timestamp, usec = 0; + + // Use strptime to parse up to seconds + s = strptime(rfc3339, "%Y-%m-%dT%H:%M:%S", &tm); + if (!s) + return 0; // Parsing error + + // Parse fractional seconds if present + if (*s == '.') { + char *next; + usec = strtoul(s + 1, &next, 10); + int digits_parsed = (int)(next - (s + 1)); + + if (digits_parsed < 1 || digits_parsed > 9) + return 0; // parsing error + + static const usec_t fix_usec[] = { + 1000000, // 0 digits (not used) + 100000, // 1 digit + 10000, // 2 digits + 1000, // 3 digits + 100, // 4 digits + 10, // 5 digits + 1, // 6 digits + 10, // 7 digits + 100, // 8 digits + 1000, // 9 digits + }; + usec = digits_parsed <= 6 ? 
usec * fix_usec[digits_parsed] : usec / fix_usec[digits_parsed]; + + s = next; + } + + // Check and parse timezone if present + int tz_offset = 0; + if (*s == '+' || *s == '-') { + // Parse the hours:mins part of the timezone + + if (!isdigit((uint8_t)s[1]) || !isdigit((uint8_t)s[2]) || s[3] != ':' || + !isdigit((uint8_t)s[4]) || !isdigit((uint8_t)s[5])) + return 0; // Parsing error + + char tz_sign = *s; + tz_hours = (s[1] - '0') * 10 + (s[2] - '0'); + tz_mins = (s[4] - '0') * 10 + (s[5] - '0'); + + tz_offset = tz_hours * 3600 + tz_mins * 60; + tz_offset *= (tz_sign == '+' ? 1 : -1); + + s += 6; // Move past the timezone part + } + else if (*s == 'Z') + s++; + else + return 0; // Invalid RFC 3339 format + + // Convert to time_t (assuming local time, then adjusting for timezone later) + time_t epoch_s = mktime(&tm); + if (epoch_s == -1) + return 0; // Error in time conversion + + timestamp = (usec_t)epoch_s * USEC_PER_SEC + usec; + timestamp -= tz_offset * USEC_PER_SEC; + + if(endptr) + *endptr = s; + + return timestamp; +} diff --git a/src/libnetdata/datetime/rfc3339.h b/src/libnetdata/datetime/rfc3339.h new file mode 100644 index 00000000..88ebb3ec --- /dev/null +++ b/src/libnetdata/datetime/rfc3339.h @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +#ifndef NETDATA_RFC3339_H +#define NETDATA_RFC3339_H + +#define RFC3339_MAX_LENGTH 36 +size_t rfc3339_datetime_ut(char *buffer, size_t len, usec_t now_ut, size_t fractional_digits, bool utc); +usec_t rfc3339_parse_ut(const char *rfc3339, char **endptr); + +#endif //NETDATA_RFC3339_H diff --git a/src/libnetdata/datetime/rfc7231.c b/src/libnetdata/datetime/rfc7231.c new file mode 100644 index 00000000..4925ed2c --- /dev/null +++ b/src/libnetdata/datetime/rfc7231.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +inline size_t rfc7231_datetime(char *buffer, size_t len, time_t now_t) { + if (unlikely(!buffer || !len)) + return 0; + + struct tm *tmp, tmbuf; + + // Use gmtime_r for UTC time conversion. + tmp = gmtime_r(&now_t, &tmbuf); + + if (unlikely(!tmp)) { + buffer[0] = '\0'; + return 0; + } + + // Format the date and time according to the RFC 7231 format. 
+ size_t ret = strftime(buffer, len, "%a, %d %b %Y %H:%M:%S GMT", tmp); + if (unlikely(ret == 0)) + buffer[0] = '\0'; + + return ret; +} + +size_t rfc7231_datetime_ut(char *buffer, size_t len, usec_t now_ut) { + return rfc7231_datetime(buffer, len, (time_t) (now_ut / USEC_PER_SEC)); +} diff --git a/src/libnetdata/datetime/rfc7231.h b/src/libnetdata/datetime/rfc7231.h new file mode 100644 index 00000000..5ba93053 --- /dev/null +++ b/src/libnetdata/datetime/rfc7231.h @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +#ifndef NETDATA_RFC7231_H +#define NETDATA_RFC7231_H + +#define RFC7231_MAX_LENGTH 30 +size_t rfc7231_datetime(char *buffer, size_t len, time_t now_t); +size_t rfc7231_datetime_ut(char *buffer, size_t len, usec_t now_ut); + +#endif //NETDATA_RFC7231_H diff --git a/src/libnetdata/dictionary/README.md b/src/libnetdata/dictionary/README.md new file mode 100644 index 00000000..59a8f6b9 --- /dev/null +++ b/src/libnetdata/dictionary/README.md @@ -0,0 +1,235 @@ +<!-- +custom_edit_url: https://github.com/netdata/netdata/edit/master/src/libnetdata/dictionary/README.md +sidebar_label: "Dictionaries" +learn_status: "Published" +learn_topic_type: "Tasks" +learn_rel_path: "Developers/libnetdata" +--> + +# Dictionaries + +Netdata dictionaries associate a `name` with a `value`: + +- A `name` can be any string. +- A `value` can be anything. + +Such a pair of a `name` and a `value` consists of an `item` or an `entry` in the dictionary. + +Dictionaries provide an interface to: + +- **Add** an item to the dictionary +- **Get** an item from the dictionary (provided its `name`) +- **Delete** an item from the dictionary (provided its `name`) +- **Traverse** the list of items in the dictionary + +Dictionaries are **ordered**, meaning that the order they have been added, is preserved while traversing them. The caller may reverse this order by passing the flag `DICT_OPTION_ADD_IN_FRONT` when creating the dictionary. + +Dictionaries guarantee **uniqueness** of all items added to them, meaning that only one item with a given `name` can exist in the dictionary at any given time. + +Dictionaries are extremely fast in all operations. They are indexing the keys with `JudyHS` and they utilize a double-linked-list for the traversal operations. Deletion is the most expensive operation, usually somewhat slower than insertion. + +## Memory management + +Dictionaries come with 2 memory management options: + +- **Clone** (copy) the `name` and/or the `value` to memory allocated by the dictionary. +- **Link** the `name` and/or the `value`, without allocating any memory about them. + +In **clone** mode, the dictionary guarantees that all operations on the dictionary items, will automatically take care of the memory used by the `name` and/or the `value`. In case the `value` is an object that needs to have user allocated memory, the following callback functions can be registered: + +1. `dictionary_register_insert_callback()` that can be called just after the insertion of an item to the dictionary, or after the replacement of the value of a dictionary item. +2. `dictionary_register_delete_callback()` that will be called just prior to the deletion of an item from the dictionary, or prior to the replacement of the value of a dictionary item. +3. `dictionary_register_conflict_callback()` that will be called when `DICT_OPTION_DONT_OVERWRITE_VALUE` is set, and another `value` is attempted to be inserted for the same key. +4. 
`dictionary_register_react_callback()` that will be called after the `insert` and the `conflict` callbacks. The `react` callback is called while the dictionary hash table is available to other threads.
+
+In **link** mode, the `name` and/or the `value` are just linked to the dictionary item, and it is the user's responsibility to free the memory they use after an item is deleted from the dictionary or when the dictionary is destroyed.
+
+By default, **clone** mode is used for both the name and the value.
+
+To use **link** mode for names, add `DICT_OPTION_NAME_LINK_DONT_CLONE` to the flags when creating the dictionary.
+
+To use **link** mode for values, add `DICT_OPTION_VALUE_LINK_DONT_CLONE` to the flags when creating the dictionary.
+
+## Locks
+
+The dictionary allows both **single-threaded** operation (no locks - faster) and **multi-threaded** operation utilizing a read-write lock.
+
+The default is **multi-threaded**. To enable **single-threaded** mode, add `DICT_OPTION_SINGLE_THREADED` to the flags when creating the dictionary.
+
+When in **multi-threaded** mode, the dictionaries have 2 independent R/W locks: one for the linked list and one for the hash table (index). An insertion and a deletion will acquire both independently (one after another) for as long as they are needed, but a traversal may hold the linked-list lock for longer durations. The hash table (index) lock may be acquired while the linked-list lock is held, but not the other way around (and the way the code is structured, it is not technically possible to hold the index lock and then acquire the linked-list lock).
+
+These locks are R/W locks. They allow multiple readers, but only one writer.
+
+Unlike POSIX read-write locks, the linked-list lock allows one writer to lock it multiple times. This is implemented so that a traversal of the dictionary items in write-lock mode allows the writing thread to call `dictionary_set()` or `dictionary_del()`, which alter the dictionary index and the linked list. Especially for the deletion of the item currently being traversed, the dictionary supports delayed removal: the item is removed from the index immediately and marked as deleted, so that it can be added to the dictionary again with a different value, while the traversal still proceeds from the point it was at.
+
+## Hash table operations
+
+The dictionary supports the following operations backed by the hash table:
+
+- `dictionary_set()` to add an item to the dictionary, or change its value.
+- `dictionary_get()` and `dictionary_get_and_acquire_item()` to get an item from the dictionary.
+- `dictionary_del()` to delete an item from the dictionary.
+
+For all these calls, there are also `*_advanced()` versions that support more parameters. Check the header file for more information about them.
+
+## Creation and destruction
+
+Use `dictionary_create()` to create a dictionary.
+
+Use `dictionary_destroy()` to destroy a dictionary. When destroyed, a dictionary frees all the memory it has allocated on its own. This can be complemented by the registration of a deletion callback function that will be called upon the deletion of each item in the dictionary, which may free additional resources linked to it.
+
+### dictionary_set()
+
+This call is used to:
+
+- **add** an item to the dictionary.
+- **reset** the value of an existing item in the dictionary.
+
+If **resetting** is not desired, add `DICT_OPTION_DONT_OVERWRITE_VALUE` to the flags when creating the dictionary. 
In this case, `dictionary_set()` will return the value of the original item found in the dictionary instead of resetting it, and the value passed to the call will be ignored. Optionally, a conflict callback function can be registered to manipulate (e.g. merge or extend) the original value, based on the new value attempted to be added to the dictionary.
+
+The format is:
+
+```c
+value = dictionary_set(dict, name, value, value_len);
+```
+
+Where:
+
+* `dict` is a pointer to the dictionary previously created.
+* `name` is a pointer to a string to be used as the key of this item. The name must not be `NULL` and must not be an empty string `""`.
+* `value` is a pointer to the value associated with this item. In **clone** mode, if `value` is `NULL`, a new memory allocation of `value_len` size will be made and initialized to zero.
+* `value_len` is the size of the `value` data in bytes. If `value_len` is zero, no allocation will be done and the dictionary item will permanently have the `NULL` value.
+
+### dictionary_get()
+
+This call is used to get the `value` of an item, given its `name`. It uses the hash table (index) for the lookup.
+
+For **multi-threaded** operation, the `dictionary_get()` call gets a shared read lock on the index lock (multiple readers are allowed). The linked-list lock is not used.
+
+In **clone** mode, the value returned is not guaranteed to be valid, as any other thread may delete the item from the dictionary at any time. To ensure the value will be available, use `dictionary_get_and_acquire_item()`, which uses a reference counter to defer deletes until the item is released with `dictionary_acquired_item_release()`.
+
+The format is:
+
+```c
+value = dictionary_get(dict, name);
+```
+
+Where:
+
+* `dict` is a pointer to the dictionary previously created.
+* `name` is a pointer to a string to be used as the key of this item. The name must not be `NULL` and must not be an empty string `""`.
+
+### dictionary_del()
+
+This call is used to delete an item from the dictionary, given its name.
+
+If there is a deletion callback registered to the dictionary (`dictionary_register_delete_callback()`), it is called prior to the actual deletion of the item.
+
+The format is:
+
+```c
+value = dictionary_del(dict, name);
+```
+
+Where:
+
+* `dict` is a pointer to the dictionary previously created.
+* `name` is a pointer to a string to be used as the key of this item. The name must not be `NULL` and must not be an empty string `""`.
+
+### dictionary_get_and_acquire_item()
+
+This call can be used to search for and acquire a dictionary item, while ensuring that it will be available for use, until `dictionary_acquired_item_release()` is called.
+
+This call **does not return the value** of the dictionary item. It returns an internal pointer to a structure that maintains the reference counter used to protect the actual value. To get the value of the item (the same value as returned by `dictionary_get()`), the function `dictionary_acquired_item_value()` has to be called. 
+
+Example:
+
+```c
+// create the dictionary
+DICTIONARY *dict = dictionary_create(DICT_OPTION_NONE);
+
+// add an item to it
+dictionary_set(dict, "name", "value", 6);
+
+// find the item we added and acquire it
+const DICTIONARY_ITEM *item = dictionary_get_and_acquire_item(dict, "name");
+
+// extract its value
+char *value = (char *)dictionary_acquired_item_value(dict, item);
+
+// now value points to the string "value"
+printf("I got value = '%s'\n", value);
+
+// release the item, so that it can be deleted
+dictionary_acquired_item_release(dict, item);
+
+// destroy the dictionary
+dictionary_destroy(dict);
+```
+
+When items are acquired, a reference counter is maintained to keep track of how many users each item has. If an item with a non-zero number of users is deleted, it is removed from the index, it can be added again to the index (without conflict), and although it still exists in the linked-list, it is not offered during traversal. Garbage collection to actually delete the item happens every time another item is added to or removed from the linked-list, and items are actually deleted only when they have no users.
+
+If any item is still acquired when the dictionary is destroyed, the destruction of the dictionary is also deferred until all the acquired items are released. When the dictionary is destroyed like that, all operations on the dictionary fail (traversals do not traverse, insertions do not insert, deletions do not delete, searches do not find any items, etc). Once the last acquired item is released, the dictionary is automatically destroyed too.
+
+## Traversal
+
+Dictionaries offer 3 ways to traverse the entire dictionary:
+
+- **walkthrough**, implemented by setting a callback function to be called for every item.
+- **sorted walkthrough**, which first sorts the dictionary and then calls a callback function for every item.
+- **foreach**, a way to traverse the dictionary with a for-next loop.
+
+All these methods are available in **read**, **write**, or **reentrant** mode. In **read** mode, only lookups are allowed on the dictionary. In **write** mode, lookups but also insertions and deletions are allowed, and in **reentrant** mode the dictionary is unlocked while execution is outside dictionary code.
+
+### walkthrough (callback)
+
+There are 4 calls:
+
+- `dictionary_walkthrough_read()` and `dictionary_sorted_walkthrough_read()` acquire a shared read lock on the linked-list, and they call a callback function for every item of the dictionary.
+- `dictionary_walkthrough_write()` and `dictionary_sorted_walkthrough_write()` acquire a write lock on the linked-list, and they call a callback function for every item of the dictionary. This is to be used when items need to be added to or removed from the dictionary. The `write` versions can be used to delete any or all the items from the dictionary, including the currently working one. For the `sorted` version, all items in the dictionary maintain a reference counter, so all deletions are deferred until the sorted walkthrough finishes.
+
+The non-sorted versions traverse the items in the same order they have been added to the dictionary (or the reverse order if the flag `DICT_OPTION_ADD_IN_FRONT` is set during dictionary creation). The sorted versions sort the items alphabetically by name and then traverse them in that order.
+
+The callback function returns an `int`. If this value is negative, traversal of the dictionary is stopped immediately and the negative value is returned to the caller. 
If the returned value of all callback calls is zero or positive, the walkthrough functions return the sum of the return values of all callbacks. So, if you just want to know how many items satisfy some condition, write a callback function that returns 1 when the item satisfies the condition and 0 when it does not, and the walkthrough function will return how many items tested positive.
+
+### foreach (for-next loop)
+
+The following is a snippet of such a loop:
+
+```c
+MY_STRUCTURE *x;
+dfe_start_read(dict, x) {
+ printf("hey, I got an item named '%s' with value ptr %08X", x_dfe.name, x);
+}
+dfe_done(x);
+```
+
+The `x` parameter gives the name of the pointer to be used while iterating the items. Any name is accepted. `x` points to the `value` of the item in the dictionary.
+
+`x_dfe` is an object that is automatically created, named by concatenating whatever is given as `x` with `_dfe`. It has a few members, including `x_dfe.name` that provides the name of the current item, `x_dfe.counter` that counts the iterations made so far, and `x_dfe.item` that provides the acquired item from the dictionary, which can be used to pass it over for further processing. Check the header file for more info. So, if you call `dfe_start_read(dict, myvar)`, the object will be named `myvar_dfe`.
+
+Both `dfe_start_read(dict, x)` and `dfe_done(x)` are wrapped together in a `do { ... } while(0)` block, so that the following will work:
+
+```c
+MY_ITEM *item;
+
+if(a == 1)
+ // do {
+ dfe_start_read(dict, x)
+ printf("hey, I got an item named '%s' with value ptr %08X", x_dfe.name, x);
+ dfe_done(x);
+ // } while(0);
+else
+ something_else();
+```
+
+In the above, the `if(a == 1)` condition will work as expected: it will run the foreach loop when `a` is 1, otherwise it will run `something_else()`.
+
+There are 2 versions of `dfe_start`:
+
+- `dfe_start_read()` that acquires a shared read linked-list lock on the dictionary.
+- `dfe_start_write()` that acquires an exclusive write linked-list lock on the dictionary.
+
+While in the loop, depending on the read or write version of `dfe_start`, the caller may look up items in, or manipulate, the dictionary. The rules are the same as for the unsorted walkthrough callback functions.
+
+PS: DFE stands for Dictionary For Each. 
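+
+To make the walkthrough return values above concrete, the following is a minimal sketch of a counting callback. The callback prototype (`item`, `value`, `data`) and the `dictionary_walkthrough_read()` return type shown here are assumptions based on the descriptions in this README - check the header file for the authoritative signatures.
+
+```c
+// returns 1 for items whose value is the string "value", 0 otherwise,
+// so the walkthrough returns the number of matching items
+static int count_matching(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *data __maybe_unused) {
+ const char *v = value;
+ return (v && strcmp(v, "value") == 0) ? 1 : 0;
+}
+
+void count_example(void) {
+ DICTIONARY *dict = dictionary_create(DICT_OPTION_NONE);
+
+ dictionary_set(dict, "name1", "value", 6);
+ dictionary_set(dict, "name2", "other", 6);
+
+ // the sum of the callback return values: 1 for the data above
+ int matches = dictionary_walkthrough_read(dict, count_matching, NULL);
+ printf("%d items matched\n", matches);
+
+ dictionary_destroy(dict);
+}
+```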
diff --git a/src/libnetdata/dictionary/dictionary-callbacks.h b/src/libnetdata/dictionary/dictionary-callbacks.h new file mode 100644 index 00000000..38da3df0 --- /dev/null +++ b/src/libnetdata/dictionary/dictionary-callbacks.h @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_DICTIONARY_CALLBACKS_H +#define NETDATA_DICTIONARY_CALLBACKS_H + +#include "dictionary-internals.h" + +// ---------------------------------------------------------------------------- +// callbacks execution + +static inline void dictionary_execute_insert_callback(DICTIONARY *dict, DICTIONARY_ITEM *item, void *constructor_data) { + if(likely(!dict->hooks || !dict->hooks->insert_callback)) + return; + + if(unlikely(is_view_dictionary(dict))) + fatal("DICTIONARY: called %s() on a view.", __FUNCTION__ ); + + internal_error(false, + "DICTIONARY: Running insert callback on item '%s' of dictionary created from %s() %zu@%s.", + item_get_name(item), + dict->creation_function, + dict->creation_line, + dict->creation_file); + + dict->hooks->insert_callback(item, item->shared->value, constructor_data?constructor_data:dict->hooks->insert_callback_data); + DICTIONARY_STATS_CALLBACK_INSERTS_PLUS1(dict); +} + +static inline bool dictionary_execute_conflict_callback(DICTIONARY *dict, DICTIONARY_ITEM *item, void *new_value, void *constructor_data) { + if(likely(!dict->hooks || !dict->hooks->conflict_callback)) + return false; + + if(unlikely(is_view_dictionary(dict))) + fatal("DICTIONARY: called %s() on a view.", __FUNCTION__ ); + + internal_error(false, + "DICTIONARY: Running conflict callback on item '%s' of dictionary created from %s() %zu@%s.", + item_get_name(item), + dict->creation_function, + dict->creation_line, + dict->creation_file); + + bool ret = dict->hooks->conflict_callback( + item, item->shared->value, new_value, + constructor_data ? constructor_data : dict->hooks->conflict_callback_data); + + DICTIONARY_STATS_CALLBACK_CONFLICTS_PLUS1(dict); + + return ret; +} + +static inline void dictionary_execute_react_callback(DICTIONARY *dict, DICTIONARY_ITEM *item, void *constructor_data) { + if(likely(!dict->hooks || !dict->hooks->react_callback)) + return; + + if(unlikely(is_view_dictionary(dict))) + fatal("DICTIONARY: called %s() on a view.", __FUNCTION__ ); + + internal_error(false, + "DICTIONARY: Running react callback on item '%s' of dictionary created from %s() %zu@%s.", + item_get_name(item), + dict->creation_function, + dict->creation_line, + dict->creation_file); + + dict->hooks->react_callback(item, item->shared->value, + constructor_data?constructor_data:dict->hooks->react_callback_data); + + DICTIONARY_STATS_CALLBACK_REACTS_PLUS1(dict); +} + +static inline void dictionary_execute_delete_callback(DICTIONARY *dict, DICTIONARY_ITEM *item) { + if(likely(!dict->hooks || !dict->hooks->delete_callback)) + return; + + // We may execute delete callback on items deleted from a view, + // because we may have references to it, after the master is gone + // so, the shared structure will remain until the last reference is released. 
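+ // (for this reason, unlike the insert, conflict and react callbacks above,
+ // there is no is_view_dictionary() fatal check here)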
+ + internal_error(false, + "DICTIONARY: Running delete callback on item '%s' of dictionary created from %s() %zu@%s.", + item_get_name(item), + dict->creation_function, + dict->creation_line, + dict->creation_file); + + dict->hooks->delete_callback(item, item->shared->value, dict->hooks->delelte_callback_data); + + DICTIONARY_STATS_CALLBACK_DELETES_PLUS1(dict); +} + + +#endif //NETDATA_DICTIONARY_CALLBACKS_H diff --git a/src/libnetdata/dictionary/dictionary-hashtable.h b/src/libnetdata/dictionary/dictionary-hashtable.h new file mode 100644 index 00000000..865f0b36 --- /dev/null +++ b/src/libnetdata/dictionary/dictionary-hashtable.h @@ -0,0 +1,263 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_DICTIONARY_HASHTABLE_H +#define NETDATA_DICTIONARY_HASHTABLE_H + +#include "dictionary-internals.h" + +// ---------------------------------------------------------------------------- +// hashtable operations with simple hashtable + +static inline bool compare_keys(void *key1, void *key2) { + const char *k1 = key1; + const char *k2 = key2; + return strcmp(k1, k2) == 0; +} + +static inline void *item_to_key(DICTIONARY_ITEM *item) { + return (void *)item_get_name(item); +} + +#define SIMPLE_HASHTABLE_VALUE_TYPE DICTIONARY_ITEM +#define SIMPLE_HASHTABLE_NAME _DICTIONARY +#define SIMPLE_HASHTABLE_VALUE2KEY_FUNCTION item_to_key +#define SIMPLE_HASHTABLE_COMPARE_KEYS_FUNCTION compare_keys +#include "..//simple_hashtable.h" + +static inline size_t hashtable_init_hashtable(DICTIONARY *dict) { + SIMPLE_HASHTABLE_DICTIONARY *ht = callocz(1, sizeof(*ht)); + simple_hashtable_init_DICTIONARY(ht, 4); + dict->index.JudyHSArray = ht; + return 0; +} + +static inline size_t hashtable_destroy_hashtable(DICTIONARY *dict) { + SIMPLE_HASHTABLE_DICTIONARY *ht = dict->index.JudyHSArray; + if(unlikely(!ht)) return 0; + + size_t mem = sizeof(*ht) + ht->size * sizeof(SIMPLE_HASHTABLE_SLOT_DICTIONARY); + simple_hashtable_destroy_DICTIONARY(ht); + freez(ht); + dict->index.JudyHSArray = NULL; + + return mem; +} + +static inline void *hashtable_insert_hashtable(DICTIONARY *dict, const char *name, size_t name_len) { + SIMPLE_HASHTABLE_DICTIONARY *ht = dict->index.JudyHSArray; + + char key[name_len+1]; + memcpy(key, name, name_len); + key[name_len] = '\0'; + + XXH64_hash_t hash = XXH3_64bits(name, name_len); + SIMPLE_HASHTABLE_SLOT_DICTIONARY *sl = simple_hashtable_get_slot_DICTIONARY(ht, hash, key, true); + sl->hash = hash; // we will need it in insert later - it is ok to overwrite - it is the same already + return sl; +} + +static inline DICTIONARY_ITEM *hashtable_insert_handle_to_item_hashtable(DICTIONARY *dict, void *handle) { + (void)dict; + SIMPLE_HASHTABLE_SLOT_DICTIONARY *sl = handle; + DICTIONARY_ITEM *item = SIMPLE_HASHTABLE_SLOT_DATA(sl); + return item; +} + +static inline void hashtable_set_item_hashtable(DICTIONARY *dict, void *handle, DICTIONARY_ITEM *item) { + SIMPLE_HASHTABLE_DICTIONARY *ht = dict->index.JudyHSArray; + SIMPLE_HASHTABLE_SLOT_DICTIONARY *sl = handle; + simple_hashtable_set_slot_DICTIONARY(ht, sl, sl->hash, item); +} + +static inline int hashtable_delete_hashtable(DICTIONARY *dict, const char *name, size_t name_len, DICTIONARY_ITEM *item_to_delete) { + (void)item_to_delete; + SIMPLE_HASHTABLE_DICTIONARY *ht = dict->index.JudyHSArray; + + char key[name_len+1]; + memcpy(key, name, name_len); + key[name_len] = '\0'; + + XXH64_hash_t hash = XXH3_64bits(name, name_len); + SIMPLE_HASHTABLE_SLOT_DICTIONARY *sl = simple_hashtable_get_slot_DICTIONARY(ht, hash, key, false); + DICTIONARY_ITEM 
*item = SIMPLE_HASHTABLE_SLOT_DATA(sl); + if(!item) return 0; // return not-found + + simple_hashtable_del_slot_DICTIONARY(ht, sl); + return 1; // return deleted +} + +static inline DICTIONARY_ITEM *hashtable_get_hashtable(DICTIONARY *dict, const char *name, size_t name_len) { + SIMPLE_HASHTABLE_DICTIONARY *ht = dict->index.JudyHSArray; + if(unlikely(!ht)) return NULL; + + char key[name_len+1]; + memcpy(key, name, name_len); + key[name_len] = '\0'; + + XXH64_hash_t hash = XXH3_64bits(name, name_len); + SIMPLE_HASHTABLE_SLOT_DICTIONARY *sl = simple_hashtable_get_slot_DICTIONARY(ht, hash, key, true); + return SIMPLE_HASHTABLE_SLOT_DATA(sl); +} + +// ---------------------------------------------------------------------------- +// hashtable operations with Judy + +static inline size_t hashtable_init_judy(DICTIONARY *dict) { + dict->index.JudyHSArray = NULL; + return 0; +} + +static inline size_t hashtable_destroy_judy(DICTIONARY *dict) { + if(unlikely(!dict->index.JudyHSArray)) return 0; + + pointer_destroy_index(dict); + + JError_t J_Error; + Word_t ret = JudyHSFreeArray(&dict->index.JudyHSArray, &J_Error); + if(unlikely(ret == (Word_t) JERR)) { + netdata_log_error("DICTIONARY: Cannot destroy JudyHS, JU_ERRNO_* == %u, ID == %d", + JU_ERRNO(&J_Error), JU_ERRID(&J_Error)); + } + + netdata_log_debug(D_DICTIONARY, "Dictionary: hash table freed %lu bytes", ret); + + dict->index.JudyHSArray = NULL; + return (size_t)ret; +} + +static inline void *hashtable_insert_judy(DICTIONARY *dict, const char *name, size_t name_len) { + JError_t J_Error; + Pvoid_t *Rc = JudyHSIns(&dict->index.JudyHSArray, (void *)name, name_len, &J_Error); + if (unlikely(Rc == PJERR)) { + netdata_log_error("DICTIONARY: Cannot insert entry with name '%s' to JudyHS, JU_ERRNO_* == %u, ID == %d", + name, JU_ERRNO(&J_Error), JU_ERRID(&J_Error)); + } + + // if *Rc == 0, new item added to the array + // otherwise the existing item value is returned in *Rc + + // we return a pointer to a pointer, so that the caller can + // put anything needed at the value of the index. + // The pointer to pointer we return has to be used before + // any other operation that may change the index (insert/delete). + return (void *)Rc; +} + +static inline DICTIONARY_ITEM *hashtable_insert_handle_to_item_judy(DICTIONARY *dict, void *handle) { + (void)dict; + DICTIONARY_ITEM **item_pptr = handle; + return *item_pptr; +} + +static inline void hashtable_set_item_judy(DICTIONARY *dict, void *handle, DICTIONARY_ITEM *item) { + (void)dict; + DICTIONARY_ITEM **item_pptr = handle; + *item_pptr = item; +} + +static inline int hashtable_delete_judy(DICTIONARY *dict, const char *name, size_t name_len, DICTIONARY_ITEM *item) { + (void)item; + if(unlikely(!dict->index.JudyHSArray)) return 0; + + JError_t J_Error; + int ret = JudyHSDel(&dict->index.JudyHSArray, (void *)name, name_len, &J_Error); + if(unlikely(ret == JERR)) { + netdata_log_error("DICTIONARY: Cannot delete entry with name '%s' from JudyHS, JU_ERRNO_* == %u, ID == %d", + name, + JU_ERRNO(&J_Error), JU_ERRID(&J_Error)); + return 0; + } + + // Hey, this is problematic! We need the value back, not just an int with a status! 
+ // https://sourceforge.net/p/judy/feature-requests/23/ + + if(unlikely(ret == 0)) { + // not found in the dictionary + return 0; + } + else { + // found and deleted from the dictionary + return 1; + } +} + +static inline DICTIONARY_ITEM *hashtable_get_judy(DICTIONARY *dict, const char *name, size_t name_len) { + if(unlikely(!dict->index.JudyHSArray)) return NULL; + + Pvoid_t *Rc; + Rc = JudyHSGet(dict->index.JudyHSArray, (void *)name, name_len); + if(likely(Rc)) { + // found in the hash table + pointer_check(dict, (DICTIONARY_ITEM *)*Rc); + return (DICTIONARY_ITEM *)*Rc; + } + else { + // not found in the hash table + return NULL; + } +} + +// -------------------------------------------------------------------------------------------------------------------- +// select the right hashtable + +static inline size_t hashtable_init_unsafe(DICTIONARY *dict) { + if(dict->options & DICT_OPTION_INDEX_JUDY) + return hashtable_init_judy(dict); + else + return hashtable_init_hashtable(dict); +} + +static inline size_t hashtable_destroy_unsafe(DICTIONARY *dict) { + pointer_destroy_index(dict); + + if(dict->options & DICT_OPTION_INDEX_JUDY) + return hashtable_destroy_judy(dict); + else + return hashtable_destroy_hashtable(dict); +} + +static inline void *hashtable_insert_unsafe(DICTIONARY *dict, const char *name, size_t name_len) { + if(dict->options & DICT_OPTION_INDEX_JUDY) + return hashtable_insert_judy(dict, name, name_len); + else + return hashtable_insert_hashtable(dict, name, name_len); +} + +static inline DICTIONARY_ITEM *hashtable_insert_handle_to_item_unsafe(DICTIONARY *dict, void *handle) { + if(dict->options & DICT_OPTION_INDEX_JUDY) + return hashtable_insert_handle_to_item_judy(dict, handle); + else + return hashtable_insert_handle_to_item_hashtable(dict, handle); +} + +static inline int hashtable_delete_unsafe(DICTIONARY *dict, const char *name, size_t name_len, DICTIONARY_ITEM *item) { + if(dict->options & DICT_OPTION_INDEX_JUDY) + return hashtable_delete_judy(dict, name, name_len, item); + else + return hashtable_delete_hashtable(dict, name, name_len, item); +} + +static inline DICTIONARY_ITEM *hashtable_get_unsafe(DICTIONARY *dict, const char *name, size_t name_len) { + DICTIONARY_STATS_SEARCHES_PLUS1(dict); + + DICTIONARY_ITEM *item; + + if(dict->options & DICT_OPTION_INDEX_JUDY) + item = hashtable_get_judy(dict, name, name_len); + else + item = hashtable_get_hashtable(dict, name, name_len); + + if(item) + pointer_check(dict, item); + + return item; +} + +static inline void hashtable_set_item_unsafe(DICTIONARY *dict, void *handle, DICTIONARY_ITEM *item) { + if(dict->options & DICT_OPTION_INDEX_JUDY) + hashtable_set_item_judy(dict, handle, item); + else + hashtable_set_item_hashtable(dict, handle, item); +} + +#endif //NETDATA_DICTIONARY_HASHTABLE_H diff --git a/src/libnetdata/dictionary/dictionary-internals.h b/src/libnetdata/dictionary/dictionary-internals.h new file mode 100644 index 00000000..54e59564 --- /dev/null +++ b/src/libnetdata/dictionary/dictionary-internals.h @@ -0,0 +1,259 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_DICTIONARY_INTERNALS_H +#define NETDATA_DICTIONARY_INTERNALS_H + +#define DICTIONARY_INTERNALS +#include "../libnetdata.h" + +// runtime flags of the dictionary - must be checked with atomics +typedef enum __attribute__ ((__packed__)) { + DICT_FLAG_NONE = 0, + DICT_FLAG_DESTROYED = (1 << 0), // this dictionary has been destroyed +} DICT_FLAGS; + +#define dict_flag_check(dict, flag) (__atomic_load_n(&((dict)->flags), __ATOMIC_RELAXED) 
& (flag)) +#define dict_flag_set(dict, flag) __atomic_or_fetch(&((dict)->flags), flag, __ATOMIC_RELAXED) +#define dict_flag_clear(dict, flag) __atomic_and_fetch(&((dict)->flags), ~(flag), __ATOMIC_RELAXED) + +// flags macros +#define is_dictionary_destroyed(dict) dict_flag_check(dict, DICT_FLAG_DESTROYED) + +// configuration options macros +#define is_dictionary_single_threaded(dict) ((dict)->options & DICT_OPTION_SINGLE_THREADED) +#define is_view_dictionary(dict) ((dict)->master) +#define is_master_dictionary(dict) (!is_view_dictionary(dict)) + +typedef enum __attribute__ ((__packed__)) item_options { + ITEM_OPTION_NONE = 0, + ITEM_OPTION_ALLOCATED_NAME = (1 << 0), // the name pointer is a STRING + + // IMPORTANT: This is 1-bit - to add more change ITEM_OPTIONS_BITS +} ITEM_OPTIONS; + +typedef enum __attribute__ ((__packed__)) item_flags { + ITEM_FLAG_NONE = 0, + ITEM_FLAG_DELETED = (1 << 0), // this item is marked deleted, so it is not available for traversal (deleted from the index too) + ITEM_FLAG_BEING_CREATED = (1 << 1), // this item is currently being created - this flag is removed when construction finishes + + // IMPORTANT: This is 8-bit +} ITEM_FLAGS; + +#define item_flag_check(item, flag) (__atomic_load_n(&((item)->flags), __ATOMIC_RELAXED) & (flag)) +#define item_flag_set(item, flag) __atomic_or_fetch(&((item)->flags), flag, __ATOMIC_RELAXED) +#define item_flag_clear(item, flag) __atomic_and_fetch(&((item)->flags), ~(flag), __ATOMIC_RELAXED) + +#define item_shared_flag_check(item, flag) (__atomic_load_n(&((item)->shared->flags), __ATOMIC_RELAXED) & (flag)) +#define item_shared_flag_set(item, flag) __atomic_or_fetch(&((item)->shared->flags), flag, __ATOMIC_RELAXED) +#define item_shared_flag_clear(item, flag) __atomic_and_fetch(&((item)->shared->flags), ~(flag), __ATOMIC_RELAXED) + +#define REFCOUNT_DELETING (-100) + +#define ITEM_FLAGS_TYPE uint8_t +#define KEY_LEN_TYPE uint32_t +#define VALUE_LEN_TYPE uint32_t + +#define ITEM_OPTIONS_BITS 1 +#define KEY_LEN_BITS ((sizeof(KEY_LEN_TYPE) * 8) - (sizeof(ITEM_FLAGS_TYPE) * 8) - ITEM_OPTIONS_BITS) +#define KEY_LEN_MAX ((1 << KEY_LEN_BITS) - 1) + +#define VALUE_LEN_BITS ((sizeof(VALUE_LEN_TYPE) * 8) - (sizeof(ITEM_FLAGS_TYPE) * 8)) +#define VALUE_LEN_MAX ((1 << VALUE_LEN_BITS) - 1) + + +/* + * Every item in the dictionary has the following structure. + */ + +typedef int32_t REFCOUNT; + +typedef struct dictionary_item_shared { + void *value; // the value of the dictionary item + + // the order of the following items is important! + // The total of their storage should be 64-bits + + REFCOUNT links; // how many links this item has + VALUE_LEN_TYPE value_len:VALUE_LEN_BITS; // the size of the value + ITEM_FLAGS_TYPE flags; // shared flags +} DICTIONARY_ITEM_SHARED; + +struct dictionary_item { +#ifdef NETDATA_INTERNAL_CHECKS + DICTIONARY *dict; + pid_t creator_pid; + pid_t deleter_pid; + pid_t ll_adder_pid; + pid_t ll_remover_pid; +#endif + + DICTIONARY_ITEM_SHARED *shared; + + struct dictionary_item *next; // a double linked list to allow fast insertions and deletions + struct dictionary_item *prev; + + union { + STRING *string_name; // the name of the dictionary item + char *caller_name; // the user supplied string pointer + // void *key_ptr; // binary key pointer + }; + + // the order of the following items is important! 
+ // The total of their storage should be 64-bits + + REFCOUNT refcount; // the private reference counter + + KEY_LEN_TYPE key_len:KEY_LEN_BITS; // the size of key indexed (for strings, including the null terminator) + // this is (2^23 - 1) = 8.388.607 bytes max key length. + + ITEM_OPTIONS options:ITEM_OPTIONS_BITS; // permanent configuration options + // (no atomic operations on this - they never change) + + ITEM_FLAGS_TYPE flags; // runtime changing flags for this item (atomic operations on this) + // cannot be a bit field because of atomics. +}; + +struct dictionary_hooks { + REFCOUNT links; + usec_t last_master_deletion_us; + + dict_cb_insert_t insert_callback; + void *insert_callback_data; + + dict_cb_conflict_t conflict_callback; + void *conflict_callback_data; + + dict_cb_react_t react_callback; + void *react_callback_data; + + dict_cb_delete_t delete_callback; + void *delelte_callback_data; +}; + +struct dictionary { +#ifdef NETDATA_INTERNAL_CHECKS + const char *creation_function; + const char *creation_file; + size_t creation_line; + pid_t creation_tid; +#endif + + usec_t last_gc_run_us; + DICT_OPTIONS options; // the configuration flags of the dictionary (they never change - no atomics) + DICT_FLAGS flags; // run time flags for the dictionary (they change all the time - atomics needed) + + ARAL *value_aral; + + struct { // support for multiple indexing engines + Pvoid_t JudyHSArray; // the hash table + RW_SPINLOCK rw_spinlock; // protect the index + } index; + + struct { + DICTIONARY_ITEM *list; // the double linked list of all items in the dictionary + RW_SPINLOCK rw_spinlock; // protect the linked-list + pid_t writer_pid; // the gettid() of the writer + uint32_t writer_depth; // nesting of write locks + } items; + + struct dictionary_hooks *hooks; // pointer to external function callbacks to be called at certain points + struct dictionary_stats *stats; // statistics data, when DICT_OPTION_STATS is set + + DICTIONARY *master; // the master dictionary + DICTIONARY *next; // linked list for delayed destruction (garbage collection of whole dictionaries) + + uint32_t version; // the current version of the dictionary + // it is incremented when: + // - item added + // - item removed + // - item value reset + // - conflict callback returns true + // - function dictionary_version_increment() is called + + int32_t entries; // how many items are currently in the index (the linked list may have more) + int32_t referenced_items; // how many items of the dictionary are currently being used by 3rd parties + int32_t pending_deletion_items; // how many items of the dictionary have been deleted, but have not been removed yet + +#ifdef NETDATA_DICTIONARY_VALIDATE_POINTERS + netdata_mutex_t global_pointer_registry_mutex; + Pvoid_t global_pointer_registry; +#endif +}; + +// ---------------------------------------------------------------------------- +// forward definitions of functions used in reverse order in the code + +void garbage_collect_pending_deletes(DICTIONARY *dict); +static inline void item_linked_list_remove(DICTIONARY *dict, DICTIONARY_ITEM *item); +static size_t dict_item_free_with_hooks(DICTIONARY *dict, DICTIONARY_ITEM *item); +static inline const char *item_get_name(const DICTIONARY_ITEM *item); +static inline int hashtable_delete_unsafe(DICTIONARY *dict, const char *name, size_t name_len, DICTIONARY_ITEM *item); +static void item_release(DICTIONARY *dict, DICTIONARY_ITEM *item); +static bool dict_item_set_deleted(DICTIONARY *dict, DICTIONARY_ITEM *item); + +#define RC_ITEM_OK ( 
0)
+#define RC_ITEM_MARKED_FOR_DELETION (-1) // the item is marked for deletion
+#define RC_ITEM_IS_CURRENTLY_BEING_DELETED (-2) // the item is currently being deleted
+#define RC_ITEM_IS_CURRENTLY_BEING_CREATED (-3) // the item is currently being created
+#define RC_ITEM_IS_REFERENCED (-4) // the item is currently referenced
+#define item_check_and_acquire(dict, item) (item_check_and_acquire_advanced(dict, item, false) == RC_ITEM_OK)
+static int item_check_and_acquire_advanced(DICTIONARY *dict, DICTIONARY_ITEM *item, bool having_index_lock);
+#define item_is_not_referenced_and_can_be_removed(dict, item) (item_is_not_referenced_and_can_be_removed_advanced(dict, item) == RC_ITEM_OK)
+static inline int item_is_not_referenced_and_can_be_removed_advanced(DICTIONARY *dict, DICTIONARY_ITEM *item);
+
+// ----------------------------------------------------------------------------
+// validate each pointer is indexed once - internal checks only
+
+#ifdef NETDATA_DICTIONARY_VALIDATE_POINTERS
+static inline void pointer_index_init(DICTIONARY *dict __maybe_unused) {
+ netdata_mutex_init(&dict->global_pointer_registry_mutex);
+}
+
+static inline void pointer_destroy_index(DICTIONARY *dict __maybe_unused) {
+ netdata_mutex_lock(&dict->global_pointer_registry_mutex);
+ JudyHSFreeArray(&dict->global_pointer_registry, PJE0);
+ netdata_mutex_unlock(&dict->global_pointer_registry_mutex);
+}
+static inline void pointer_add(DICTIONARY *dict __maybe_unused, DICTIONARY_ITEM *item __maybe_unused) {
+ netdata_mutex_lock(&dict->global_pointer_registry_mutex);
+ Pvoid_t *PValue = JudyHSIns(&dict->global_pointer_registry, &item, sizeof(void *), PJE0);
+ if(*PValue != NULL)
+ fatal("pointer already exists in registry");
+ *PValue = item;
+ netdata_mutex_unlock(&dict->global_pointer_registry_mutex);
+}
+
+static inline void pointer_check(DICTIONARY *dict __maybe_unused, DICTIONARY_ITEM *item __maybe_unused) {
+ netdata_mutex_lock(&dict->global_pointer_registry_mutex);
+ Pvoid_t *PValue = JudyHSGet(dict->global_pointer_registry, &item, sizeof(void *));
+ if(PValue == NULL)
+ fatal("pointer is not found in registry");
+ netdata_mutex_unlock(&dict->global_pointer_registry_mutex);
+}
+
+static inline void pointer_del(DICTIONARY *dict __maybe_unused, DICTIONARY_ITEM *item __maybe_unused) {
+ netdata_mutex_lock(&dict->global_pointer_registry_mutex);
+ int ret = JudyHSDel(&dict->global_pointer_registry, &item, sizeof(void *), PJE0);
+ if(!ret)
+ fatal("pointer to be deleted does not exist in registry");
+ netdata_mutex_unlock(&dict->global_pointer_registry_mutex);
+}
+#else // !NETDATA_DICTIONARY_VALIDATE_POINTERS
+#define pointer_index_init(dict) debug_dummy()
+#define pointer_destroy_index(dict) debug_dummy()
+#define pointer_add(dict, item) debug_dummy()
+#define pointer_check(dict, item) debug_dummy()
+#define pointer_del(dict, item) debug_dummy()
+#endif // !NETDATA_DICTIONARY_VALIDATE_POINTERS
+
+extern ARAL *dict_items_aral;
+extern ARAL *dict_shared_items_aral;
+
+#include "dictionary-statistics.h"
+#include "dictionary-locks.h"
+#include "dictionary-refcount.h"
+#include "dictionary-hashtable.h"
+#include "dictionary-callbacks.h"
+#include "dictionary-item.h"
+
+#endif //NETDATA_DICTIONARY_INTERNALS_H
diff --git a/src/libnetdata/dictionary/dictionary-item.h b/src/libnetdata/dictionary/dictionary-item.h new file mode 100644 index 00000000..f7c6e47a --- /dev/null +++ b/src/libnetdata/dictionary/dictionary-item.h @@ -0,0 +1,555 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef NETDATA_DICTIONARY_ITEM_H 
+#define NETDATA_DICTIONARY_ITEM_H + +#include "dictionary-internals.h" + +// ---------------------------------------------------------------------------- +// ITEM initialization and updates + +static inline size_t item_set_name(DICTIONARY *dict, DICTIONARY_ITEM *item, const char *name, size_t name_len) { + if(likely(dict->options & DICT_OPTION_NAME_LINK_DONT_CLONE)) { + item->caller_name = (char *)name; + item->key_len = name_len; + } + else { + item->string_name = string_strdupz(name); + item->key_len = string_strlen(item->string_name); + item->options |= ITEM_OPTION_ALLOCATED_NAME; + } + + return item->key_len; +} + +static inline size_t item_free_name(DICTIONARY *dict, DICTIONARY_ITEM *item) { + if(likely(!(dict->options & DICT_OPTION_NAME_LINK_DONT_CLONE))) + string_freez(item->string_name); + + return item->key_len; +} + +static inline const char *item_get_name(const DICTIONARY_ITEM *item) { + if(item->options & ITEM_OPTION_ALLOCATED_NAME) + return string2str(item->string_name); + else + return item->caller_name; +} + +static inline size_t item_get_name_len(const DICTIONARY_ITEM *item) { + if(item->options & ITEM_OPTION_ALLOCATED_NAME) + return string_strlen(item->string_name); + else + return strlen(item->caller_name); +} + +// ---------------------------------------------------------------------------- + +static inline DICTIONARY_ITEM *dict_item_create(DICTIONARY *dict __maybe_unused, size_t *allocated_bytes, DICTIONARY_ITEM *master_item) { + DICTIONARY_ITEM *item; + + size_t size = sizeof(DICTIONARY_ITEM); + item = aral_mallocz(dict_items_aral); + memset(item, 0, sizeof(DICTIONARY_ITEM)); + +#ifdef NETDATA_INTERNAL_CHECKS + item->creator_pid = gettid_cached(); +#endif + + item->refcount = 1; + item->flags = ITEM_FLAG_BEING_CREATED; + + *allocated_bytes += size; + + if(master_item) { + item->shared = master_item->shared; + + if(unlikely(__atomic_add_fetch(&item->shared->links, 1, __ATOMIC_ACQUIRE) <= 1)) + fatal("DICTIONARY: attempted to link to a shared item structure that had zero references"); + } + else { + size = sizeof(DICTIONARY_ITEM_SHARED); + item->shared = aral_mallocz(dict_shared_items_aral); + memset(item->shared, 0, sizeof(DICTIONARY_ITEM_SHARED)); + + item->shared->links = 1; + *allocated_bytes += size; + } + +#ifdef NETDATA_INTERNAL_CHECKS + item->dict = dict; +#endif + return item; +} + +static inline void *dict_item_value_mallocz(DICTIONARY *dict, size_t value_len) { + if(dict->value_aral) { + internal_fatal(aral_element_size(dict->value_aral) != value_len, + "DICTIONARY: item value size %zu does not match the configured fixed one %zu", + value_len, aral_element_size(dict->value_aral)); + return aral_mallocz(dict->value_aral); + } + else + return mallocz(value_len); +} + +static inline void dict_item_value_freez(DICTIONARY *dict, void *ptr) { + if(dict->value_aral) + aral_freez(dict->value_aral, ptr); + else + freez(ptr); +} + +static inline void *dict_item_value_create(DICTIONARY *dict, void *value, size_t value_len) { + void *ptr = NULL; + + if(likely(value_len)) { + if (likely(value)) { + // a value has been supplied + // copy it + ptr = dict_item_value_mallocz(dict, value_len); + memcpy(ptr, value, value_len); + } + else { + // no value has been supplied + // allocate a clear memory block + ptr = dict_item_value_mallocz(dict, value_len); + memset(ptr, 0, value_len); + } + } + // else + // the caller wants an item without any value + + return ptr; +} + +static inline DICTIONARY_ITEM *dict_item_create_with_hooks(DICTIONARY *dict, const char *name, size_t 
name_len, void *value, size_t value_len, void *constructor_data, DICTIONARY_ITEM *master_item) { +#ifdef NETDATA_INTERNAL_CHECKS + if(unlikely(name_len > KEY_LEN_MAX)) + fatal("DICTIONARY: tried to index a key of size %zu, but the maximum acceptable is %zu", name_len, (size_t)KEY_LEN_MAX); + + if(unlikely(value_len > VALUE_LEN_MAX)) + fatal("DICTIONARY: tried to add an item of size %zu, but the maximum acceptable is %zu", value_len, (size_t)VALUE_LEN_MAX); +#endif + + size_t item_size = 0, key_size = 0, value_size = 0; + + DICTIONARY_ITEM *item = dict_item_create(dict, &item_size, master_item); + key_size += item_set_name(dict, item, name, name_len); + + if(unlikely(is_view_dictionary(dict))) { + // we are on a view dictionary + // do not touch the value + ; + +#ifdef NETDATA_INTERNAL_CHECKS + if(unlikely(!master_item)) + fatal("DICTIONARY: cannot add an item to a view without a master item."); +#endif + } + else { + // we are on the master dictionary + + if(unlikely(dict->options & DICT_OPTION_VALUE_LINK_DONT_CLONE)) + item->shared->value = value; + else + item->shared->value = dict_item_value_create(dict, value, value_len); + + item->shared->value_len = value_len; + value_size += value_len; + + dictionary_execute_insert_callback(dict, item, constructor_data); + } + + DICTIONARY_ENTRIES_PLUS1(dict); + DICTIONARY_STATS_PLUS_MEMORY(dict, key_size, item_size, value_size); + + return item; +} + +static inline void dict_item_reset_value_with_hooks(DICTIONARY *dict, DICTIONARY_ITEM *item, void *value, size_t value_len, void *constructor_data) { + if(unlikely(is_view_dictionary(dict))) + fatal("DICTIONARY: %s() should never be called on views.", __FUNCTION__ ); + + netdata_log_debug(D_DICTIONARY, "Dictionary entry with name '%s' found. Changing its value.", item_get_name(item)); + + DICTIONARY_VALUE_RESETS_PLUS1(dict); + + if(item->shared->value_len != value_len) { + DICTIONARY_STATS_PLUS_MEMORY(dict, 0, 0, value_len); + DICTIONARY_STATS_MINUS_MEMORY(dict, 0, 0, item->shared->value_len); + } + + dictionary_execute_delete_callback(dict, item); + + if(likely(dict->options & DICT_OPTION_VALUE_LINK_DONT_CLONE)) { + netdata_log_debug(D_DICTIONARY, "Dictionary: linking value to '%s'", item_get_name(item)); + item->shared->value = value; + item->shared->value_len = value_len; + } + else { + netdata_log_debug(D_DICTIONARY, "Dictionary: cloning value to '%s'", item_get_name(item)); + + void *old_value = item->shared->value; + void *new_value = NULL; + if(value_len) { + new_value = dict_item_value_mallocz(dict, value_len); + if(value) memcpy(new_value, value, value_len); + else memset(new_value, 0, value_len); + } + item->shared->value = new_value; + item->shared->value_len = value_len; + + netdata_log_debug(D_DICTIONARY, "Dictionary: freeing old value of '%s'", item_get_name(item)); + dict_item_value_freez(dict, old_value); + } + + dictionary_execute_insert_callback(dict, item, constructor_data); +} + +static inline size_t dict_item_free_with_hooks(DICTIONARY *dict, DICTIONARY_ITEM *item) { + netdata_log_debug(D_DICTIONARY, "Destroying name value entry for name '%s'.", item_get_name(item)); + + if(!item_flag_check(item, ITEM_FLAG_DELETED)) + DICTIONARY_ENTRIES_MINUS1(dict); + + size_t item_size = 0, key_size = 0, value_size = 0; + + key_size += item->key_len; + if(unlikely(!(dict->options & DICT_OPTION_NAME_LINK_DONT_CLONE))) + item_free_name(dict, item); + + if(item_shared_release_and_check_if_it_can_be_freed(dict, item)) { + dictionary_execute_delete_callback(dict, item); + + 
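+ // the delete callback has already run above; free the value only if the
+ // dictionary owns a clone of it, then release the shared structure itself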
if(unlikely(!(dict->options & DICT_OPTION_VALUE_LINK_DONT_CLONE))) { + netdata_log_debug(D_DICTIONARY, "Dictionary freeing value of '%s'", item_get_name(item)); + dict_item_value_freez(dict, item->shared->value); + item->shared->value = NULL; + } + value_size += item->shared->value_len; + + aral_freez(dict_shared_items_aral, item->shared); + item->shared = NULL; + item_size += sizeof(DICTIONARY_ITEM_SHARED); + } + + aral_freez(dict_items_aral, item); + + item_size += sizeof(DICTIONARY_ITEM); + + DICTIONARY_STATS_MINUS_MEMORY(dict, key_size, item_size, value_size); + + // we return the memory we actually freed + return item_size + ((dict->options & DICT_OPTION_VALUE_LINK_DONT_CLONE) ? 0 : value_size); +} + +// ---------------------------------------------------------------------------- +// linked list management + +static inline void item_linked_list_add(DICTIONARY *dict, DICTIONARY_ITEM *item) { + ll_recursive_lock(dict, DICTIONARY_LOCK_WRITE); + + if(dict->options & DICT_OPTION_ADD_IN_FRONT) + DOUBLE_LINKED_LIST_PREPEND_ITEM_UNSAFE(dict->items.list, item, prev, next); + else + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(dict->items.list, item, prev, next); + +#ifdef NETDATA_INTERNAL_CHECKS + item->ll_adder_pid = gettid_cached(); +#endif + + // clear the BEING created flag, + // after it has been inserted into the linked list + item_flag_clear(item, ITEM_FLAG_BEING_CREATED); + + garbage_collect_pending_deletes(dict); + ll_recursive_unlock(dict, DICTIONARY_LOCK_WRITE); +} + +static inline void item_linked_list_remove(DICTIONARY *dict, DICTIONARY_ITEM *item) { + ll_recursive_lock(dict, DICTIONARY_LOCK_WRITE); + + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(dict->items.list, item, prev, next); + +#ifdef NETDATA_INTERNAL_CHECKS + item->ll_remover_pid = gettid_cached(); +#endif + + garbage_collect_pending_deletes(dict); + ll_recursive_unlock(dict, DICTIONARY_LOCK_WRITE); +} + +// ---------------------------------------------------------------------------- +// item operations + +static inline void dict_item_shared_set_deleted(DICTIONARY *dict, DICTIONARY_ITEM *item) { + if(is_master_dictionary(dict)) { + item_shared_flag_set(item, ITEM_FLAG_DELETED); + + if(dict->hooks) + __atomic_store_n(&dict->hooks->last_master_deletion_us, now_realtime_usec(), __ATOMIC_RELAXED); + } +} + +// returns true if we set the deleted flag on this item +static inline bool dict_item_set_deleted(DICTIONARY *dict, DICTIONARY_ITEM *item) { + ITEM_FLAGS expected, desired; + + expected = __atomic_load_n(&item->flags, __ATOMIC_RELAXED); + + do { + + if (expected & ITEM_FLAG_DELETED) + return false; + + desired = expected | ITEM_FLAG_DELETED; + + } while(!__atomic_compare_exchange_n(&item->flags, &expected, desired, false, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)); + + DICTIONARY_ENTRIES_MINUS1(dict); + return true; +} + +static inline void dict_item_free_or_mark_deleted(DICTIONARY *dict, DICTIONARY_ITEM *item) { + int rc = item_is_not_referenced_and_can_be_removed_advanced(dict, item); + switch(rc) { + case RC_ITEM_OK: + // the item is ours, refcount set to -100 + dict_item_shared_set_deleted(dict, item); + item_linked_list_remove(dict, item); + dict_item_free_with_hooks(dict, item); + break; + + case RC_ITEM_IS_REFERENCED: + case RC_ITEM_IS_CURRENTLY_BEING_CREATED: + // the item is currently referenced by others + dict_item_shared_set_deleted(dict, item); + dict_item_set_deleted(dict, item); + // after this point do not touch the item + break; + + case RC_ITEM_IS_CURRENTLY_BEING_DELETED: + // an item that is currently being deleted by 
someone else - don't touch it + break; + + default: + internal_error(true, "Hey dev! You forgot to add the new condition here!"); + break; + } +} + +// this is used by traversal functions to remove the current item +// if it is deleted, and it has zero references. This will eliminate +// the need for the garbage collector to kick-in later. +// Most deletions happen during traversal, so this is a nice hack +// to speed up everything! +static inline void dict_item_release_and_check_if_it_is_deleted_and_can_be_removed_under_this_lock_mode(DICTIONARY *dict, DICTIONARY_ITEM *item, char rw) { + if(rw == DICTIONARY_LOCK_WRITE) { + bool should_be_deleted = item_flag_check(item, ITEM_FLAG_DELETED); + + item_release(dict, item); + + if(should_be_deleted && item_is_not_referenced_and_can_be_removed(dict, item)) { + // this has to be before removing from the linked list, + // otherwise the garbage collector will also kick in! + DICTIONARY_PENDING_DELETES_MINUS1(dict); + + item_linked_list_remove(dict, item); + dict_item_free_with_hooks(dict, item); + } + } + else { + // we can't do anything under this mode + item_release(dict, item); + } +} + +static inline bool dict_item_del(DICTIONARY *dict, const char *name, ssize_t name_len) { + if(name_len == -1) + name_len = (ssize_t)strlen(name); + + netdata_log_debug(D_DICTIONARY, "DEL dictionary entry with name '%s'.", name); + + // Unfortunately, the JudyHSDel() does not return the value of the + // item that was deleted, so we have to find it before we delete it, + // since we need to release our structures too. + + dictionary_index_lock_wrlock(dict); + + int ret; + DICTIONARY_ITEM *item = hashtable_get_unsafe(dict, name, name_len); + if(unlikely(!item)) { + dictionary_index_wrlock_unlock(dict); + ret = false; + } + else { + if(hashtable_delete_unsafe(dict, name, name_len, item) == 0) + netdata_log_error("DICTIONARY: INTERNAL ERROR: tried to delete item with name '%s', " + "name_len %zd that is not in the index", + name, name_len); + else + pointer_del(dict, item); + + dictionary_index_wrlock_unlock(dict); + + dict_item_free_or_mark_deleted(dict, item); + ret = true; + } + + return ret; +} + +static inline DICTIONARY_ITEM *dict_item_add_or_reset_value_and_acquire(DICTIONARY *dict, const char *name, ssize_t name_len, void *value, size_t value_len, void *constructor_data, DICTIONARY_ITEM *master_item) { + if(unlikely(!name || !*name)) { + internal_error( + true, + "DICTIONARY: attempted to %s() without a name on a dictionary created from %s() %zu@%s.", + __FUNCTION__, + dict->creation_function, + dict->creation_line, + dict->creation_file); + return NULL; + } + + if(unlikely(is_dictionary_destroyed(dict))) { + internal_error(true, "DICTIONARY: attempted to dictionary_set() on a destroyed dictionary"); + return NULL; + } + + if(name_len == -1) + name_len = (ssize_t)strlen(name); + + netdata_log_debug(D_DICTIONARY, "SET dictionary entry with name '%s'.", name); + + // DISCUSSION: + // Is it better to gain a read-lock and do a hashtable_get_unsafe() + // before we write lock to do hashtable_insert_unsafe()? + // + // Probably this depends on the use case. + // For statsd for example that does dictionary_set() to update received values, + // it could be beneficial to do a get() before we insert(). + // + // But the caller has the option to do this on his/her own. + // So, let's do the fastest here and let the caller decide the flow of calls. 
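+ // The flow below: take the index write lock and insert a slot for the name.
+ // If the slot is empty, this is a new item: create it, publish it in the slot,
+ // release the index lock, and only then link it to the linked list.
+ // Otherwise, acquire the existing item and either reset its value or run the
+ // conflict callback, depending on DICT_OPTION_DONT_OVERWRITE_VALUE.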
+ + dictionary_index_lock_wrlock(dict); + + bool added_or_updated = false; + size_t spins = 0; + DICTIONARY_ITEM *item = NULL; + do { + void *handle = hashtable_insert_unsafe(dict, name, name_len); + item = hashtable_insert_handle_to_item_unsafe(dict, handle); + if (likely(item == NULL)) { + // a new item added to the index + + // create the dictionary item + item = dict_item_create_with_hooks(dict, name, name_len, value, value_len, constructor_data, master_item); + + pointer_add(dict, item); + + hashtable_set_item_unsafe(dict, handle, item); + + // unlock the index lock, before we add it to the linked list + // DON'T DO IT THE OTHER WAY AROUND - DO NOT CROSS THE LOCKS! + dictionary_index_wrlock_unlock(dict); + + item_linked_list_add(dict, item); + + added_or_updated = true; + } + else { + pointer_check(dict, item); + + if(item_check_and_acquire_advanced(dict, item, true) != RC_ITEM_OK) { + spins++; + continue; + } + + // the item is already in the index + // so, either we will return the old one + // or overwrite the value, depending on dictionary flags + + // We should not compare the values here! + // even if they are the same, we have to do the whole job + // so that the callbacks will be called. + + if(is_view_dictionary(dict)) { + // view dictionary + // the item is already there and can be used + if(item->shared != master_item->shared) + netdata_log_error("DICTIONARY: changing the master item on a view is not supported. The previous item will remain. To change the key of an item in a view, delete it and add it again."); + } + else { + // master dictionary + // the user wants to reset its value + + if (!(dict->options & DICT_OPTION_DONT_OVERWRITE_VALUE)) { + dict_item_reset_value_with_hooks(dict, item, value, value_len, constructor_data); + added_or_updated = true; + } + + else if (dictionary_execute_conflict_callback(dict, item, value, constructor_data)) { + dictionary_version_increment(dict); + added_or_updated = true; + } + + else { + // conflict callback returned false + // we did really nothing! 
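+ // (the existing item has been acquired above, so the caller
+ // still gets it back, unchanged)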
+ ; + } + } + + dictionary_index_wrlock_unlock(dict); + } + } while(!item); + + + if(unlikely(spins > 0)) + DICTIONARY_STATS_INSERT_SPINS_PLUS(dict, spins); + + if(is_master_dictionary(dict) && added_or_updated) + dictionary_execute_react_callback(dict, item, constructor_data); + + return item; +} + +static inline DICTIONARY_ITEM *dict_item_find_and_acquire(DICTIONARY *dict, const char *name, ssize_t name_len) { + if(unlikely(!name || !*name)) { + internal_error( + true, + "DICTIONARY: attempted to %s() without a name on a dictionary created from %s() %zu@%s.", + __FUNCTION__, + dict->creation_function, + dict->creation_line, + dict->creation_file); + return NULL; + } + + if(unlikely(is_dictionary_destroyed(dict))) { + internal_error(true, "DICTIONARY: attempted to dictionary_get() on a destroyed dictionary"); + return NULL; + } + + if(name_len == -1) + name_len = (ssize_t)strlen(name); + + netdata_log_debug(D_DICTIONARY, "GET dictionary entry with name '%s'.", name); + + dictionary_index_lock_rdlock(dict); + + DICTIONARY_ITEM *item = hashtable_get_unsafe(dict, name, name_len); + if(unlikely(item && !item_check_and_acquire(dict, item))) { + item = NULL; + DICTIONARY_STATS_SEARCH_IGNORES_PLUS1(dict); + } + + dictionary_index_rdlock_unlock(dict); + + return item; +} + + +#endif //NETDATA_DICTIONARY_ITEM_H diff --git a/src/libnetdata/dictionary/dictionary-locks.h b/src/libnetdata/dictionary/dictionary-locks.h new file mode 100644 index 00000000..90e42810 --- /dev/null +++ b/src/libnetdata/dictionary/dictionary-locks.h @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_DICTIONARY_LOCKS_H +#define NETDATA_DICTIONARY_LOCKS_H + +#include "dictionary-internals.h" + +// ---------------------------------------------------------------------------- +// dictionary locks + +static inline size_t dictionary_locks_init(DICTIONARY *dict) { + if(likely(!is_dictionary_single_threaded(dict))) { + rw_spinlock_init(&dict->index.rw_spinlock); + rw_spinlock_init(&dict->items.rw_spinlock); + } + + return 0; +} + +static inline size_t dictionary_locks_destroy(DICTIONARY *dict __maybe_unused) { + return 0; +} + +static inline void ll_recursive_lock_set_thread_as_writer(DICTIONARY *dict) { + pid_t expected = 0, desired = gettid_cached(); + if(!__atomic_compare_exchange_n(&dict->items.writer_pid, &expected, desired, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) + fatal("DICTIONARY: Cannot set thread %d as exclusive writer, expected %d, desired %d, found %d.", gettid_cached(), expected, desired, __atomic_load_n(&dict->items.writer_pid, __ATOMIC_RELAXED)); +} + +static inline void ll_recursive_unlock_unset_thread_writer(DICTIONARY *dict) { + pid_t expected = gettid_cached(), desired = 0; + if(!__atomic_compare_exchange_n(&dict->items.writer_pid, &expected, desired, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) + fatal("DICTIONARY: Cannot unset thread %d as exclusive writer, expected %d, desired %d, found %d.", gettid_cached(), expected, desired, __atomic_load_n(&dict->items.writer_pid, __ATOMIC_RELAXED)); +} + +static inline bool ll_recursive_lock_is_thread_the_writer(DICTIONARY *dict) { + pid_t tid = gettid_cached(); + return tid > 0 && tid == __atomic_load_n(&dict->items.writer_pid, __ATOMIC_RELAXED); +} + +static inline void ll_recursive_lock(DICTIONARY *dict, char rw) { + if(unlikely(is_dictionary_single_threaded(dict))) + return; + + if(ll_recursive_lock_is_thread_the_writer(dict)) { + dict->items.writer_depth++; + return; + } + + if(rw == DICTIONARY_LOCK_READ || rw == 
DICTIONARY_LOCK_REENTRANT || rw == 'R') { + // read lock + rw_spinlock_read_lock(&dict->items.rw_spinlock); + } + else { + // write lock + rw_spinlock_write_lock(&dict->items.rw_spinlock); + ll_recursive_lock_set_thread_as_writer(dict); + } +} + +static inline void ll_recursive_unlock(DICTIONARY *dict, char rw) { + if(unlikely(is_dictionary_single_threaded(dict))) + return; + + if(ll_recursive_lock_is_thread_the_writer(dict) && dict->items.writer_depth > 0) { + dict->items.writer_depth--; + return; + } + + if(rw == DICTIONARY_LOCK_READ || rw == DICTIONARY_LOCK_REENTRANT || rw == 'R') { + // read unlock + + rw_spinlock_read_unlock(&dict->items.rw_spinlock); + } + else { + // write unlock + + ll_recursive_unlock_unset_thread_writer(dict); + + rw_spinlock_write_unlock(&dict->items.rw_spinlock); + } +} + +static inline void dictionary_index_lock_rdlock(DICTIONARY *dict) { + if(unlikely(is_dictionary_single_threaded(dict))) + return; + + rw_spinlock_read_lock(&dict->index.rw_spinlock); +} + +static inline void dictionary_index_rdlock_unlock(DICTIONARY *dict) { + if(unlikely(is_dictionary_single_threaded(dict))) + return; + + rw_spinlock_read_unlock(&dict->index.rw_spinlock); +} + +static inline void dictionary_index_lock_wrlock(DICTIONARY *dict) { + if(unlikely(is_dictionary_single_threaded(dict))) + return; + + rw_spinlock_write_lock(&dict->index.rw_spinlock); +} +static inline void dictionary_index_wrlock_unlock(DICTIONARY *dict) { + if(unlikely(is_dictionary_single_threaded(dict))) + return; + + rw_spinlock_write_unlock(&dict->index.rw_spinlock); +} + + +#endif //NETDATA_DICTIONARY_LOCKS_H diff --git a/src/libnetdata/dictionary/dictionary-refcount.h b/src/libnetdata/dictionary/dictionary-refcount.h new file mode 100644 index 00000000..47d5c275 --- /dev/null +++ b/src/libnetdata/dictionary/dictionary-refcount.h @@ -0,0 +1,247 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_DICTIONARY_REFCOUNT_H +#define NETDATA_DICTIONARY_REFCOUNT_H + +#include "dictionary-internals.h" + +// ---------------------------------------------------------------------------- +// reference counters + +static inline size_t reference_counter_init(DICTIONARY *dict __maybe_unused) { + // allocate memory required for reference counters + // return number of bytes + return 0; +} + +static inline size_t reference_counter_free(DICTIONARY *dict __maybe_unused) { + // free memory required for reference counters + // return number of bytes + return 0; +} + +static inline void item_acquire(DICTIONARY *dict, DICTIONARY_ITEM *item) { + REFCOUNT refcount; + + if(unlikely(is_dictionary_single_threaded(dict))) + refcount = ++item->refcount; + + else + // increment the refcount + refcount = __atomic_add_fetch(&item->refcount, 1, __ATOMIC_SEQ_CST); + + + if(refcount <= 0) { + internal_error( + true, + "DICTIONARY: attempted to acquire item which is deleted (refcount = %d): " + "'%s' on dictionary created by %s() (%zu@%s)", + refcount - 1, + item_get_name(item), + dict->creation_function, + dict->creation_line, + dict->creation_file); + + fatal( + "DICTIONARY: request to acquire item '%s', which is deleted (refcount = %d)!", + item_get_name(item), + refcount - 1); + } + + if(refcount == 1) { + // referenced items counts number of unique items referenced + // so, we increase it only when refcount == 1 + DICTIONARY_REFERENCED_ITEMS_PLUS1(dict); + + // if this is a deleted item, but the counter increased to 1 + // we need to remove it from the pending items to delete + if(item_flag_check(item, ITEM_FLAG_DELETED)) + 
DICTIONARY_PENDING_DELETES_MINUS1(dict); + } +} + +static inline void item_release(DICTIONARY *dict, DICTIONARY_ITEM *item) { + // this function may be called without any lock on the dictionary + // or even when someone else has 'write' lock on the dictionary + + bool is_deleted; + REFCOUNT refcount; + + if(unlikely(is_dictionary_single_threaded(dict))) { + is_deleted = item->flags & ITEM_FLAG_DELETED; + refcount = --item->refcount; + } + else { + // get the flags before decrementing any reference counters + // (the other way around may lead to use-after-free) + is_deleted = item_flag_check(item, ITEM_FLAG_DELETED); + + // decrement the refcount + refcount = __atomic_sub_fetch(&item->refcount, 1, __ATOMIC_RELEASE); + } + + if(refcount < 0) { + internal_error( + true, + "DICTIONARY: attempted to release item without references (refcount = %d): " + "'%s' on dictionary created by %s() (%zu@%s)", + refcount + 1, + item_get_name(item), + dict->creation_function, + dict->creation_line, + dict->creation_file); + + fatal( + "DICTIONARY: attempted to release item '%s' without references (refcount = %d)", + item_get_name(item), + refcount + 1); + } + + if(refcount == 0) { + + if(is_deleted) + DICTIONARY_PENDING_DELETES_PLUS1(dict); + + // referenced items counts number of unique items referenced + // so, we decrease it only when refcount == 0 + DICTIONARY_REFERENCED_ITEMS_MINUS1(dict); + } +} + +static inline int item_check_and_acquire_advanced(DICTIONARY *dict, DICTIONARY_ITEM *item, bool having_index_lock) { + size_t spins = 0; + REFCOUNT refcount, desired; + + int ret = RC_ITEM_OK; + + refcount = DICTIONARY_ITEM_REFCOUNT_GET(dict, item); + + do { + spins++; + + if(refcount < 0) { + // we can't use this item + ret = RC_ITEM_IS_CURRENTLY_BEING_DELETED; + break; + } + + if(item_flag_check(item, ITEM_FLAG_DELETED)) { + // we can't use this item + ret = RC_ITEM_MARKED_FOR_DELETION; + break; + } + + desired = refcount + 1; + + } while(!__atomic_compare_exchange_n(&item->refcount, &refcount, desired, false, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)); + + // if ret == ITEM_OK, we acquired the item + + if(ret == RC_ITEM_OK) { + if (unlikely(is_view_dictionary(dict) && + item_shared_flag_check(item, ITEM_FLAG_DELETED) && + !item_flag_check(item, ITEM_FLAG_DELETED))) { + // but, we can't use this item + + if (having_index_lock) { + // delete it from the hashtable + if(hashtable_delete_unsafe(dict, item_get_name(item), item->key_len, item) == 0) + netdata_log_error("DICTIONARY: INTERNAL ERROR VIEW: tried to delete item with name '%s', " + "name_len %u that is not in the index", + item_get_name(item), (KEY_LEN_TYPE)(item->key_len)); + else + pointer_del(dict, item); + + // mark it in our dictionary as deleted too, + // this is safe to be done here, because we have got + // a reference counter on item + dict_item_set_deleted(dict, item); + + // decrement the refcount we incremented above + if (__atomic_sub_fetch(&item->refcount, 1, __ATOMIC_RELEASE) == 0) { + // this is a deleted item, and we are the last one + DICTIONARY_PENDING_DELETES_PLUS1(dict); + } + + // do not touch the item below this point + } else { + // this is traversal / walkthrough + // decrement the refcount we incremented above + __atomic_sub_fetch(&item->refcount, 1, __ATOMIC_RELEASE); + } + + return RC_ITEM_MARKED_FOR_DELETION; + } + + if(desired == 1) + DICTIONARY_REFERENCED_ITEMS_PLUS1(dict); + } + + if(unlikely(spins > 1)) + DICTIONARY_STATS_CHECK_SPINS_PLUS(dict, spins - 1); + + return ret; +} + +// if a dictionary item can be deleted, return 
true, otherwise return false +// we use the private reference counter +static inline int item_is_not_referenced_and_can_be_removed_advanced(DICTIONARY *dict, DICTIONARY_ITEM *item) { + // if we can set refcount to REFCOUNT_DELETING, we can delete this item + + size_t spins = 0; + REFCOUNT refcount, desired = REFCOUNT_DELETING; + + int ret = RC_ITEM_OK; + + refcount = DICTIONARY_ITEM_REFCOUNT_GET(dict, item); + + do { + spins++; + + if(refcount < 0) { + // we can't use this item + ret = RC_ITEM_IS_CURRENTLY_BEING_DELETED; + break; + } + + if(refcount > 0) { + // we can't delete this + ret = RC_ITEM_IS_REFERENCED; + break; + } + + if(item_flag_check(item, ITEM_FLAG_BEING_CREATED)) { + // we can't use this item + ret = RC_ITEM_IS_CURRENTLY_BEING_CREATED; + break; + } + } while(!__atomic_compare_exchange_n(&item->refcount, &refcount, desired, false, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)); + +#ifdef NETDATA_INTERNAL_CHECKS + if(ret == RC_ITEM_OK) + item->deleter_pid = gettid_cached(); +#endif + + if(unlikely(spins > 1)) + DICTIONARY_STATS_DELETE_SPINS_PLUS(dict, spins - 1); + + return ret; +} + +// if a dictionary item can be freed, return true, otherwise return false +// we use the shared reference counter +static inline bool item_shared_release_and_check_if_it_can_be_freed(DICTIONARY *dict __maybe_unused, DICTIONARY_ITEM *item) { + // if we can set refcount to REFCOUNT_DELETING, we can delete this item + + REFCOUNT links = __atomic_sub_fetch(&item->shared->links, 1, __ATOMIC_RELEASE); + if(links == 0 && __atomic_compare_exchange_n(&item->shared->links, &links, REFCOUNT_DELETING, false, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) { + + // we can delete it + return true; + } + + // we can't delete it + return false; +} + +#endif //NETDATA_DICTIONARY_REFCOUNT_H diff --git a/src/libnetdata/dictionary/dictionary-statistics.h b/src/libnetdata/dictionary/dictionary-statistics.h new file mode 100644 index 00000000..20eb8159 --- /dev/null +++ b/src/libnetdata/dictionary/dictionary-statistics.h @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_DICTIONARY_STATISTICS_H +#define NETDATA_DICTIONARY_STATISTICS_H + +#include "dictionary-internals.h" + +// ---------------------------------------------------------------------------- +// memory statistics + +#ifdef DICT_WITH_STATS +static inline void DICTIONARY_STATS_PLUS_MEMORY(DICTIONARY *dict, size_t key_size, size_t item_size, size_t value_size) { + if(key_size) + __atomic_fetch_add(&dict->stats->memory.index, (long)JUDYHS_INDEX_SIZE_ESTIMATE(key_size), __ATOMIC_RELAXED); + + if(item_size) + __atomic_fetch_add(&dict->stats->memory.dict, (long)item_size, __ATOMIC_RELAXED); + + if(value_size) + __atomic_fetch_add(&dict->stats->memory.values, (long)value_size, __ATOMIC_RELAXED); +} + +static inline void DICTIONARY_STATS_MINUS_MEMORY(DICTIONARY *dict, size_t key_size, size_t item_size, size_t value_size) { + if(key_size) + __atomic_fetch_sub(&dict->stats->memory.index, (long)JUDYHS_INDEX_SIZE_ESTIMATE(key_size), __ATOMIC_RELAXED); + + if(item_size) + __atomic_fetch_sub(&dict->stats->memory.dict, (long)item_size, __ATOMIC_RELAXED); + + if(value_size) + __atomic_fetch_sub(&dict->stats->memory.values, (long)value_size, __ATOMIC_RELAXED); +} +#else +#define DICTIONARY_STATS_PLUS_MEMORY(dict, key_size, item_size, value_size) do {(void)item_size;} while(0) +#define DICTIONARY_STATS_MINUS_MEMORY(dict, key_size, item_size, value_size) do {;} while(0) +#endif + +// ---------------------------------------------------------------------------- +// 
internal statistics API + +#ifdef DICT_WITH_STATS +static inline void DICTIONARY_STATS_SEARCHES_PLUS1(DICTIONARY *dict) { + __atomic_fetch_add(&dict->stats->ops.searches, 1, __ATOMIC_RELAXED); +} +#else +#define DICTIONARY_STATS_SEARCHES_PLUS1(dict) do {;} while(0) +#endif + +static inline void DICTIONARY_ENTRIES_PLUS1(DICTIONARY *dict) { +#ifdef DICT_WITH_STATS + // statistics + __atomic_fetch_add(&dict->stats->items.entries, 1, __ATOMIC_RELAXED); + __atomic_fetch_add(&dict->stats->items.referenced, 1, __ATOMIC_RELAXED); + __atomic_fetch_add(&dict->stats->ops.inserts, 1, __ATOMIC_RELAXED); +#endif + + if(unlikely(is_dictionary_single_threaded(dict))) { + dict->version++; + dict->entries++; + dict->referenced_items++; + + } + else { + __atomic_fetch_add(&dict->version, 1, __ATOMIC_RELAXED); + __atomic_fetch_add(&dict->entries, 1, __ATOMIC_RELAXED); + __atomic_fetch_add(&dict->referenced_items, 1, __ATOMIC_RELAXED); + } +} + +static inline void DICTIONARY_ENTRIES_MINUS1(DICTIONARY *dict) { +#ifdef DICT_WITH_STATS + // statistics + __atomic_fetch_add(&dict->stats->ops.deletes, 1, __ATOMIC_RELAXED); + __atomic_fetch_sub(&dict->stats->items.entries, 1, __ATOMIC_RELAXED); +#endif + + size_t entries; (void)entries; + if(unlikely(is_dictionary_single_threaded(dict))) { + dict->version++; + entries = dict->entries--; + } + else { + __atomic_fetch_add(&dict->version, 1, __ATOMIC_RELAXED); + entries = __atomic_fetch_sub(&dict->entries, 1, __ATOMIC_RELAXED); + } + + internal_fatal(entries == 0, + "DICT: negative number of entries in dictionary created from %s() (%zu@%s)", + dict->creation_function, + dict->creation_line, + dict->creation_file); +} + +static inline void DICTIONARY_VALUE_RESETS_PLUS1(DICTIONARY *dict) { +#ifdef DICT_WITH_STATS + __atomic_fetch_add(&dict->stats->ops.resets, 1, __ATOMIC_RELAXED); +#endif + + if(unlikely(is_dictionary_single_threaded(dict))) + dict->version++; + else + __atomic_fetch_add(&dict->version, 1, __ATOMIC_RELAXED); +} + +#ifdef DICT_WITH_STATS +static inline void DICTIONARY_STATS_TRAVERSALS_PLUS1(DICTIONARY *dict) { + __atomic_fetch_add(&dict->stats->ops.traversals, 1, __ATOMIC_RELAXED); +} +static inline void DICTIONARY_STATS_WALKTHROUGHS_PLUS1(DICTIONARY *dict) { + __atomic_fetch_add(&dict->stats->ops.walkthroughs, 1, __ATOMIC_RELAXED); +} +static inline void DICTIONARY_STATS_CHECK_SPINS_PLUS(DICTIONARY *dict, size_t count) { + __atomic_fetch_add(&dict->stats->spin_locks.use_spins, count, __ATOMIC_RELAXED); +} +static inline void DICTIONARY_STATS_INSERT_SPINS_PLUS(DICTIONARY *dict, size_t count) { + __atomic_fetch_add(&dict->stats->spin_locks.insert_spins, count, __ATOMIC_RELAXED); +} +static inline void DICTIONARY_STATS_DELETE_SPINS_PLUS(DICTIONARY *dict, size_t count) { + __atomic_fetch_add(&dict->stats->spin_locks.delete_spins, count, __ATOMIC_RELAXED); +} +static inline void DICTIONARY_STATS_SEARCH_IGNORES_PLUS1(DICTIONARY *dict) { + __atomic_fetch_add(&dict->stats->spin_locks.search_spins, 1, __ATOMIC_RELAXED); +} +static inline void DICTIONARY_STATS_CALLBACK_INSERTS_PLUS1(DICTIONARY *dict) { + __atomic_fetch_add(&dict->stats->callbacks.inserts, 1, __ATOMIC_RELEASE); +} +static inline void DICTIONARY_STATS_CALLBACK_CONFLICTS_PLUS1(DICTIONARY *dict) { + __atomic_fetch_add(&dict->stats->callbacks.conflicts, 1, __ATOMIC_RELEASE); +} +static inline void DICTIONARY_STATS_CALLBACK_REACTS_PLUS1(DICTIONARY *dict) { + __atomic_fetch_add(&dict->stats->callbacks.reacts, 1, __ATOMIC_RELEASE); +} +static inline void DICTIONARY_STATS_CALLBACK_DELETES_PLUS1(DICTIONARY 
*dict) { + __atomic_fetch_add(&dict->stats->callbacks.deletes, 1, __ATOMIC_RELEASE); +} +static inline void DICTIONARY_STATS_GARBAGE_COLLECTIONS_PLUS1(DICTIONARY *dict) { + __atomic_fetch_add(&dict->stats->ops.garbage_collections, 1, __ATOMIC_RELAXED); +} +static inline void DICTIONARY_STATS_DICT_CREATIONS_PLUS1(DICTIONARY *dict) { + __atomic_fetch_add(&dict->stats->dictionaries.active, 1, __ATOMIC_RELAXED); + __atomic_fetch_add(&dict->stats->ops.creations, 1, __ATOMIC_RELAXED); +} +static inline void DICTIONARY_STATS_DICT_DESTRUCTIONS_PLUS1(DICTIONARY *dict) { + __atomic_fetch_sub(&dict->stats->dictionaries.active, 1, __ATOMIC_RELAXED); + __atomic_fetch_add(&dict->stats->ops.destructions, 1, __ATOMIC_RELAXED); +} +static inline void DICTIONARY_STATS_DICT_DESTROY_QUEUED_PLUS1(DICTIONARY *dict) { + __atomic_fetch_add(&dict->stats->dictionaries.deleted, 1, __ATOMIC_RELAXED); +} +static inline void DICTIONARY_STATS_DICT_DESTROY_QUEUED_MINUS1(DICTIONARY *dict) { + __atomic_fetch_sub(&dict->stats->dictionaries.deleted, 1, __ATOMIC_RELAXED); +} +static inline void DICTIONARY_STATS_DICT_FLUSHES_PLUS1(DICTIONARY *dict) { + __atomic_fetch_add(&dict->stats->ops.flushes, 1, __ATOMIC_RELAXED); +} +#else +#define DICTIONARY_STATS_TRAVERSALS_PLUS1(dict) do {;} while(0) +#define DICTIONARY_STATS_WALKTHROUGHS_PLUS1(dict) do {;} while(0) +#define DICTIONARY_STATS_CHECK_SPINS_PLUS(dict, count) do {;} while(0) +#define DICTIONARY_STATS_INSERT_SPINS_PLUS(dict, count) do {;} while(0) +#define DICTIONARY_STATS_DELETE_SPINS_PLUS(dict, count) do {;} while(0) +#define DICTIONARY_STATS_SEARCH_IGNORES_PLUS1(dict) do {;} while(0) +#define DICTIONARY_STATS_CALLBACK_INSERTS_PLUS1(dict) do {;} while(0) +#define DICTIONARY_STATS_CALLBACK_CONFLICTS_PLUS1(dict) do {;} while(0) +#define DICTIONARY_STATS_CALLBACK_REACTS_PLUS1(dict) do {;} while(0) +#define DICTIONARY_STATS_CALLBACK_DELETES_PLUS1(dict) do {;} while(0) +#define DICTIONARY_STATS_GARBAGE_COLLECTIONS_PLUS1(dict) do {;} while(0) +#define DICTIONARY_STATS_DICT_CREATIONS_PLUS1(dict) do {;} while(0) +#define DICTIONARY_STATS_DICT_DESTRUCTIONS_PLUS1(dict) do {;} while(0) +#define DICTIONARY_STATS_DICT_DESTROY_QUEUED_PLUS1(dict) do {;} while(0) +#define DICTIONARY_STATS_DICT_DESTROY_QUEUED_MINUS1(dict) do {;} while(0) +#define DICTIONARY_STATS_DICT_FLUSHES_PLUS1(dict) do {;} while(0) +#endif + +static inline void DICTIONARY_REFERENCED_ITEMS_PLUS1(DICTIONARY *dict) { +#ifdef DICT_WITH_STATS + __atomic_fetch_add(&dict->stats->items.referenced, 1, __ATOMIC_RELAXED); +#endif + + if(unlikely(is_dictionary_single_threaded(dict))) + ++dict->referenced_items; + else + __atomic_add_fetch(&dict->referenced_items, 1, __ATOMIC_RELAXED); +} + +static inline void DICTIONARY_REFERENCED_ITEMS_MINUS1(DICTIONARY *dict) { +#ifdef DICT_WITH_STATS + __atomic_fetch_sub(&dict->stats->items.referenced, 1, __ATOMIC_RELAXED); +#endif + + long int referenced_items; (void)referenced_items; + if(unlikely(is_dictionary_single_threaded(dict))) + referenced_items = --dict->referenced_items; + else + referenced_items = __atomic_sub_fetch(&dict->referenced_items, 1, __ATOMIC_SEQ_CST); + + internal_fatal(referenced_items < 0, + "DICT: negative number of referenced items (%ld) in dictionary created from %s() (%zu@%s)", + referenced_items, + dict->creation_function, + dict->creation_line, + dict->creation_file); +} + +static inline void DICTIONARY_PENDING_DELETES_PLUS1(DICTIONARY *dict) { +#ifdef DICT_WITH_STATS + __atomic_fetch_add(&dict->stats->items.pending_deletion, 1, __ATOMIC_RELAXED); +#endif + + 
if(unlikely(is_dictionary_single_threaded(dict))) + ++dict->pending_deletion_items; + else + __atomic_add_fetch(&dict->pending_deletion_items, 1, __ATOMIC_RELEASE); +} + +static inline long int DICTIONARY_PENDING_DELETES_MINUS1(DICTIONARY *dict) { +#ifdef DICT_WITH_STATS + __atomic_fetch_sub(&dict->stats->items.pending_deletion, 1, __ATOMIC_RELEASE); +#endif + + if(unlikely(is_dictionary_single_threaded(dict))) + return --dict->pending_deletion_items; + else + return __atomic_sub_fetch(&dict->pending_deletion_items, 1, __ATOMIC_ACQUIRE); +} + +static inline long int DICTIONARY_PENDING_DELETES_GET(DICTIONARY *dict) { + if(unlikely(is_dictionary_single_threaded(dict))) + return dict->pending_deletion_items; + else + return __atomic_load_n(&dict->pending_deletion_items, __ATOMIC_SEQ_CST); +} + +static inline REFCOUNT DICTIONARY_ITEM_REFCOUNT_GET(DICTIONARY *dict, DICTIONARY_ITEM *item) { + if(unlikely(dict && is_dictionary_single_threaded(dict))) // this is an exception, dict can be null + return item->refcount; + else + return (REFCOUNT)__atomic_load_n(&item->refcount, __ATOMIC_ACQUIRE); +} + +static inline REFCOUNT DICTIONARY_ITEM_REFCOUNT_GET_SOLE(DICTIONARY_ITEM *item) { + return (REFCOUNT)__atomic_load_n(&item->refcount, __ATOMIC_ACQUIRE); +} + + +#endif //NETDATA_DICTIONARY_STATISTICS_H diff --git a/src/libnetdata/dictionary/dictionary-traversal.c b/src/libnetdata/dictionary/dictionary-traversal.c new file mode 100644 index 00000000..1e55dcbb --- /dev/null +++ b/src/libnetdata/dictionary/dictionary-traversal.c @@ -0,0 +1,268 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "dictionary-internals.h" + + +// ---------------------------------------------------------------------------- +// traversal with loop + +void *dictionary_foreach_start_rw(DICTFE *dfe, DICTIONARY *dict, char rw) { + if(unlikely(!dfe || !dict)) return NULL; + + DICTIONARY_STATS_TRAVERSALS_PLUS1(dict); + + if(unlikely(is_dictionary_destroyed(dict))) { + internal_error(true, "DICTIONARY: attempted to dictionary_foreach_start_rw() on a destroyed dictionary"); + dfe->counter = 0; + dfe->item = NULL; + dfe->name = NULL; + dfe->value = NULL; + return NULL; + } + + dfe->counter = 0; + dfe->dict = dict; + dfe->rw = rw; + dfe->locked = true; + ll_recursive_lock(dict, dfe->rw); + + // get the first item from the list + DICTIONARY_ITEM *item = dict->items.list; + + // skip all the deleted items + while(item && !item_check_and_acquire(dict, item)) + item = item->next; + + if(likely(item)) { + dfe->item = item; + dfe->name = (char *)item_get_name(item); + dfe->value = item->shared->value; + } + else { + dfe->item = NULL; + dfe->name = NULL; + dfe->value = NULL; + } + + if(unlikely(dfe->rw == DICTIONARY_LOCK_REENTRANT)) { + ll_recursive_unlock(dfe->dict, dfe->rw); + dfe->locked = false; + } + + return dfe->value; +} + +void *dictionary_foreach_next(DICTFE *dfe) { + if(unlikely(!dfe || !dfe->dict)) return NULL; + + if(unlikely(is_dictionary_destroyed(dfe->dict))) { + internal_error(true, "DICTIONARY: attempted to dictionary_foreach_next() on a destroyed dictionary"); + dfe->item = NULL; + dfe->name = NULL; + dfe->value = NULL; + return NULL; + } + + if(unlikely(dfe->rw == DICTIONARY_LOCK_REENTRANT) || !dfe->locked) { + ll_recursive_lock(dfe->dict, dfe->rw); + dfe->locked = true; + } + + // the item we just did + DICTIONARY_ITEM *item = dfe->item; + + // get the next item from the list + DICTIONARY_ITEM *item_next = (item) ? 
item->next : NULL;
+
+    // skip all the deleted items until one that can be acquired is found
+    while(item_next && !item_check_and_acquire(dfe->dict, item_next))
+        item_next = item_next->next;
+
+    if(likely(item)) {
+        dict_item_release_and_check_if_it_is_deleted_and_can_be_removed_under_this_lock_mode(dfe->dict, item, dfe->rw);
+        // item_release(dfe->dict, item);
+    }
+
+    item = item_next;
+    if(likely(item)) {
+        dfe->item = item;
+        dfe->name = (char *)item_get_name(item);
+        dfe->value = item->shared->value;
+        dfe->counter++;
+    }
+    else {
+        dfe->item = NULL;
+        dfe->name = NULL;
+        dfe->value = NULL;
+    }
+
+    if(unlikely(dfe->rw == DICTIONARY_LOCK_REENTRANT)) {
+        ll_recursive_unlock(dfe->dict, dfe->rw);
+        dfe->locked = false;
+    }
+
+    return dfe->value;
+}
+
+void dictionary_foreach_unlock(DICTFE *dfe) {
+    if(dfe->locked) {
+        ll_recursive_unlock(dfe->dict, dfe->rw);
+        dfe->locked = false;
+    }
+}
+
+void dictionary_foreach_done(DICTFE *dfe) {
+    if(unlikely(!dfe || !dfe->dict)) return;
+
+    if(unlikely(is_dictionary_destroyed(dfe->dict))) {
+        internal_error(true, "DICTIONARY: attempted to dictionary_foreach_done() on a destroyed dictionary");
+        return;
+    }
+
+    // the item we just did
+    DICTIONARY_ITEM *item = dfe->item;
+
+    // release it, so that it can possibly be deleted
+    if(likely(item)) {
+        dict_item_release_and_check_if_it_is_deleted_and_can_be_removed_under_this_lock_mode(dfe->dict, item, dfe->rw);
+        // item_release(dfe->dict, item);
+    }
+
+    if(likely(dfe->rw != DICTIONARY_LOCK_REENTRANT) && dfe->locked) {
+        ll_recursive_unlock(dfe->dict, dfe->rw);
+        dfe->locked = false;
+    }
+
+    dfe->dict = NULL;
+    dfe->item = NULL;
+    dfe->name = NULL;
+    dfe->value = NULL;
+    dfe->counter = 0;
+}
+
+// ----------------------------------------------------------------------------
+// API - walk through the dictionary.
+// The dictionary is locked for reading while this happens
+// do not use other dictionary calls while walking the dictionary - deadlock!
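+
+/*
+ * Usage sketch (illustrative only, not part of this file): counting the items
+ * of a dictionary with a read walkthrough. Non-negative callback return values
+ * are summed and returned by the walkthrough; a negative return value stops
+ * the traversal immediately and is returned as-is. dictionary_walkthrough_read()
+ * is the read-locked wrapper used by the unit tests further below.
+ *
+ *     static int count_cb(const DICTIONARY_ITEM *item, void *value, void *data) {
+ *         (void)item; (void)value;
+ *         size_t *count = data;      // caller-provided accumulator
+ *         (*count)++;
+ *         return 1;                  // summed into the walkthrough's return value
+ *     }
+ *
+ *     size_t count = 0;
+ *     dictionary_walkthrough_read(dict, count_cb, &count);
+ */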
+
+int dictionary_walkthrough_rw(DICTIONARY *dict, char rw, dict_walkthrough_callback_t walkthrough_callback, void *data) {
+    if(unlikely(!dict || !walkthrough_callback)) return 0;
+
+    if(unlikely(is_dictionary_destroyed(dict))) {
+        internal_error(true, "DICTIONARY: attempted to dictionary_walkthrough_rw() on a destroyed dictionary");
+        return 0;
+    }
+
+    ll_recursive_lock(dict, rw);
+
+    DICTIONARY_STATS_WALKTHROUGHS_PLUS1(dict);
+
+    // written in such a way that the callback can delete the active element
+
+    int ret = 0;
+    DICTIONARY_ITEM *item = dict->items.list, *item_next;
+    while(item) {
+
+        // skip the deleted items
+        if(unlikely(!item_check_and_acquire(dict, item))) {
+            item = item->next;
+            continue;
+        }
+
+        if(unlikely(rw == DICTIONARY_LOCK_REENTRANT))
+            ll_recursive_unlock(dict, rw);
+
+        int r = walkthrough_callback(item, item->shared->value, data);
+
+        if(unlikely(rw == DICTIONARY_LOCK_REENTRANT))
+            ll_recursive_lock(dict, rw);
+
+        // since we have a reference counter, this item cannot be deleted
+        // until we release the reference counter, so the pointers are there
+        item_next = item->next;
+
+        dict_item_release_and_check_if_it_is_deleted_and_can_be_removed_under_this_lock_mode(dict, item, rw);
+        // item_release(dict, item);
+
+        if(unlikely(r < 0)) {
+            ret = r;
+            break;
+        }
+
+        ret += r;
+
+        item = item_next;
+    }
+
+    ll_recursive_unlock(dict, rw);
+
+    return ret;
+}
+
+// ----------------------------------------------------------------------------
+// sorted walkthrough
+
+typedef int (*qsort_compar)(const void *item1, const void *item2);
+
+static int dictionary_sort_compar(const void *item1, const void *item2) {
+    return strcmp(item_get_name((*(DICTIONARY_ITEM **)item1)), item_get_name((*(DICTIONARY_ITEM **)item2)));
+}
+
+int dictionary_sorted_walkthrough_rw(DICTIONARY *dict, char rw, dict_walkthrough_callback_t walkthrough_callback, void *data, dict_item_comparator_t item_comparator) {
+    if(unlikely(!dict || !walkthrough_callback)) return 0;
+
+    if(unlikely(is_dictionary_destroyed(dict))) {
+        internal_error(true, "DICTIONARY: attempted to dictionary_sorted_walkthrough_rw() on a destroyed dictionary");
+        return 0;
+    }
+
+    DICTIONARY_STATS_WALKTHROUGHS_PLUS1(dict);
+
+    ll_recursive_lock(dict, rw);
+    size_t entries = __atomic_load_n(&dict->entries, __ATOMIC_RELAXED);
+    DICTIONARY_ITEM **array = mallocz(sizeof(DICTIONARY_ITEM *) * entries);
+
+    size_t i;
+    DICTIONARY_ITEM *item;
+    for(item = dict->items.list, i = 0; item && i < entries; item = item->next) {
+        if(likely(item_check_and_acquire(dict, item)))
+            array[i++] = item;
+    }
+    ll_recursive_unlock(dict, rw);
+
+    if(unlikely(i != entries))
+        entries = i;
+
+    if(item_comparator)
+        qsort(array, entries, sizeof(DICTIONARY_ITEM *), (qsort_compar) item_comparator);
+    else
+        qsort(array, entries, sizeof(DICTIONARY_ITEM *), dictionary_sort_compar);
+
+    bool callit = true;
+    int ret = 0, r = 0;
+    for(i = 0; i < entries ;i++) {
+        item = array[i];
+
+        if(callit)
+            r = walkthrough_callback(item, item->shared->value, data);
+
+        dict_item_release_and_check_if_it_is_deleted_and_can_be_removed_under_this_lock_mode(dict, item, rw);
+        // item_release(dict, item);
+
+        if(r < 0) {
+            ret = r;
+            r = 0;
+
+            // stop calling the callback,
+            // but we have to continue, to release all the reference counters
+            callit = false;
+        }
+        else
+            ret += r;
+    }
+
+    freez(array);
+
+    return ret;
+}
+
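+/*
+ * Usage sketch (illustrative only, not part of this file): the dfe_start_read()
+ * / dfe_done() macros wrap dictionary_foreach_start_rw() and friends into a
+ * for-like loop. For a loop variable named X, the underlying DICTFE is exposed
+ * as X_dfe, so the current key is available as X_dfe.name - the unit tests
+ * below rely on this. Assuming the dictionary stores C strings as values:
+ *
+ *     char *v;
+ *     dfe_start_read(dict, v) {
+ *         printf("%s = %s\n", v_dfe.name, v);
+ *     }
+ *     dfe_done(v);
+ */
+
diff --git a/src/libnetdata/dictionary/dictionary-unittest.c b/src/libnetdata/dictionary/dictionary-unittest.c
new file mode 100644
index 00000000..716d194a
--- 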
/dev/null +++ b/src/libnetdata/dictionary/dictionary-unittest.c @@ -0,0 +1,1190 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "dictionary-internals.h" + +// ---------------------------------------------------------------------------- +// unit test + +static void dictionary_unittest_free_char_pp(char **pp, size_t entries) { + for(size_t i = 0; i < entries ;i++) + freez(pp[i]); + + freez(pp); +} + +static char **dictionary_unittest_generate_names(size_t entries) { + char **names = mallocz(sizeof(char *) * entries); + for(size_t i = 0; i < entries ;i++) { + char buf[25 + 1] = ""; + snprintfz(buf, sizeof(buf), "name.%zu.0123456789.%zu!@#$%%^&*(),./[]{}\\|~`", i, entries / 2 + i); + names[i] = strdupz(buf); + } + return names; +} + +static char **dictionary_unittest_generate_values(size_t entries) { + char **values = mallocz(sizeof(char *) * entries); + for(size_t i = 0; i < entries ;i++) { + char buf[25 + 1] = ""; + snprintfz(buf, sizeof(buf), "value-%zu-0987654321.%zu%%^&*(),. \t !@#$/[]{}\\|~`", i, entries / 2 + i); + values[i] = strdupz(buf); + } + return values; +} + +static size_t dictionary_unittest_set_clone(DICTIONARY *dict, char **names, char **values, size_t entries) { + size_t errors = 0; + for(size_t i = 0; i < entries ;i++) { + size_t vallen = strlen(values[i]); + char *val = (char *)dictionary_set(dict, names[i], values[i], vallen); + if(val == values[i]) { fprintf(stderr, ">>> %s() returns reference to value\n", __FUNCTION__); errors++; } + if(!val || memcmp(val, values[i], vallen) != 0) { fprintf(stderr, ">>> %s() returns invalid value\n", __FUNCTION__); errors++; } + } + return errors; +} + +static size_t dictionary_unittest_set_null(DICTIONARY *dict, char **names, char **values, size_t entries) { + (void)values; + size_t errors = 0; + size_t i = 0; + for(; i < entries ;i++) { + void *val = dictionary_set(dict, names[i], NULL, 0); + if(val != NULL) { fprintf(stderr, ">>> %s() returns a non NULL value\n", __FUNCTION__); errors++; } + } + if(dictionary_entries(dict) != i) { + fprintf(stderr, ">>> %s() dictionary items do not match\n", __FUNCTION__); + errors++; + } + return errors; +} + + +static size_t dictionary_unittest_set_nonclone(DICTIONARY *dict, char **names, char **values, size_t entries) { + size_t errors = 0; + for(size_t i = 0; i < entries ;i++) { + size_t vallen = strlen(values[i]); + char *val = (char *)dictionary_set(dict, names[i], values[i], vallen); + if(val != values[i]) { fprintf(stderr, ">>> %s() returns invalid pointer to value\n", __FUNCTION__); errors++; } + } + return errors; +} + +static size_t dictionary_unittest_get_clone(DICTIONARY *dict, char **names, char **values, size_t entries) { + size_t errors = 0; + for(size_t i = 0; i < entries ;i++) { + size_t vallen = strlen(values[i]); + char *val = (char *)dictionary_get(dict, names[i]); + if(val == values[i]) { fprintf(stderr, ">>> %s() returns reference to value\n", __FUNCTION__); errors++; } + if(!val || memcmp(val, values[i], vallen) != 0) { fprintf(stderr, ">>> %s() returns invalid value\n", __FUNCTION__); errors++; } + } + return errors; +} + +static size_t dictionary_unittest_get_nonclone(DICTIONARY *dict, char **names, char **values, size_t entries) { + size_t errors = 0; + for(size_t i = 0; i < entries ;i++) { + char *val = (char *)dictionary_get(dict, names[i]); + if(val != values[i]) { fprintf(stderr, ">>> %s() returns invalid pointer to value\n", __FUNCTION__); errors++; } + } + return errors; +} + +static size_t dictionary_unittest_get_nonexisting(DICTIONARY *dict, char 
**names, char **values, size_t entries) { + (void)names; + size_t errors = 0; + for(size_t i = 0; i < entries ;i++) { + char *val = (char *)dictionary_get(dict, values[i]); + if(val) { fprintf(stderr, ">>> %s() returns non-existing item\n", __FUNCTION__); errors++; } + } + return errors; +} + +static size_t dictionary_unittest_del_nonexisting(DICTIONARY *dict, char **names, char **values, size_t entries) { + (void)names; + size_t errors = 0; + for(size_t i = 0; i < entries ;i++) { + bool ret = dictionary_del(dict, values[i]); + if(ret) { fprintf(stderr, ">>> %s() deleted non-existing item\n", __FUNCTION__); errors++; } + } + return errors; +} + +static size_t dictionary_unittest_del_existing(DICTIONARY *dict, char **names, char **values, size_t entries) { + (void)values; + size_t errors = 0; + + size_t forward_from = 0, forward_to = entries / 3; + size_t middle_from = forward_to, middle_to = entries * 2 / 3; + size_t backward_from = middle_to, backward_to = entries; + + for(size_t i = forward_from; i < forward_to ;i++) { + bool ret = dictionary_del(dict, names[i]); + if(!ret) { fprintf(stderr, ">>> %s() didn't delete (forward) existing item\n", __FUNCTION__); errors++; } + } + + for(size_t i = middle_to - 1; i >= middle_from ;i--) { + bool ret = dictionary_del(dict, names[i]); + if(!ret) { fprintf(stderr, ">>> %s() didn't delete (middle) existing item\n", __FUNCTION__); errors++; } + } + + for(size_t i = backward_to - 1; i >= backward_from ;i--) { + bool ret = dictionary_del(dict, names[i]); + if(!ret) { fprintf(stderr, ">>> %s() didn't delete (backward) existing item\n", __FUNCTION__); errors++; } + } + + return errors; +} + +static size_t dictionary_unittest_reset_clone(DICTIONARY *dict, char **names, char **values, size_t entries) { + (void)values; + // set the name as value too + size_t errors = 0; + for(size_t i = 0; i < entries ;i++) { + size_t vallen = strlen(names[i]); + char *val = (char *)dictionary_set(dict, names[i], names[i], vallen); + if(val == names[i]) { fprintf(stderr, ">>> %s() returns reference to value\n", __FUNCTION__); errors++; } + if(!val || memcmp(val, names[i], vallen) != 0) { fprintf(stderr, ">>> %s() returns invalid value\n", __FUNCTION__); errors++; } + } + return errors; +} + +static size_t dictionary_unittest_reset_nonclone(DICTIONARY *dict, char **names, char **values, size_t entries) { + (void)values; + // set the name as value too + size_t errors = 0; + for(size_t i = 0; i < entries ;i++) { + size_t vallen = strlen(names[i]); + char *val = (char *)dictionary_set(dict, names[i], names[i], vallen); + if(val != names[i]) { fprintf(stderr, ">>> %s() returns invalid pointer to value\n", __FUNCTION__); errors++; } + if(!val) { fprintf(stderr, ">>> %s() returns invalid value\n", __FUNCTION__); errors++; } + } + return errors; +} + +static size_t dictionary_unittest_reset_dont_overwrite_nonclone(DICTIONARY *dict, char **names, char **values, size_t entries) { + // set the name as value too + size_t errors = 0; + for(size_t i = 0; i < entries ;i++) { + size_t vallen = strlen(names[i]); + char *val = (char *)dictionary_set(dict, names[i], names[i], vallen); + if(val != values[i]) { fprintf(stderr, ">>> %s() returns invalid pointer to value\n", __FUNCTION__); errors++; } + } + return errors; +} + +static int dictionary_unittest_walkthrough_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value __maybe_unused, void *data __maybe_unused) { + return 1; +} + +static size_t dictionary_unittest_walkthrough(DICTIONARY *dict, char **names, char **values, size_t 
entries) { + (void)names; + (void)values; + int sum = dictionary_walkthrough_read(dict, dictionary_unittest_walkthrough_callback, NULL); + if(sum < (int)entries) return entries - sum; + else return sum - entries; +} + +static int dictionary_unittest_walkthrough_delete_this_callback(const DICTIONARY_ITEM *item, void *value __maybe_unused, void *data) { + const char *name = dictionary_acquired_item_name((DICTIONARY_ITEM *)item); + + if(!dictionary_del((DICTIONARY *)data, name)) + return 0; + + return 1; +} + +static size_t dictionary_unittest_walkthrough_delete_this(DICTIONARY *dict, char **names, char **values, size_t entries) { + (void)names; + (void)values; + int sum = dictionary_walkthrough_write(dict, dictionary_unittest_walkthrough_delete_this_callback, dict); + if(sum < (int)entries) return entries - sum; + else return sum - entries; +} + +static int dictionary_unittest_walkthrough_stop_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value __maybe_unused, void *data __maybe_unused) { + return -1; +} + +static size_t dictionary_unittest_walkthrough_stop(DICTIONARY *dict, char **names, char **values, size_t entries) { + (void)names; + (void)values; + (void)entries; + int sum = dictionary_walkthrough_read(dict, dictionary_unittest_walkthrough_stop_callback, NULL); + if(sum != -1) return 1; + return 0; +} + +static size_t dictionary_unittest_foreach(DICTIONARY *dict, char **names, char **values, size_t entries) { + (void)names; + (void)values; + (void)entries; + size_t count = 0; + char *item; + dfe_start_read(dict, item) + count++; + dfe_done(item); + + if(count > entries) return count - entries; + return entries - count; +} + +static size_t dictionary_unittest_foreach_delete_this(DICTIONARY *dict, char **names, char **values, size_t entries) { + (void)names; + (void)values; + (void)entries; + size_t count = 0; + char *item; + dfe_start_write(dict, item) + if(dictionary_del(dict, item_dfe.name)) count++; + dfe_done(item); + + if(count > entries) return count - entries; + return entries - count; +} + +static size_t dictionary_unittest_destroy(DICTIONARY *dict, char **names, char **values, size_t entries) { + (void)names; + (void)values; + (void)entries; + size_t bytes = dictionary_destroy(dict); + fprintf(stderr, " %s() freed %zu bytes,", __FUNCTION__, bytes); + return 0; +} + +static usec_t dictionary_unittest_run_and_measure_time(DICTIONARY *dict, char *message, char **names, char **values, size_t entries, size_t *errors, size_t (*callback)(DICTIONARY *dict, char **names, char **values, size_t entries)) { + fprintf(stderr, "%40s ... 
", message); + + usec_t started = now_realtime_usec(); + size_t errs = callback(dict, names, values, entries); + usec_t ended = now_realtime_usec(); + usec_t dt = ended - started; + + if(callback == dictionary_unittest_destroy) dict = NULL; + + long int found_ok = 0, found_deleted = 0, found_referenced = 0; + if(dict) { + DICTIONARY_ITEM *item; + DOUBLE_LINKED_LIST_FOREACH_FORWARD(dict->items.list, item, prev, next) { + if(item->refcount >= 0 && !(item ->flags & ITEM_FLAG_DELETED)) + found_ok++; + else + found_deleted++; + + if(item->refcount > 0) + found_referenced++; + } + } + + fprintf(stderr, " %zu errors, %d (found %ld) items in dictionary, %d (found %ld) referenced, %d (found %ld) deleted, %"PRIu64" usec \n", + errs, dict?dict->entries:0, found_ok, dict?dict->referenced_items:0, found_referenced, dict?dict->pending_deletion_items:0, found_deleted, dt); + *errors += errs; + return dt; +} + +static void dictionary_unittest_clone(DICTIONARY *dict, char **names, char **values, size_t entries, size_t *errors) { + dictionary_unittest_run_and_measure_time(dict, "adding entries", names, values, entries, errors, dictionary_unittest_set_clone); + dictionary_unittest_run_and_measure_time(dict, "getting entries", names, values, entries, errors, dictionary_unittest_get_clone); + dictionary_unittest_run_and_measure_time(dict, "getting non-existing entries", names, values, entries, errors, dictionary_unittest_get_nonexisting); + dictionary_unittest_run_and_measure_time(dict, "resetting entries", names, values, entries, errors, dictionary_unittest_reset_clone); + dictionary_unittest_run_and_measure_time(dict, "deleting non-existing entries", names, values, entries, errors, dictionary_unittest_del_nonexisting); + dictionary_unittest_run_and_measure_time(dict, "traverse foreach read loop", names, values, entries, errors, dictionary_unittest_foreach); + dictionary_unittest_run_and_measure_time(dict, "walkthrough read callback", names, values, entries, errors, dictionary_unittest_walkthrough); + dictionary_unittest_run_and_measure_time(dict, "walkthrough read callback stop", names, values, entries, errors, dictionary_unittest_walkthrough_stop); + dictionary_unittest_run_and_measure_time(dict, "deleting existing entries", names, values, entries, errors, dictionary_unittest_del_existing); + dictionary_unittest_run_and_measure_time(dict, "walking through empty", names, values, 0, errors, dictionary_unittest_walkthrough); + dictionary_unittest_run_and_measure_time(dict, "traverse foreach empty", names, values, 0, errors, dictionary_unittest_foreach); + dictionary_unittest_run_and_measure_time(dict, "destroying empty dictionary", names, values, entries, errors, dictionary_unittest_destroy); +} + +static void dictionary_unittest_nonclone(DICTIONARY *dict, char **names, char **values, size_t entries, size_t *errors) { + dictionary_unittest_run_and_measure_time(dict, "adding entries", names, values, entries, errors, dictionary_unittest_set_nonclone); + dictionary_unittest_run_and_measure_time(dict, "getting entries", names, values, entries, errors, dictionary_unittest_get_nonclone); + dictionary_unittest_run_and_measure_time(dict, "getting non-existing entries", names, values, entries, errors, dictionary_unittest_get_nonexisting); + dictionary_unittest_run_and_measure_time(dict, "resetting entries", names, values, entries, errors, dictionary_unittest_reset_nonclone); + dictionary_unittest_run_and_measure_time(dict, "deleting non-existing entries", names, values, entries, errors, 
dictionary_unittest_del_nonexisting); + dictionary_unittest_run_and_measure_time(dict, "traverse foreach read loop", names, values, entries, errors, dictionary_unittest_foreach); + dictionary_unittest_run_and_measure_time(dict, "walkthrough read callback", names, values, entries, errors, dictionary_unittest_walkthrough); + dictionary_unittest_run_and_measure_time(dict, "walkthrough read callback stop", names, values, entries, errors, dictionary_unittest_walkthrough_stop); + dictionary_unittest_run_and_measure_time(dict, "deleting existing entries", names, values, entries, errors, dictionary_unittest_del_existing); + dictionary_unittest_run_and_measure_time(dict, "walking through empty", names, values, 0, errors, dictionary_unittest_walkthrough); + dictionary_unittest_run_and_measure_time(dict, "traverse foreach empty", names, values, 0, errors, dictionary_unittest_foreach); + dictionary_unittest_run_and_measure_time(dict, "destroying empty dictionary", names, values, entries, errors, dictionary_unittest_destroy); +} + +struct dictionary_unittest_sorting { + const char *old_name; + const char *old_value; + size_t count; +}; + +static int dictionary_unittest_sorting_callback(const DICTIONARY_ITEM *item, void *value, void *data) { + const char *name = dictionary_acquired_item_name((DICTIONARY_ITEM *)item); + struct dictionary_unittest_sorting *t = (struct dictionary_unittest_sorting *)data; + const char *v = (const char *)value; + + int ret = 0; + if(t->old_name && strcmp(t->old_name, name) > 0) { + fprintf(stderr, "name '%s' should be after '%s'\n", t->old_name, name); + ret = 1; + } + t->count++; + t->old_name = name; + t->old_value = v; + + return ret; +} + +static size_t dictionary_unittest_sorted_walkthrough(DICTIONARY *dict, char **names, char **values, size_t entries) { + (void)names; + (void)values; + struct dictionary_unittest_sorting tmp = { .old_name = NULL, .old_value = NULL, .count = 0 }; + size_t errors; + errors = dictionary_sorted_walkthrough_read(dict, dictionary_unittest_sorting_callback, &tmp); + + if(tmp.count != entries) { + fprintf(stderr, "Expected %zu entries, counted %zu\n", entries, tmp.count); + errors++; + } + return errors; +} + +static void dictionary_unittest_sorting(DICTIONARY *dict, char **names, char **values, size_t entries, size_t *errors) { + dictionary_unittest_run_and_measure_time(dict, "adding entries", names, values, entries, errors, dictionary_unittest_set_clone); + dictionary_unittest_run_and_measure_time(dict, "sorted walkthrough", names, values, entries, errors, dictionary_unittest_sorted_walkthrough); +} + +static void dictionary_unittest_null_dfe(DICTIONARY *dict, char **names, char **values, size_t entries, size_t *errors) { + dictionary_unittest_run_and_measure_time(dict, "adding null value entries", names, values, entries, errors, dictionary_unittest_set_null); + dictionary_unittest_run_and_measure_time(dict, "traverse foreach read loop", names, values, entries, errors, dictionary_unittest_foreach); +} + + +static int unittest_check_dictionary_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value __maybe_unused, void *data __maybe_unused) { + return 1; +} + +static size_t unittest_check_dictionary(const char *label, DICTIONARY *dict, size_t traversable, size_t active_items, size_t deleted_items, size_t referenced_items, size_t pending_deletion) { + size_t errors = 0; + + size_t ll = 0; + void *t; + dfe_start_read(dict, t) + ll++; + dfe_done(t); + + fprintf(stderr, "DICT %-20s: dictionary foreach entries %zu, expected 
%zu...\t\t\t\t\t", + label, ll, traversable); + if(ll != traversable) { + fprintf(stderr, "FAILED\n"); + errors++; + } + else + fprintf(stderr, "OK\n"); + + ll = dictionary_walkthrough_read(dict, unittest_check_dictionary_callback, NULL); + fprintf(stderr, "DICT %-20s: dictionary walkthrough entries %zu, expected %zu...\t\t\t\t", + label, ll, traversable); + if(ll != traversable) { + fprintf(stderr, "FAILED\n"); + errors++; + } + else + fprintf(stderr, "OK\n"); + + ll = dictionary_sorted_walkthrough_read(dict, unittest_check_dictionary_callback, NULL); + fprintf(stderr, "DICT %-20s: dictionary sorted walkthrough entries %zu, expected %zu...\t\t\t", + label, ll, traversable); + if(ll != traversable) { + fprintf(stderr, "FAILED\n"); + errors++; + } + else + fprintf(stderr, "OK\n"); + + DICTIONARY_ITEM *item; + size_t active = 0, deleted = 0, referenced = 0, pending = 0; + for(item = dict->items.list; item; item = item->next) { + if(!(item->flags & ITEM_FLAG_DELETED) && !(item->shared->flags & ITEM_FLAG_DELETED)) + active++; + else { + deleted++; + + if(item->refcount == 0) + pending++; + } + + if(item->refcount > 0) + referenced++; + } + + fprintf(stderr, "DICT %-20s: dictionary active items reported %d, counted %zu, expected %zu...\t\t\t", + label, dict->entries, active, active_items); + if(active != active_items || active != (size_t)dict->entries) { + fprintf(stderr, "FAILED\n"); + errors++; + } + else + fprintf(stderr, "OK\n"); + + fprintf(stderr, "DICT %-20s: dictionary deleted items counted %zu, expected %zu...\t\t\t\t", + label, deleted, deleted_items); + if(deleted != deleted_items) { + fprintf(stderr, "FAILED\n"); + errors++; + } + else + fprintf(stderr, "OK\n"); + + fprintf(stderr, "DICT %-20s: dictionary referenced items reported %d, counted %zu, expected %zu...\t\t", + label, dict->referenced_items, referenced, referenced_items); + if(referenced != referenced_items || dict->referenced_items != (long int)referenced) { + fprintf(stderr, "FAILED\n"); + errors++; + } + else + fprintf(stderr, "OK\n"); + + fprintf(stderr, "DICT %-20s: dictionary pending deletion items reported %d, counted %zu, expected %zu...\t", + label, dict->pending_deletion_items, pending, pending_deletion); + if(pending != pending_deletion || pending != (size_t)dict->pending_deletion_items) { + fprintf(stderr, "FAILED\n"); + errors++; + } + else + fprintf(stderr, "OK\n"); + + return errors; +} + +static int check_item_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *data) { + return value == data; +} + +static size_t unittest_check_item(const char *label, DICTIONARY *dict, + DICTIONARY_ITEM *item, const char *name, const char *value, int refcount, + ITEM_FLAGS deleted_flags, bool searchable, bool browsable, bool linked) { + size_t errors = 0; + + fprintf(stderr, "ITEM %-20s: name is '%s', expected '%s'...\t\t\t\t\t\t", label, item_get_name(item), name); + if(strcmp(item_get_name(item), name) != 0) { + fprintf(stderr, "FAILED\n"); + errors++; + } + else + fprintf(stderr, "OK\n"); + + fprintf(stderr, "ITEM %-20s: value is '%s', expected '%s'...\t\t\t\t\t", label, (const char *)item->shared->value, value); + if(strcmp((const char *)item->shared->value, value) != 0) { + fprintf(stderr, "FAILED\n"); + errors++; + } + else + fprintf(stderr, "OK\n"); + + fprintf(stderr, "ITEM %-20s: refcount is %d, expected %d...\t\t\t\t\t\t\t", label, item->refcount, refcount); + if (item->refcount != refcount) { + fprintf(stderr, "FAILED\n"); + errors++; + } + else + fprintf(stderr, "OK\n"); + + fprintf(stderr, 
"ITEM %-20s: deleted flag is %s, expected %s...\t\t\t\t\t", label, + (item->flags & ITEM_FLAG_DELETED || item->shared->flags & ITEM_FLAG_DELETED)?"true":"false", + (deleted_flags & ITEM_FLAG_DELETED)?"true":"false"); + + if ((item->flags & ITEM_FLAG_DELETED || item->shared->flags & ITEM_FLAG_DELETED) != (deleted_flags & ITEM_FLAG_DELETED)) { + fprintf(stderr, "FAILED\n"); + errors++; + } + else + fprintf(stderr, "OK\n"); + + void *v = dictionary_get(dict, name); + bool found = v == item->shared->value; + fprintf(stderr, "ITEM %-20s: searchable %5s, expected %5s...\t\t\t\t\t\t", label, + found?"true":"false", searchable?"true":"false"); + if(found != searchable) { + fprintf(stderr, "FAILED\n"); + errors++; + } + else + fprintf(stderr, "OK\n"); + + found = false; + void *t; + dfe_start_read(dict, t) { + if(t == item->shared->value) found = true; + } + dfe_done(t); + + fprintf(stderr, "ITEM %-20s: dfe browsable %5s, expected %5s...\t\t\t\t\t", label, + found?"true":"false", browsable?"true":"false"); + if(found != browsable) { + fprintf(stderr, "FAILED\n"); + errors++; + } + else + fprintf(stderr, "OK\n"); + + found = dictionary_walkthrough_read(dict, check_item_callback, item->shared->value); + fprintf(stderr, "ITEM %-20s: walkthrough browsable %5s, expected %5s...\t\t\t\t", label, + found?"true":"false", browsable?"true":"false"); + if(found != browsable) { + fprintf(stderr, "FAILED\n"); + errors++; + } + else + fprintf(stderr, "OK\n"); + + found = dictionary_sorted_walkthrough_read(dict, check_item_callback, item->shared->value); + fprintf(stderr, "ITEM %-20s: sorted walkthrough browsable %5s, expected %5s...\t\t\t", label, + found?"true":"false", browsable?"true":"false"); + if(found != browsable) { + fprintf(stderr, "FAILED\n"); + errors++; + } + else + fprintf(stderr, "OK\n"); + + found = false; + DICTIONARY_ITEM *n; + for(n = dict->items.list; n ;n = n->next) + if(n == item) found = true; + + fprintf(stderr, "ITEM %-20s: linked %5s, expected %5s...\t\t\t\t\t\t", label, + found?"true":"false", linked?"true":"false"); + if(found != linked) { + fprintf(stderr, "FAILED\n"); + errors++; + } + else + fprintf(stderr, "OK\n"); + + return errors; +} + +struct thread_unittest { + int join; + DICTIONARY *dict; + int dups; + + ND_THREAD *thread; + struct dictionary_stats stats; +}; + +static void *unittest_dict_thread(void *arg) { + struct thread_unittest *tu = arg; + for(; 1 ;) { + if(__atomic_load_n(&tu->join, __ATOMIC_RELAXED)) + break; + + DICT_ITEM_CONST DICTIONARY_ITEM *item = + dictionary_set_and_acquire_item_advanced(tu->dict, "dict thread checking 1234567890", + -1, NULL, 0, NULL); + tu->stats.ops.inserts++; + + dictionary_get(tu->dict, dictionary_acquired_item_name(item)); + tu->stats.ops.searches++; + + void *t1; + dfe_start_write(tu->dict, t1) { + + // this should delete the referenced item + dictionary_del(tu->dict, t1_dfe.name); + tu->stats.ops.deletes++; + + void *t2; + dfe_start_write(tu->dict, t2) { + // this should add another + dictionary_set(tu->dict, t2_dfe.name, NULL, 0); + tu->stats.ops.inserts++; + + dictionary_get(tu->dict, dictionary_acquired_item_name(item)); + tu->stats.ops.searches++; + + // and this should delete it again + dictionary_del(tu->dict, t2_dfe.name); + tu->stats.ops.deletes++; + } + dfe_done(t2); + tu->stats.ops.traversals++; + + // this should fail to add it + dictionary_set(tu->dict, t1_dfe.name, NULL, 0); + tu->stats.ops.inserts++; + + dictionary_del(tu->dict, t1_dfe.name); + tu->stats.ops.deletes++; + } + dfe_done(t1); + tu->stats.ops.traversals++; + + 
for(int i = 0; i < tu->dups ; i++) { + dictionary_acquired_item_dup(tu->dict, item); + dictionary_get(tu->dict, dictionary_acquired_item_name(item)); + tu->stats.ops.searches++; + } + + for(int i = 0; i < tu->dups ; i++) { + dictionary_acquired_item_release(tu->dict, item); + dictionary_del(tu->dict, dictionary_acquired_item_name(item)); + tu->stats.ops.deletes++; + } + + dictionary_acquired_item_release(tu->dict, item); + dictionary_del(tu->dict, "dict thread checking 1234567890"); + tu->stats.ops.deletes++; + + // test concurrent deletions and flushes + { + if(gettid_cached() % 2) { + char buf [256 + 1]; + + for (int i = 0; i < 1000; i++) { + snprintfz(buf, sizeof(buf), "del/flush test %d", i); + dictionary_set(tu->dict, buf, NULL, 0); + tu->stats.ops.inserts++; + } + + for (int i = 0; i < 1000; i++) { + snprintfz(buf, sizeof(buf), "del/flush test %d", i); + dictionary_del(tu->dict, buf); + tu->stats.ops.deletes++; + } + } + else { + for (int i = 0; i < 10; i++) { + dictionary_flush(tu->dict); + tu->stats.ops.flushes++; + } + } + } + } + + return arg; +} + +static int dictionary_unittest_threads() { + time_t seconds_to_run = 5; + int threads_to_create = 2; + + struct thread_unittest tu[threads_to_create]; + memset(tu, 0, sizeof(struct thread_unittest) * threads_to_create); + + fprintf( + stderr, + "\nChecking dictionary concurrency with %d threads for %lld seconds...\n", + threads_to_create, + (long long)seconds_to_run); + + // threads testing of dictionary + struct dictionary_stats stats = {}; + tu[0].join = 0; + tu[0].dups = 1; + tu[0].dict = dictionary_create_advanced(DICT_OPTION_DONT_OVERWRITE_VALUE, &stats, 0); + + for (int i = 0; i < threads_to_create; i++) { + if(i) + tu[i] = tu[0]; + + char buf[100 + 1]; + snprintf(buf, 100, "dict%d", i); + tu[i].thread = nd_thread_create( + buf, + NETDATA_THREAD_OPTION_DONT_LOG | NETDATA_THREAD_OPTION_JOINABLE, + unittest_dict_thread, + &tu[i]); + } + + sleep_usec(seconds_to_run * USEC_PER_SEC); + + for (int i = 0; i < threads_to_create; i++) { + __atomic_store_n(&tu[i].join, 1, __ATOMIC_RELAXED); + + nd_thread_join(tu[i].thread); + + if(i) { + tu[0].stats.ops.inserts += tu[i].stats.ops.inserts; + tu[0].stats.ops.deletes += tu[i].stats.ops.deletes; + tu[0].stats.ops.searches += tu[i].stats.ops.searches; + tu[0].stats.ops.flushes += tu[i].stats.ops.flushes; + tu[0].stats.ops.traversals += tu[i].stats.ops.traversals; + } + } + + fprintf(stderr, + "CALLS : inserts %zu" + ", deletes %zu" + ", searches %zu" + ", traversals %zu" + ", flushes %zu" + "\n", + tu[0].stats.ops.inserts, + tu[0].stats.ops.deletes, + tu[0].stats.ops.searches, + tu[0].stats.ops.traversals, + tu[0].stats.ops.flushes + ); + +#ifdef DICT_WITH_STATS + fprintf(stderr, + "ACTUAL: inserts %zu" + ", deletes %zu" + ", searches %zu" + ", traversals %zu" + ", resets %zu" + ", flushes %zu" + ", entries %d" + ", referenced_items %d" + ", pending deletions %d" + ", check spins %zu" + ", insert spins %zu" + ", delete spins %zu" + ", search ignores %zu" + "\n", + stats.ops.inserts, + stats.ops.deletes, + stats.ops.searches, + stats.ops.traversals, + stats.ops.resets, + stats.ops.flushes, + tu[0].dict->entries, + tu[0].dict->referenced_items, + tu[0].dict->pending_deletion_items, + stats.spin_locks.use_spins, + stats.spin_locks.insert_spins, + stats.spin_locks.delete_spins, + stats.spin_locks.search_spins + ); +#endif + + dictionary_destroy(tu[0].dict); + return 0; +} + +struct thread_view_unittest { + int join; + DICTIONARY *master; + DICTIONARY *view; + DICTIONARY_ITEM *item_master; + int 
dups; +}; + +static void *unittest_dict_master_thread(void *arg) { + struct thread_view_unittest *tv = arg; + + DICTIONARY_ITEM *item = NULL; + int loops = 0; + while(!__atomic_load_n(&tv->join, __ATOMIC_RELAXED)) { + + if(!item) + item = dictionary_set_and_acquire_item(tv->master, "ITEM1", "123", strlen("123")); + + if(__atomic_load_n(&tv->item_master, __ATOMIC_RELAXED) != NULL) { + dictionary_acquired_item_release(tv->master, item); + dictionary_del(tv->master, "ITEM1"); + item = NULL; + loops++; + continue; + } + + dictionary_acquired_item_dup(tv->master, item); // for the view thread + __atomic_store_n(&tv->item_master, item, __ATOMIC_RELAXED); + dictionary_del(tv->master, "ITEM1"); + + + for(int i = 0; i < tv->dups + loops ; i++) { + dictionary_acquired_item_dup(tv->master, item); + } + + for(int i = 0; i < tv->dups + loops ; i++) { + dictionary_acquired_item_release(tv->master, item); + } + + dictionary_acquired_item_release(tv->master, item); + + item = NULL; + loops = 0; + } + + return arg; +} + +static void *unittest_dict_view_thread(void *arg) { + struct thread_view_unittest *tv = arg; + + DICTIONARY_ITEM *m_item = NULL; + + while(!__atomic_load_n(&tv->join, __ATOMIC_RELAXED)) { + if(!(m_item = __atomic_load_n(&tv->item_master, __ATOMIC_RELAXED))) + continue; + + DICTIONARY_ITEM *v_item = dictionary_view_set_and_acquire_item(tv->view, "ITEM2", m_item); + dictionary_acquired_item_release(tv->master, m_item); + __atomic_store_n(&tv->item_master, NULL, __ATOMIC_RELAXED); + + for(int i = 0; i < tv->dups ; i++) { + dictionary_acquired_item_dup(tv->view, v_item); + } + + for(int i = 0; i < tv->dups ; i++) { + dictionary_acquired_item_release(tv->view, v_item); + } + + dictionary_del(tv->view, "ITEM2"); + + while(!__atomic_load_n(&tv->join, __ATOMIC_RELAXED) && !(m_item = __atomic_load_n(&tv->item_master, __ATOMIC_RELAXED))) { + dictionary_acquired_item_dup(tv->view, v_item); + dictionary_acquired_item_release(tv->view, v_item); + } + + dictionary_acquired_item_release(tv->view, v_item); + } + + return arg; +} + +static int dictionary_unittest_view_threads() { + + struct thread_view_unittest tv = { + .join = 0, + .master = NULL, + .view = NULL, + .item_master = NULL, + .dups = 1, + }; + + // threads testing of dictionary + struct dictionary_stats stats_master = {}; + struct dictionary_stats stats_view = {}; + tv.master = dictionary_create_advanced(DICT_OPTION_NAME_LINK_DONT_CLONE | DICT_OPTION_DONT_OVERWRITE_VALUE, &stats_master, 0); + tv.view = dictionary_create_view(tv.master); + tv.view->stats = &stats_view; + + time_t seconds_to_run = 5; + fprintf( + stderr, + "\nChecking dictionary concurrency with 1 master and 1 view threads for %lld seconds...\n", + (long long)seconds_to_run); + + ND_THREAD *master_thread, *view_thread; + tv.join = 0; + + master_thread = nd_thread_create( + "master", + NETDATA_THREAD_OPTION_DONT_LOG | NETDATA_THREAD_OPTION_JOINABLE, + unittest_dict_master_thread, + &tv); + + view_thread = nd_thread_create( + "view", + NETDATA_THREAD_OPTION_DONT_LOG | NETDATA_THREAD_OPTION_JOINABLE, + unittest_dict_view_thread, + &tv); + + sleep_usec(seconds_to_run * USEC_PER_SEC); + + __atomic_store_n(&tv.join, 1, __ATOMIC_RELAXED); + nd_thread_join(view_thread); + nd_thread_join(master_thread); + +#ifdef DICT_WITH_STATS + fprintf(stderr, + "MASTER: inserts %zu" + ", deletes %zu" + ", searches %zu" + ", resets %zu" + ", entries %d" + ", referenced_items %d" + ", pending deletions %d" + ", check spins %zu" + ", insert spins %zu" + ", delete spins %zu" + ", search ignores %zu" + 
"\n", + stats_master.ops.inserts, + stats_master.ops.deletes, + stats_master.ops.searches, + stats_master.ops.resets, + tv.master->entries, + tv.master->referenced_items, + tv.master->pending_deletion_items, + stats_master.spin_locks.use_spins, + stats_master.spin_locks.insert_spins, + stats_master.spin_locks.delete_spins, + stats_master.spin_locks.search_spins + ); + fprintf(stderr, + "VIEW : inserts %zu" + ", deletes %zu" + ", searches %zu" + ", resets %zu" + ", entries %d" + ", referenced_items %d" + ", pending deletions %d" + ", check spins %zu" + ", insert spins %zu" + ", delete spins %zu" + ", search ignores %zu" + "\n", + stats_view.ops.inserts, + stats_view.ops.deletes, + stats_view.ops.searches, + stats_view.ops.resets, + tv.view->entries, + tv.view->referenced_items, + tv.view->pending_deletion_items, + stats_view.spin_locks.use_spins, + stats_view.spin_locks.insert_spins, + stats_view.spin_locks.delete_spins, + stats_view.spin_locks.search_spins + ); +#endif + + dictionary_destroy(tv.master); + dictionary_destroy(tv.view); + + return 0; +} + +size_t dictionary_unittest_views(void) { + size_t errors = 0; + struct dictionary_stats stats = {}; + DICTIONARY *master = dictionary_create_advanced(DICT_OPTION_NONE, &stats, 0); + DICTIONARY *view = dictionary_create_view(master); + + fprintf(stderr, "\n\nChecking dictionary views...\n"); + + // Add an item to both master and view, then remove the view first and the master second + fprintf(stderr, "\nPASS 1: Adding 1 item to master:\n"); + DICTIONARY_ITEM *item1_on_master = dictionary_set_and_acquire_item(master, "KEY 1", "VALUE1", strlen("VALUE1") + 1); + errors += unittest_check_dictionary("master", master, 1, 1, 0, 1, 0); + errors += unittest_check_item("master", master, item1_on_master, "KEY 1", item1_on_master->shared->value, 1, ITEM_FLAG_NONE, true, true, true); + + fprintf(stderr, "\nPASS 1: Adding master item to view:\n"); + DICTIONARY_ITEM *item1_on_view = dictionary_view_set_and_acquire_item(view, "KEY 1 ON VIEW", item1_on_master); + errors += unittest_check_dictionary("view", view, 1, 1, 0, 1, 0); + errors += unittest_check_item("view", view, item1_on_view, "KEY 1 ON VIEW", item1_on_master->shared->value, 1, ITEM_FLAG_NONE, true, true, true); + + fprintf(stderr, "\nPASS 1: Deleting view item:\n"); + dictionary_del(view, "KEY 1 ON VIEW"); + errors += unittest_check_dictionary("master", master, 1, 1, 0, 1, 0); + errors += unittest_check_dictionary("view", view, 0, 0, 1, 1, 0); + errors += unittest_check_item("master", master, item1_on_master, "KEY 1", item1_on_master->shared->value, 1, ITEM_FLAG_NONE, true, true, true); + errors += unittest_check_item("view", view, item1_on_view, "KEY 1 ON VIEW", item1_on_master->shared->value, 1, ITEM_FLAG_DELETED, false, false, true); + + fprintf(stderr, "\nPASS 1: Releasing the deleted view item:\n"); + dictionary_acquired_item_release(view, item1_on_view); + errors += unittest_check_dictionary("master", master, 1, 1, 0, 1, 0); + errors += unittest_check_dictionary("view", view, 0, 0, 1, 0, 1); + errors += unittest_check_item("master", master, item1_on_master, "KEY 1", item1_on_master->shared->value, 1, ITEM_FLAG_NONE, true, true, true); + + fprintf(stderr, "\nPASS 1: Releasing the acquired master item:\n"); + dictionary_acquired_item_release(master, item1_on_master); + errors += unittest_check_dictionary("master", master, 1, 1, 0, 0, 0); + errors += unittest_check_dictionary("view", view, 0, 0, 1, 0, 1); + errors += unittest_check_item("master", master, item1_on_master, "KEY 1", 
item1_on_master->shared->value, 0, ITEM_FLAG_NONE, true, true, true); + + fprintf(stderr, "\nPASS 1: Deleting the released master item:\n"); + dictionary_del(master, "KEY 1"); + errors += unittest_check_dictionary("master", master, 0, 0, 0, 0, 0); + errors += unittest_check_dictionary("view", view, 0, 0, 1, 0, 1); + + // The other way now: + // Add an item to both master and view, then remove the master first and verify it is deleted on the view also + fprintf(stderr, "\nPASS 2: Adding 1 item to master:\n"); + item1_on_master = dictionary_set_and_acquire_item(master, "KEY 1", "VALUE1", strlen("VALUE1") + 1); + errors += unittest_check_dictionary("master", master, 1, 1, 0, 1, 0); + errors += unittest_check_item("master", master, item1_on_master, "KEY 1", item1_on_master->shared->value, 1, ITEM_FLAG_NONE, true, true, true); + + fprintf(stderr, "\nPASS 2: Adding master item to view:\n"); + item1_on_view = dictionary_view_set_and_acquire_item(view, "KEY 1 ON VIEW", item1_on_master); + errors += unittest_check_dictionary("view", view, 1, 1, 0, 1, 0); + errors += unittest_check_item("view", view, item1_on_view, "KEY 1 ON VIEW", item1_on_master->shared->value, 1, ITEM_FLAG_NONE, true, true, true); + + fprintf(stderr, "\nPASS 2: Deleting master item:\n"); + dictionary_del(master, "KEY 1"); + garbage_collect_pending_deletes(view); + errors += unittest_check_dictionary("master", master, 0, 0, 1, 1, 0); + errors += unittest_check_dictionary("view", view, 0, 0, 1, 1, 0); + errors += unittest_check_item("master", master, item1_on_master, "KEY 1", item1_on_master->shared->value, 1, ITEM_FLAG_DELETED, false, false, true); + errors += unittest_check_item("view", view, item1_on_view, "KEY 1 ON VIEW", item1_on_master->shared->value, 1, ITEM_FLAG_DELETED, false, false, true); + + fprintf(stderr, "\nPASS 2: Releasing the acquired master item:\n"); + dictionary_acquired_item_release(master, item1_on_master); + errors += unittest_check_dictionary("master", master, 0, 0, 1, 0, 1); + errors += unittest_check_dictionary("view", view, 0, 0, 1, 1, 0); + errors += unittest_check_item("view", view, item1_on_view, "KEY 1 ON VIEW", item1_on_master->shared->value, 1, ITEM_FLAG_DELETED, false, false, true); + + fprintf(stderr, "\nPASS 2: Releasing the deleted view item:\n"); + dictionary_acquired_item_release(view, item1_on_view); + errors += unittest_check_dictionary("master", master, 0, 0, 1, 0, 1); + errors += unittest_check_dictionary("view", view, 0, 0, 1, 0, 1); + + dictionary_destroy(master); + dictionary_destroy(view); + return errors; +} + +/* + * FIXME: a dictionary-related leak is reported when running the address + * sanitizer. Need to investigate if it's introduced by the unit-test itself, + * or the dictionary implementation. 
+*/ +int dictionary_unittest(size_t entries) { + if(entries < 10) entries = 10; + + DICTIONARY *dict; + size_t errors = 0; + + fprintf(stderr, "Generating %zu names and values...\n", entries); + char **names = dictionary_unittest_generate_names(entries); + char **values = dictionary_unittest_generate_values(entries); + + fprintf(stderr, "\nCreating dictionary single threaded, clone, %zu items\n", entries); + dict = dictionary_create(DICT_OPTION_SINGLE_THREADED); + dictionary_unittest_clone(dict, names, values, entries, &errors); + + fprintf(stderr, "\nCreating dictionary multi threaded, clone, %zu items\n", entries); + dict = dictionary_create(DICT_OPTION_NONE); + dictionary_unittest_clone(dict, names, values, entries, &errors); + + fprintf(stderr, "\nCreating dictionary single threaded, non-clone, add-in-front options, %zu items\n", entries); + dict = dictionary_create( + DICT_OPTION_SINGLE_THREADED | DICT_OPTION_NAME_LINK_DONT_CLONE | DICT_OPTION_VALUE_LINK_DONT_CLONE | + DICT_OPTION_ADD_IN_FRONT); + dictionary_unittest_nonclone(dict, names, values, entries, &errors); + + fprintf(stderr, "\nCreating dictionary multi threaded, non-clone, add-in-front options, %zu items\n", entries); + dict = dictionary_create( + DICT_OPTION_NAME_LINK_DONT_CLONE | DICT_OPTION_VALUE_LINK_DONT_CLONE | DICT_OPTION_ADD_IN_FRONT); + dictionary_unittest_nonclone(dict, names, values, entries, &errors); + + fprintf(stderr, "\nCreating dictionary single-threaded, non-clone, don't overwrite options, %zu items\n", entries); + dict = dictionary_create( + DICT_OPTION_SINGLE_THREADED | DICT_OPTION_NAME_LINK_DONT_CLONE | DICT_OPTION_VALUE_LINK_DONT_CLONE | + DICT_OPTION_DONT_OVERWRITE_VALUE); + dictionary_unittest_run_and_measure_time(dict, "adding entries", names, values, entries, &errors, dictionary_unittest_set_nonclone); + dictionary_unittest_run_and_measure_time(dict, "resetting non-overwrite entries", names, values, entries, &errors, dictionary_unittest_reset_dont_overwrite_nonclone); + dictionary_unittest_run_and_measure_time(dict, "traverse foreach read loop", names, values, entries, &errors, dictionary_unittest_foreach); + dictionary_unittest_run_and_measure_time(dict, "walkthrough read callback", names, values, entries, &errors, dictionary_unittest_walkthrough); + dictionary_unittest_run_and_measure_time(dict, "walkthrough read callback stop", names, values, entries, &errors, dictionary_unittest_walkthrough_stop); + dictionary_unittest_run_and_measure_time(dict, "destroying full dictionary", names, values, entries, &errors, dictionary_unittest_destroy); + + fprintf(stderr, "\nCreating dictionary multi-threaded, non-clone, don't overwrite options, %zu items\n", entries); + dict = dictionary_create( + DICT_OPTION_NAME_LINK_DONT_CLONE | DICT_OPTION_VALUE_LINK_DONT_CLONE | DICT_OPTION_DONT_OVERWRITE_VALUE); + dictionary_unittest_run_and_measure_time(dict, "adding entries", names, values, entries, &errors, dictionary_unittest_set_nonclone); + dictionary_unittest_run_and_measure_time(dict, "walkthrough write delete this", names, values, entries, &errors, dictionary_unittest_walkthrough_delete_this); + dictionary_unittest_run_and_measure_time(dict, "destroying empty dictionary", names, values, entries, &errors, dictionary_unittest_destroy); + + fprintf(stderr, "\nCreating dictionary multi-threaded, non-clone, don't overwrite options, %zu items\n", entries); + dict = dictionary_create( + DICT_OPTION_NAME_LINK_DONT_CLONE | DICT_OPTION_VALUE_LINK_DONT_CLONE | DICT_OPTION_DONT_OVERWRITE_VALUE); + 
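+    // with DICT_OPTION_DONT_OVERWRITE_VALUE, re-setting an existing name keeps the old value;
+    // a registered conflict callback (if any) decides whether the new value is merged into it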
dictionary_unittest_run_and_measure_time(dict, "adding entries", names, values, entries, &errors, dictionary_unittest_set_nonclone); + dictionary_unittest_run_and_measure_time(dict, "foreach write delete this", names, values, entries, &errors, dictionary_unittest_foreach_delete_this); + dictionary_unittest_run_and_measure_time(dict, "traverse foreach read loop empty", names, values, 0, &errors, dictionary_unittest_foreach); + dictionary_unittest_run_and_measure_time(dict, "walkthrough read callback empty", names, values, 0, &errors, dictionary_unittest_walkthrough); + dictionary_unittest_run_and_measure_time(dict, "destroying empty dictionary", names, values, entries, &errors, dictionary_unittest_destroy); + + fprintf(stderr, "\nCreating dictionary single threaded, clone, %zu items\n", entries); + dict = dictionary_create(DICT_OPTION_SINGLE_THREADED); + dictionary_unittest_sorting(dict, names, values, entries, &errors); + dictionary_unittest_run_and_measure_time(dict, "destroying full dictionary", names, values, entries, &errors, dictionary_unittest_destroy); + + fprintf(stderr, "\nCreating dictionary single threaded, clone, %zu items\n", entries); + dict = dictionary_create(DICT_OPTION_SINGLE_THREADED); + dictionary_unittest_null_dfe(dict, names, values, entries, &errors); + dictionary_unittest_run_and_measure_time(dict, "destroying full dictionary", names, values, entries, &errors, dictionary_unittest_destroy); + + fprintf(stderr, "\nCreating dictionary single threaded, noclone, %zu items\n", entries); + dict = dictionary_create(DICT_OPTION_SINGLE_THREADED | DICT_OPTION_VALUE_LINK_DONT_CLONE); + dictionary_unittest_null_dfe(dict, names, values, entries, &errors); + dictionary_unittest_run_and_measure_time(dict, "destroying full dictionary", names, values, entries, &errors, dictionary_unittest_destroy); + + // check reference counters + { + fprintf(stderr, "\nTesting reference counters:\n"); + dict = dictionary_create(DICT_OPTION_NONE | DICT_OPTION_NAME_LINK_DONT_CLONE); + errors += unittest_check_dictionary("", dict, 0, 0, 0, 0, 0); + + fprintf(stderr, "\nAdding test item to dictionary and acquiring it\n"); + dictionary_set(dict, "test", "ITEM1", 6); + DICTIONARY_ITEM *item = (DICTIONARY_ITEM *)dictionary_get_and_acquire_item(dict, "test"); + + errors += unittest_check_dictionary("", dict, 1, 1, 0, 1, 0); + errors += unittest_check_item("ACQUIRED", dict, item, "test", "ITEM1", 1, ITEM_FLAG_NONE, true, true, true); + + fprintf(stderr, "\nChecking that reference counters are increased:\n"); + void *t; + dfe_start_read(dict, t) { + errors += unittest_check_dictionary("", dict, 1, 1, 0, 1, 0); + errors += unittest_check_item("ACQUIRED TRAVERSAL", dict, item, "test", "ITEM1", 2, ITEM_FLAG_NONE, true, true, true); + } + dfe_done(t); + + fprintf(stderr, "\nChecking that reference counters are decreased:\n"); + errors += unittest_check_dictionary("", dict, 1, 1, 0, 1, 0); + errors += unittest_check_item("ACQUIRED TRAVERSAL 2", dict, item, "test", "ITEM1", 1, ITEM_FLAG_NONE, true, true, true); + + fprintf(stderr, "\nDeleting the item we have acquired:\n"); + dictionary_del(dict, "test"); + + errors += unittest_check_dictionary("", dict, 0, 0, 1, 1, 0); + errors += unittest_check_item("DELETED", dict, item, "test", "ITEM1", 1, ITEM_FLAG_DELETED, false, false, true); + + fprintf(stderr, "\nAdding another item with the same name of the item we deleted, while being acquired:\n"); + dictionary_set(dict, "test", "ITEM2", 6); + errors += unittest_check_dictionary("", dict, 1, 1, 1, 1, 0); + + 
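+        // two items named "test" now co-exist: the deleted ITEM1 survives because it is still
+        // acquired, while the new ITEM2 is the one indexed and visible to lookups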
fprintf(stderr, "\nAcquiring the second item:\n"); + DICTIONARY_ITEM *item2 = (DICTIONARY_ITEM *)dictionary_get_and_acquire_item(dict, "test"); + errors += unittest_check_item("FIRST", dict, item, "test", "ITEM1", 1, ITEM_FLAG_DELETED, false, false, true); + errors += unittest_check_item("SECOND", dict, item2, "test", "ITEM2", 1, ITEM_FLAG_NONE, true, true, true); + errors += unittest_check_dictionary("", dict, 1, 1, 1, 2, 0); + + fprintf(stderr, "\nReleasing the second item (the first is still acquired):\n"); + dictionary_acquired_item_release(dict, (DICTIONARY_ITEM *)item2); + errors += unittest_check_dictionary("", dict, 1, 1, 1, 1, 0); + errors += unittest_check_item("FIRST", dict, item, "test", "ITEM1", 1, ITEM_FLAG_DELETED, false, false, true); + errors += unittest_check_item("SECOND RELEASED", dict, item2, "test", "ITEM2", 0, ITEM_FLAG_NONE, true, true, true); + + fprintf(stderr, "\nDeleting the second item (the first is still acquired):\n"); + dictionary_del(dict, "test"); + errors += unittest_check_dictionary("", dict, 0, 0, 1, 1, 0); + errors += unittest_check_item("ACQUIRED DELETED", dict, item, "test", "ITEM1", 1, ITEM_FLAG_DELETED, false, false, true); + + fprintf(stderr, "\nReleasing the first item (which we have already deleted):\n"); + dictionary_acquired_item_release(dict, (DICTIONARY_ITEM *)item); + dfe_start_write(dict, item) ; dfe_done(item); + errors += unittest_check_dictionary("", dict, 0, 0, 1, 0, 1); + + fprintf(stderr, "\nAdding again the test item to dictionary and acquiring it\n"); + dictionary_set(dict, "test", "ITEM1", 6); + item = (DICTIONARY_ITEM *)dictionary_get_and_acquire_item(dict, "test"); + + errors += unittest_check_dictionary("", dict, 1, 1, 0, 1, 0); + errors += unittest_check_item("RE-ADDITION", dict, item, "test", "ITEM1", 1, ITEM_FLAG_NONE, true, true, true); + + fprintf(stderr, "\nDestroying the dictionary while we have acquired an item\n"); + dictionary_destroy(dict); + + fprintf(stderr, "Releasing the item (on a destroyed dictionary)\n"); + dictionary_acquired_item_release(dict, (DICTIONARY_ITEM *)item); + item = NULL; + dict = NULL; + } + + dictionary_unittest_free_char_pp(names, entries); + dictionary_unittest_free_char_pp(values, entries); + + errors += dictionary_unittest_views(); + errors += dictionary_unittest_threads(); + errors += dictionary_unittest_view_threads(); + + cleanup_destroyed_dictionaries(); + + fprintf(stderr, "\n%zu errors found\n", errors); + return errors ? 
1 : 0; +} diff --git a/src/libnetdata/dictionary/dictionary.c b/src/libnetdata/dictionary/dictionary.c new file mode 100644 index 00000000..9d50ed62 --- /dev/null +++ b/src/libnetdata/dictionary/dictionary.c @@ -0,0 +1,767 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "dictionary-internals.h" + +ARAL *dict_items_aral = NULL; +ARAL *dict_shared_items_aral = NULL; + +struct dictionary_stats dictionary_stats_category_other = { + .name = "other", +}; + +// ---------------------------------------------------------------------------- +// public locks API + +inline void dictionary_write_lock(DICTIONARY *dict) { + ll_recursive_lock(dict, DICTIONARY_LOCK_WRITE); +} + +inline void dictionary_write_unlock(DICTIONARY *dict) { + ll_recursive_unlock(dict, DICTIONARY_LOCK_WRITE); +} + +// ---------------------------------------------------------------------------- +// callbacks registration + +static inline void dictionary_hooks_allocate(DICTIONARY *dict) { + if(dict->hooks) return; + + dict->hooks = callocz(1, sizeof(struct dictionary_hooks)); + dict->hooks->links = 1; + + DICTIONARY_STATS_PLUS_MEMORY(dict, 0, sizeof(struct dictionary_hooks), 0); +} + +static inline size_t dictionary_hooks_free(DICTIONARY *dict) { + if(!dict->hooks) return 0; + + REFCOUNT links = __atomic_sub_fetch(&dict->hooks->links, 1, __ATOMIC_ACQUIRE); + if(links == 0) { + freez(dict->hooks); + dict->hooks = NULL; + + DICTIONARY_STATS_MINUS_MEMORY(dict, 0, sizeof(struct dictionary_hooks), 0); + return sizeof(struct dictionary_hooks); + } + + return 0; +} + +void dictionary_register_insert_callback(DICTIONARY *dict, dict_cb_insert_t insert_callback, void *data) { + if(unlikely(is_view_dictionary(dict))) + fatal("DICTIONARY: called %s() on a view.", __FUNCTION__ ); + + dictionary_hooks_allocate(dict); + dict->hooks->insert_callback = insert_callback; + dict->hooks->insert_callback_data = data; +} + +void dictionary_register_conflict_callback(DICTIONARY *dict, dict_cb_conflict_t conflict_callback, void *data) { + if(unlikely(is_view_dictionary(dict))) + fatal("DICTIONARY: called %s() on a view.", __FUNCTION__ ); + + internal_error(!(dict->options & DICT_OPTION_DONT_OVERWRITE_VALUE), "DICTIONARY: registering conflict callback without DICT_OPTION_DONT_OVERWRITE_VALUE"); + dict->options |= DICT_OPTION_DONT_OVERWRITE_VALUE; + + dictionary_hooks_allocate(dict); + dict->hooks->conflict_callback = conflict_callback; + dict->hooks->conflict_callback_data = data; +} + +void dictionary_register_react_callback(DICTIONARY *dict, dict_cb_react_t react_callback, void *data) { + if(unlikely(is_view_dictionary(dict))) + fatal("DICTIONARY: called %s() on a view.", __FUNCTION__ ); + + dictionary_hooks_allocate(dict); + dict->hooks->react_callback = react_callback; + dict->hooks->react_callback_data = data; +} + +void dictionary_register_delete_callback(DICTIONARY *dict, dict_cb_delete_t delete_callback, void *data) { + if(unlikely(is_view_dictionary(dict))) + fatal("DICTIONARY: called %s() on a view.", __FUNCTION__ ); + + dictionary_hooks_allocate(dict); + dict->hooks->delete_callback = delete_callback; + dict->hooks->delelte_callback_data = data; +} + +// ---------------------------------------------------------------------------- +// dictionary statistics API + +size_t dictionary_version(DICTIONARY *dict) { + if(unlikely(!dict)) return 0; + + // this is required for views to return the right number + // garbage_collect_pending_deletes(dict); + + return __atomic_load_n(&dict->version, __ATOMIC_RELAXED); +} +size_t 
dictionary_entries(DICTIONARY *dict) { + if(unlikely(!dict)) return 0; + + // this is required for views to return the right number + // garbage_collect_pending_deletes(dict); + + long int entries = __atomic_load_n(&dict->entries, __ATOMIC_RELAXED); + internal_fatal(entries < 0, "DICTIONARY: entries is negative: %ld", entries); + + return entries; +} +size_t dictionary_referenced_items(DICTIONARY *dict) { + if(unlikely(!dict)) return 0; + + long int referenced_items = __atomic_load_n(&dict->referenced_items, __ATOMIC_RELAXED); + if(referenced_items < 0) + fatal("DICTIONARY: referenced items is negative: %ld", referenced_items); + + return referenced_items; +} + +void dictionary_version_increment(DICTIONARY *dict) { + __atomic_fetch_add(&dict->version, 1, __ATOMIC_RELAXED); +} + +// ---------------------------------------------------------------------------- +// items garbage collector + +void garbage_collect_pending_deletes(DICTIONARY *dict) { + usec_t last_master_deletion_us = dict->hooks?__atomic_load_n(&dict->hooks->last_master_deletion_us, __ATOMIC_RELAXED):0; + usec_t last_gc_run_us = __atomic_load_n(&dict->last_gc_run_us, __ATOMIC_RELAXED); + + bool is_view = is_view_dictionary(dict); + + if(likely(!( + DICTIONARY_PENDING_DELETES_GET(dict) > 0 || + (is_view && last_master_deletion_us > last_gc_run_us) + ))) + return; + + ll_recursive_lock(dict, DICTIONARY_LOCK_WRITE); + + __atomic_store_n(&dict->last_gc_run_us, now_realtime_usec(), __ATOMIC_RELAXED); + + if(is_view) + dictionary_index_lock_wrlock(dict); + + DICTIONARY_STATS_GARBAGE_COLLECTIONS_PLUS1(dict); + + size_t deleted = 0, pending = 0, examined = 0; + DICTIONARY_ITEM *item = dict->items.list, *item_next; + while(item) { + examined++; + + // this will clean up + item_next = item->next; + int rc = item_check_and_acquire_advanced(dict, item, is_view); + + if(rc == RC_ITEM_MARKED_FOR_DELETION) { + // we didn't get a reference + + if(item_is_not_referenced_and_can_be_removed(dict, item)) { + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(dict->items.list, item, prev, next); + dict_item_free_with_hooks(dict, item); + deleted++; + + pending = DICTIONARY_PENDING_DELETES_MINUS1(dict); + if (!pending) + break; + } + } + else if(rc == RC_ITEM_IS_CURRENTLY_BEING_DELETED) + ; // do not touch this item (we didn't get a reference) + + else if(rc == RC_ITEM_OK) + item_release(dict, item); + + item = item_next; + } + + if(is_view) + dictionary_index_wrlock_unlock(dict); + + ll_recursive_unlock(dict, DICTIONARY_LOCK_WRITE); + + (void)deleted; + (void)examined; + + internal_error(false, "DICTIONARY: garbage collected dictionary created by %s (%zu@%s), " + "examined %zu items, deleted %zu items, still pending %zu items", + dict->creation_function, dict->creation_line, dict->creation_file, + examined, deleted, pending); +} + +void dictionary_garbage_collect(DICTIONARY *dict) { + if(!dict) return; + garbage_collect_pending_deletes(dict); +} + +// ---------------------------------------------------------------------------- + +void dictionary_static_items_aral_init(void) { + static SPINLOCK spinlock; + + if(unlikely(!dict_items_aral || !dict_shared_items_aral)) { + spinlock_lock(&spinlock); + + // we have to check again + if(!dict_items_aral) + dict_items_aral = aral_create( + "dict-items", + sizeof(DICTIONARY_ITEM), + 0, + 65536, + aral_by_size_statistics(), + NULL, NULL, false, false); + + // we have to check again + if(!dict_shared_items_aral) + dict_shared_items_aral = aral_create( + "dict-shared-items", + sizeof(DICTIONARY_ITEM_SHARED), + 0, + 65536, + 
aral_by_size_statistics(), + NULL, NULL, false, false); + + spinlock_unlock(&spinlock); + } +} + +// ---------------------------------------------------------------------------- +// delayed destruction of dictionaries + +static bool dictionary_free_all_resources(DICTIONARY *dict, size_t *mem, bool force) { + if(mem) + *mem = 0; + + if(!force && dictionary_referenced_items(dict)) + return false; + + size_t dict_size = 0, counted_items = 0, item_size = 0, index_size = 0; + (void)counted_items; + +#ifdef NETDATA_INTERNAL_CHECKS + long int entries = dict->entries; + long int referenced_items = dict->referenced_items; + long int pending_deletion_items = dict->pending_deletion_items; + const char *creation_function = dict->creation_function; + const char *creation_file = dict->creation_file; + size_t creation_line = dict->creation_line; +#endif + + // destroy the index + dictionary_index_lock_wrlock(dict); + index_size += hashtable_destroy_unsafe(dict); + dictionary_index_wrlock_unlock(dict); + + ll_recursive_lock(dict, DICTIONARY_LOCK_WRITE); + DICTIONARY_ITEM *item = dict->items.list; + while (item) { + // cache item->next + // because we are going to free item + DICTIONARY_ITEM *item_next = item->next; + + item_size += dict_item_free_with_hooks(dict, item); + item = item_next; + + // to speed up destruction, we don't unlink the item + // from the linked-list here + + counted_items++; + } + dict->items.list = NULL; + ll_recursive_unlock(dict, DICTIONARY_LOCK_WRITE); + + dict_size += dictionary_locks_destroy(dict); + dict_size += reference_counter_free(dict); + dict_size += dictionary_hooks_free(dict); + dict_size += sizeof(DICTIONARY); + DICTIONARY_STATS_MINUS_MEMORY(dict, 0, sizeof(DICTIONARY), 0); + + if(dict->value_aral) + aral_by_size_release(dict->value_aral); + + freez(dict); + + internal_error( + false, + "DICTIONARY: Freed dictionary created from %s() %zu@%s, having %ld (counted %zu) entries, %ld referenced, %ld pending deletion, total freed memory: %zu bytes (sizeof(dict) = %zu, sizeof(item) = %zu).", + creation_function, + creation_line, + creation_file, + entries, counted_items, referenced_items, pending_deletion_items, + dict_size + item_size, sizeof(DICTIONARY), sizeof(DICTIONARY_ITEM) + sizeof(DICTIONARY_ITEM_SHARED)); + + if(mem) + *mem = dict_size + item_size + index_size; + + return true; +} + +netdata_mutex_t dictionaries_waiting_to_be_destroyed_mutex = NETDATA_MUTEX_INITIALIZER; +static DICTIONARY *dictionaries_waiting_to_be_destroyed = NULL; + +static void dictionary_queue_for_destruction(DICTIONARY *dict) { + if(is_dictionary_destroyed(dict)) + return; + + DICTIONARY_STATS_DICT_DESTROY_QUEUED_PLUS1(dict); + dict_flag_set(dict, DICT_FLAG_DESTROYED); + + netdata_mutex_lock(&dictionaries_waiting_to_be_destroyed_mutex); + + dict->next = dictionaries_waiting_to_be_destroyed; + dictionaries_waiting_to_be_destroyed = dict; + + netdata_mutex_unlock(&dictionaries_waiting_to_be_destroyed_mutex); +} + +void cleanup_destroyed_dictionaries(void) { + if(!dictionaries_waiting_to_be_destroyed) + return; + + netdata_mutex_lock(&dictionaries_waiting_to_be_destroyed_mutex); + + DICTIONARY *dict, *last = NULL, *next = NULL; + for(dict = dictionaries_waiting_to_be_destroyed; dict ; dict = next) { + next = dict->next; + +#ifdef NETDATA_INTERNAL_CHECKS + size_t line = dict->creation_line; + const char *file = dict->creation_file; + const char *function = dict->creation_function; + pid_t pid = dict->creation_tid; +#endif + + DICTIONARY_STATS_DICT_DESTROY_QUEUED_MINUS1(dict); + 
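+        // force = false: the dictionary is actually freed only when no items are still referenced;
+        // otherwise it is re-queued below and examined again on the next cleanup run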
if(dictionary_free_all_resources(dict, NULL, false)) { + + internal_error( + true, + "DICTIONARY: freed dictionary with delayed destruction, created from %s() %zu@%s pid %d.", + function, line, file, pid); + + if(last) last->next = next; + else dictionaries_waiting_to_be_destroyed = next; + } + else { + + internal_error( + true, + "DICTIONARY: cannot free dictionary with delayed destruction, created from %s() %zu@%s pid %d.", + function, line, file, pid); + + DICTIONARY_STATS_DICT_DESTROY_QUEUED_PLUS1(dict); + last = dict; + } + } + + netdata_mutex_unlock(&dictionaries_waiting_to_be_destroyed_mutex); +} + +// ---------------------------------------------------------------------------- +// API internal checks + +#ifdef NETDATA_INTERNAL_CHECKS +#define api_internal_check(dict, item, allow_null_dict, allow_null_item) api_internal_check_with_trace(dict, item, __FUNCTION__, allow_null_dict, allow_null_item) +static inline void api_internal_check_with_trace(DICTIONARY *dict, DICTIONARY_ITEM *item, const char *function, bool allow_null_dict, bool allow_null_item) { + if(!allow_null_dict && !dict) { + internal_error( + item, + "DICTIONARY: attempted to %s() with a NULL dictionary, passing an item created from %s() %zu@%s.", + function, + item->dict->creation_function, + item->dict->creation_line, + item->dict->creation_file); + fatal("DICTIONARY: attempted to %s() but dict is NULL", function); + } + + if(!allow_null_item && !item) { + internal_error( + true, + "DICTIONARY: attempted to %s() without an item on a dictionary created from %s() %zu@%s.", + function, + dict?dict->creation_function:"unknown", + dict?dict->creation_line:0, + dict?dict->creation_file:"unknown"); + fatal("DICTIONARY: attempted to %s() but item is NULL", function); + } + + if(dict && item && dict != item->dict) { + internal_error( + true, + "DICTIONARY: attempted to %s() an item on a dictionary created from %s() %zu@%s, but the item belongs to the dictionary created from %s() %zu@%s.", + function, + dict->creation_function, + dict->creation_line, + dict->creation_file, + item->dict->creation_function, + item->dict->creation_line, + item->dict->creation_file + ); + fatal("DICTIONARY: %s(): item does not belong to this dictionary.", function); + } + + if(item) { + REFCOUNT refcount = DICTIONARY_ITEM_REFCOUNT_GET(dict, item); + if (unlikely(refcount <= 0)) { + internal_error( + true, + "DICTIONARY: attempted to %s() of an item with reference counter = %d on a dictionary created from %s() %zu@%s", + function, + refcount, + item->dict->creation_function, + item->dict->creation_line, + item->dict->creation_file); + fatal("DICTIONARY: attempted to %s but item is having refcount = %d", function, refcount); + } + } +} +#else +#define api_internal_check(dict, item, allow_null_dict, allow_null_item) debug_dummy() +#endif + +#define api_is_name_good(dict, name, name_len) api_is_name_good_with_trace(dict, name, name_len, __FUNCTION__) +static bool api_is_name_good_with_trace(DICTIONARY *dict __maybe_unused, const char *name, ssize_t name_len __maybe_unused, const char *function __maybe_unused) { + if(unlikely(!name)) { + internal_error( + true, + "DICTIONARY: attempted to %s() with name = NULL on a dictionary created from %s() %zu@%s.", + function, + dict?dict->creation_function:"unknown", + dict?dict->creation_line:0, + dict?dict->creation_file:"unknown"); + return false; + } + + if(unlikely(!*name)) { + internal_error( + true, + "DICTIONARY: attempted to %s() with empty name on a dictionary created from %s() %zu@%s.", + function, + 
dict?dict->creation_function:"unknown", + dict?dict->creation_line:0, + dict?dict->creation_file:"unknown"); + return false; + } + + internal_error( + name_len > 0 && name_len != (ssize_t)strlen(name), + "DICTIONARY: attempted to %s() with a name of '%s', having length of %zu, " + "but the supplied name_len = %ld, on a dictionary created from %s() %zu@%s.", + function, + name, + strlen(name), + (long int) name_len, + dict?dict->creation_function:"unknown", + dict?dict->creation_line:0, + dict?dict->creation_file:"unknown"); + + internal_error( + name_len <= 0 && name_len != -1, + "DICTIONARY: attempted to %s() with a name of '%s', having length of %zu, " + "but the supplied name_len = %ld, on a dictionary created from %s() %zu@%s.", + function, + name, + strlen(name), + (long int) name_len, + dict?dict->creation_function:"unknown", + dict?dict->creation_line:0, + dict?dict->creation_file:"unknown"); + + return true; +} + +// ---------------------------------------------------------------------------- +// API - dictionary management + +static DICTIONARY *dictionary_create_internal(DICT_OPTIONS options, struct dictionary_stats *stats, size_t fixed_size) { + cleanup_destroyed_dictionaries(); + + DICTIONARY *dict = callocz(1, sizeof(DICTIONARY)); + dict->options = options; + dict->stats = stats; + + if((dict->options & DICT_OPTION_FIXED_SIZE) && !fixed_size) { + dict->options &= ~DICT_OPTION_FIXED_SIZE; + internal_fatal(true, "DICTIONARY: requested fixed size dictionary, without setting the size"); + } + if(!(dict->options & DICT_OPTION_FIXED_SIZE) && fixed_size) { + dict->options |= DICT_OPTION_FIXED_SIZE; + internal_fatal(true, "DICTIONARY: set a fixed size for the items, without setting DICT_OPTION_FIXED_SIZE flag"); + } + + if(dict->options & DICT_OPTION_FIXED_SIZE) + dict->value_aral = aral_by_size_acquire(fixed_size); + else + dict->value_aral = NULL; + + if(!(dict->options & (DICT_OPTION_INDEX_JUDY|DICT_OPTION_INDEX_HASHTABLE))) + dict->options |= DICT_OPTION_INDEX_JUDY; + + size_t dict_size = 0; + dict_size += sizeof(DICTIONARY); + dict_size += dictionary_locks_init(dict); + dict_size += reference_counter_init(dict); + dict_size += hashtable_init_unsafe(dict); + + dictionary_static_items_aral_init(); + pointer_index_init(dict); + + DICTIONARY_STATS_PLUS_MEMORY(dict, 0, dict_size, 0); + + return dict; +} + +#ifdef NETDATA_INTERNAL_CHECKS +DICTIONARY *dictionary_create_advanced_with_trace(DICT_OPTIONS options, struct dictionary_stats *stats, size_t fixed_size, const char *function, size_t line, const char *file) { +#else +DICTIONARY *dictionary_create_advanced(DICT_OPTIONS options, struct dictionary_stats *stats, size_t fixed_size) { +#endif + + DICTIONARY *dict = dictionary_create_internal(options, stats?stats:&dictionary_stats_category_other, fixed_size); + +#ifdef NETDATA_INTERNAL_CHECKS + dict->creation_function = function; + dict->creation_file = file; + dict->creation_line = line; +#endif + + DICTIONARY_STATS_DICT_CREATIONS_PLUS1(dict); + return dict; +} + +#ifdef NETDATA_INTERNAL_CHECKS +DICTIONARY *dictionary_create_view_with_trace(DICTIONARY *master, const char *function, size_t line, const char *file) { +#else +DICTIONARY *dictionary_create_view(DICTIONARY *master) { +#endif + + DICTIONARY *dict = dictionary_create_internal(master->options, master->stats, + master->value_aral ? 
aral_element_size(master->value_aral) : 0); + + dict->master = master; + + dictionary_hooks_allocate(master); + + if(unlikely(__atomic_load_n(&master->hooks->links, __ATOMIC_RELAXED)) < 1) + fatal("DICTIONARY: attempted to create a view that has %d links", master->hooks->links); + + dict->hooks = master->hooks; + __atomic_add_fetch(&master->hooks->links, 1, __ATOMIC_ACQUIRE); + +#ifdef NETDATA_INTERNAL_CHECKS + dict->creation_function = function; + dict->creation_file = file; + dict->creation_line = line; + dict->creation_tid = gettid_cached(); +#endif + + DICTIONARY_STATS_DICT_CREATIONS_PLUS1(dict); + return dict; +} + +void dictionary_flush(DICTIONARY *dict) { + if(unlikely(!dict)) + return; + + ll_recursive_lock(dict, DICTIONARY_LOCK_WRITE); + + DICTIONARY_ITEM *item, *next = NULL; + for(item = dict->items.list; item ;item = next) { + next = item->next; + dict_item_del(dict, item_get_name(item), (ssize_t)item_get_name_len(item)); + } + + ll_recursive_unlock(dict, DICTIONARY_LOCK_WRITE); + + DICTIONARY_STATS_DICT_FLUSHES_PLUS1(dict); +} + +size_t dictionary_destroy(DICTIONARY *dict) { + cleanup_destroyed_dictionaries(); + + if(!dict) return 0; + + ll_recursive_lock(dict, DICTIONARY_LOCK_WRITE); + + dict_flag_set(dict, DICT_FLAG_DESTROYED); + DICTIONARY_STATS_DICT_DESTRUCTIONS_PLUS1(dict); + + size_t referenced_items = dictionary_referenced_items(dict); + if(referenced_items) { + dictionary_flush(dict); + dictionary_queue_for_destruction(dict); + + internal_error( + true, + "DICTIONARY: delaying destruction of dictionary created from %s() %zu@%s, because it has %d referenced items in it (%d total).", + dict->creation_function, + dict->creation_line, + dict->creation_file, + dict->referenced_items, + dict->entries); + + ll_recursive_unlock(dict, DICTIONARY_LOCK_WRITE); + return 0; + } + + ll_recursive_unlock(dict, DICTIONARY_LOCK_WRITE); + + size_t freed; + dictionary_free_all_resources(dict, &freed, true); + + return freed; +} + +// ---------------------------------------------------------------------------- +// SET an item to the dictionary + +DICT_ITEM_CONST DICTIONARY_ITEM *dictionary_set_and_acquire_item_advanced(DICTIONARY *dict, const char *name, ssize_t name_len, void *value, size_t value_len, void *constructor_data) { + if(unlikely(!api_is_name_good(dict, name, name_len))) + return NULL; + + api_internal_check(dict, NULL, false, true); + + if(unlikely(is_view_dictionary(dict))) + fatal("DICTIONARY: this dictionary is a view, you cannot add items other than the ones from the master dictionary."); + + DICTIONARY_ITEM *item = + dict_item_add_or_reset_value_and_acquire(dict, name, name_len, value, value_len, constructor_data, NULL); + api_internal_check(dict, item, false, false); + return item; +} + +void *dictionary_set_advanced(DICTIONARY *dict, const char *name, ssize_t name_len, void *value, size_t value_len, void *constructor_data) { + DICTIONARY_ITEM *item = dictionary_set_and_acquire_item_advanced(dict, name, name_len, value, value_len, constructor_data); + + if(likely(item)) { + void *v = item->shared->value; + item_release(dict, item); + return v; + } + + return NULL; +} + +DICT_ITEM_CONST DICTIONARY_ITEM *dictionary_view_set_and_acquire_item_advanced(DICTIONARY *dict, const char *name, ssize_t name_len, DICTIONARY_ITEM *master_item) { + if(unlikely(!api_is_name_good(dict, name, name_len))) + return NULL; + + api_internal_check(dict, NULL, false, true); + + if(unlikely(is_master_dictionary(dict))) + fatal("DICTIONARY: this dictionary is a master, you cannot add items from 
other dictionaries."); + + garbage_collect_pending_deletes(dict); + + dictionary_acquired_item_dup(dict->master, master_item); + DICTIONARY_ITEM *item = dict_item_add_or_reset_value_and_acquire(dict, name, name_len, NULL, 0, NULL, master_item); + dictionary_acquired_item_release(dict->master, master_item); + + api_internal_check(dict, item, false, false); + return item; +} + +void *dictionary_view_set_advanced(DICTIONARY *dict, const char *name, ssize_t name_len, DICTIONARY_ITEM *master_item) { + DICTIONARY_ITEM *item = dictionary_view_set_and_acquire_item_advanced(dict, name, name_len, master_item); + + if(likely(item)) { + void *v = item->shared->value; + item_release(dict, item); + return v; + } + + return NULL; +} + +// ---------------------------------------------------------------------------- +// GET an item from the dictionary + +DICT_ITEM_CONST DICTIONARY_ITEM *dictionary_get_and_acquire_item_advanced(DICTIONARY *dict, const char *name, ssize_t name_len) { + if(unlikely(!api_is_name_good(dict, name, name_len))) + return NULL; + + api_internal_check(dict, NULL, false, true); + DICTIONARY_ITEM *item = dict_item_find_and_acquire(dict, name, name_len); + api_internal_check(dict, item, false, true); + return item; +} + +void *dictionary_get_advanced(DICTIONARY *dict, const char *name, ssize_t name_len) { + DICTIONARY_ITEM *item = dictionary_get_and_acquire_item_advanced(dict, name, name_len); + + if(likely(item)) { + void *v = item->shared->value; + item_release(dict, item); + return v; + } + + return NULL; +} + +// ---------------------------------------------------------------------------- +// DUP/REL an item (increase/decrease its reference counter) + +DICT_ITEM_CONST DICTIONARY_ITEM *dictionary_acquired_item_dup(DICTIONARY *dict, DICT_ITEM_CONST DICTIONARY_ITEM *item) { + // we allow the item to be NULL here + api_internal_check(dict, item, false, true); + + if(likely(item)) { + item_acquire(dict, item); + api_internal_check(dict, item, false, false); + } + + return item; +} + +void dictionary_acquired_item_release(DICTIONARY *dict, DICT_ITEM_CONST DICTIONARY_ITEM *item) { + // we allow the item to be NULL here + api_internal_check(dict, item, false, true); + + // no need to get a lock here + // we pass the last parameter to reference_counter_release() as true + // so that the release may get a write-lock if required to clean up + + if(likely(item)) + item_release(dict, item); +} + +// ---------------------------------------------------------------------------- +// get the name/value of an item + +const char *dictionary_acquired_item_name(DICT_ITEM_CONST DICTIONARY_ITEM *item) { + return item_get_name(item); +} + +void *dictionary_acquired_item_value(DICT_ITEM_CONST DICTIONARY_ITEM *item) { + if(likely(item)) + return item->shared->value; + + return NULL; +} + +size_t dictionary_acquired_item_references(DICT_ITEM_CONST DICTIONARY_ITEM *item) { + if(likely(item)) + return DICTIONARY_ITEM_REFCOUNT_GET_SOLE(item); + + return 0; +} + +// ---------------------------------------------------------------------------- +// DEL an item + +bool dictionary_del_advanced(DICTIONARY *dict, const char *name, ssize_t name_len) { + if(unlikely(!api_is_name_good(dict, name, name_len))) + return false; + + api_internal_check(dict, NULL, false, true); + + if(unlikely(is_dictionary_destroyed(dict))) { + internal_error(true, "DICTIONARY: attempted to delete item on a destroyed dictionary"); + return false; + } + + return dict_item_del(dict, name, name_len); +} diff --git 
a/src/libnetdata/dictionary/dictionary.h b/src/libnetdata/dictionary/dictionary.h new file mode 100644 index 00000000..231fbfeb --- /dev/null +++ b/src/libnetdata/dictionary/dictionary.h @@ -0,0 +1,333 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_DICTIONARY_H +#define NETDATA_DICTIONARY_H 1 + +#include "../libnetdata.h" + + +/* + * Netdata DICTIONARY features: + * + * CLONE or LINK + * Names and Values in the dictionary can be cloned or linked. + * In clone mode, the dictionary does all the memory management. + * The default is clone for both names and values. + * Set DICT_OPTION_NAME_LINK_DONT_CLONE to link names. + * Set DICT_OPTION_VALUE_LINK_DONT_CLONE to link values. + * + * ORDERED + * Items are ordered in the order they are added (new items are appended at the end). + * You may reverse the order by setting the flag DICT_OPTION_ADD_IN_FRONT. + * + * LOOKUP + * The dictionary uses JudyHS to maintain a very fast randomly accessible hash table. + * + * MULTI-THREADED and SINGLE-THREADED + * Each dictionary may be single threaded (no locks), or multi-threaded (multiple readers or one writer). + * The default is multi-threaded. Add the flag DICT_OPTION_SINGLE_THREADED for single-threaded. + * + * WALK-THROUGH and FOREACH traversal + * The dictionary can be traversed in read or write mode, either with a callback (walkthrough) or with + * a loop (foreach). + * + * In write mode traversal, the caller may delete only the current item, but may add as many items as needed. + * + */ + +#ifdef NETDATA_INTERNAL_CHECKS +#define DICT_WITH_STATS 1 +#endif + +#ifdef DICTIONARY_INTERNALS +#define DICTFE_CONST +#define DICT_ITEM_CONST +#else +#define DICTFE_CONST const +#define DICT_ITEM_CONST const +#endif + +typedef struct dictionary DICTIONARY; +typedef struct dictionary_item DICTIONARY_ITEM; + +typedef enum __attribute__((packed)) dictionary_options { + DICT_OPTION_NONE = 0, // the default is the opposite of all below + DICT_OPTION_SINGLE_THREADED = (1 << 0), // don't use any locks (default: use locks) + DICT_OPTION_VALUE_LINK_DONT_CLONE = (1 << 1), // don't copy the value, just point to the one provided (default: copy) + DICT_OPTION_NAME_LINK_DONT_CLONE = (1 << 2), // don't copy the name, just point to the one provided (default: copy) + DICT_OPTION_DONT_OVERWRITE_VALUE = (1 << 3), // don't overwrite values of dictionary items (default: overwrite) + DICT_OPTION_ADD_IN_FRONT = (1 << 4), // add dictionary items at the front of the linked list (default: at the end) + DICT_OPTION_FIXED_SIZE = (1 << 5), // the items of the dictionary have a fixed size + DICT_OPTION_INDEX_JUDY = (1 << 6), // the default, if no other indexing is set + DICT_OPTION_INDEX_HASHTABLE = (1 << 7), // use SIMPLE_HASHTABLE for indexing +} DICT_OPTIONS; + +struct dictionary_stats { + const char *name; // the name of the category + + struct { + size_t active; // the number of active dictionaries + size_t deleted; // the number of dictionaries queued for destruction + } dictionaries; + + struct { + long entries; // active items in the dictionary + long pending_deletion; // pending deletion items in the dictionary + long referenced; // referenced items in the dictionary + } items; + + struct { + size_t creations; // dictionary creations + size_t destructions; // dictionary destructions + size_t flushes; // dictionary flushes + size_t traversals; // dictionary foreach + size_t walkthroughs; // dictionary walkthrough + size_t garbage_collections; // dictionary garbage collections + size_t searches; // item searches 
+ size_t inserts; // item inserts + size_t resets; // item resets + size_t deletes; // item deletes + } ops; + + struct { + size_t inserts; // number of times the insert callback is called + size_t conflicts; // number of times the conflict callback is called + size_t reacts; // number of times the react callback is called + size_t deletes; // number of times the delete callback is called + } callbacks; + + // memory + struct { + long index; // bytes of keys indexed (indication of the index size) + long values; // bytes of caller structures + long dict; // bytes of the structures dictionary needs + } memory; + + // spin locks + struct { + size_t use_spins; // number of times a reference to item had to spin to acquire it or ignore it + size_t search_spins; // number of times a successful search result had to be thrown away + size_t insert_spins; // number of times an insertion to the hash table had to be repeated + size_t delete_spins; // number of times a deletion had to spin to get a decision + } spin_locks; +}; + +// Create a dictionary +#ifdef NETDATA_INTERNAL_CHECKS +#define dictionary_create(options) dictionary_create_advanced_with_trace(options, NULL, 0, __FUNCTION__, __LINE__, __FILE__) +#define dictionary_create_advanced(options, stats, fixed_size) dictionary_create_advanced_with_trace(options, stats, fixed_size, __FUNCTION__, __LINE__, __FILE__) +DICTIONARY *dictionary_create_advanced_with_trace(DICT_OPTIONS options, struct dictionary_stats *stats, size_t fixed_size, const char *function, size_t line, const char *file); +#else +#define dictionary_create(options) dictionary_create_advanced(options, NULL, 0) +DICTIONARY *dictionary_create_advanced(DICT_OPTIONS options, struct dictionary_stats *stats, size_t fixed_size); +#endif + +// Create a view on a dictionary +#ifdef NETDATA_INTERNAL_CHECKS +#define dictionary_create_view(master) dictionary_create_view_with_trace(master, __FUNCTION__, __LINE__, __FILE__) +DICTIONARY *dictionary_create_view_with_trace(DICTIONARY *master, const char *function, size_t line, const char *file); +#else +DICTIONARY *dictionary_create_view(DICTIONARY *master); +#endif + +// an insert callback to be called just after an item is added to the dictionary +// this callback is called while the dictionary is write locked! +typedef void (*dict_cb_insert_t)(const DICTIONARY_ITEM *item, void *value, void *data); +void dictionary_register_insert_callback(DICTIONARY *dict, dict_cb_insert_t insert_callback, void *data); + +// a delete callback to be called just before an item is deleted forever +// this callback is called while the dictionary is write locked! +typedef void (*dict_cb_delete_t)(const DICTIONARY_ITEM *item, void *value, void *data); +void dictionary_register_delete_callback(DICTIONARY *dict, dict_cb_delete_t delete_callback, void *data); + +// a merge callback to be called when DICT_OPTION_DONT_OVERWRITE_VALUE +// and an item is already found in the dictionary - the dictionary does nothing else in this case +// the old_value will remain in the dictionary - the new_value is ignored +// The callback should return true if the value has been updated (it increases the dictionary version). 
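+// Use like this (a minimal sketch - MY_VALUE and its "version" field are illustrative, not part of the API):
+//
+//     static bool my_conflict_cb(const DICTIONARY_ITEM *item, void *old_value, void *new_value, void *data) {
+//         (void)item; (void)data;
+//         MY_VALUE *ov = old_value, *nv = new_value;
+//         if(nv->version <= ov->version)
+//             return false;              // keep the old value untouched, version is not increased
+//
+//         ov->version = nv->version;     // merge the useful fields into the value that stays
+//         return true;                   // the old value was updated, increase the dictionary version
+//     }
+//
+//     dictionary_register_conflict_callback(dict, my_conflict_cb, NULL);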
+typedef bool (*dict_cb_conflict_t)(const DICTIONARY_ITEM *item, void *old_value, void *new_value, void *data); +void dictionary_register_conflict_callback(DICTIONARY *dict, dict_cb_conflict_t conflict_callback, void *data); + +// a reaction callback to be called after every item insertion or conflict +// after the constructors have finished and the items are fully available for use +// and the dictionary is not write locked anymore +typedef void (*dict_cb_react_t)(const DICTIONARY_ITEM *item, void *value, void *data); +void dictionary_register_react_callback(DICTIONARY *dict, dict_cb_react_t react_callback, void *data); + +// Destroy a dictionary +// Returns the number of bytes freed +// The returned value will not include name/key sizes +// Registered delete callbacks will be run for each item in the dictionary. +size_t dictionary_destroy(DICTIONARY *dict); + +// Empties a dictionary +// Referenced items will survive, but are not offered anymore. +// Registered delete callbacks will be run for each item in the dictionary. +void dictionary_flush(DICTIONARY *dict); + +void dictionary_version_increment(DICTIONARY *dict); + +void dictionary_garbage_collect(DICTIONARY *dict); + +void cleanup_destroyed_dictionaries(void); + +// ---------------------------------------------------------------------------- +// Set an item in the dictionary +// +// - if an item with the same name does not exist, create one +// - if an item with the same name exists, then: +// a) if DICT_OPTION_DONT_OVERWRITE_VALUE is set, just return the existing value (ignore the new value) +// else b) reset the value to the new value passed at the call +// +// When DICT_OPTION_VALUE_LINK_DONT_CLONE is set, the value is linked, otherwise it is copied +// When DICT_OPTION_NAME_LINK_DONT_CLONE is set, the name is linked, otherwise it is copied +// +// When neither DICT_OPTION_VALUE_LINK_DONT_CLONE nor DICT_OPTION_NAME_LINK_DONT_CLONE are set, all the +// memory management for names and values is done by the dictionary. +// +// Passing NULL as value, the dictionary will callocz() the newly allocated value, otherwise it will copy it. +// Passing 0 as value_len, the dictionary will set the value to NULL (no allocations for value will be made). 
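+// Use like this (a minimal sketch - the "users" dictionary and struct user are illustrative):
+//
+//     DICTIONARY *users = dictionary_create(DICT_OPTION_NONE);
+//     struct user u = { .id = 1 };
+//     struct user *stored = dictionary_set(users, "john", &u, sizeof(u));
+//     // in the default clone mode both the name and the value are copied,
+//     // so "stored" points to the dictionary's private copy, not to "u"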
+#define dictionary_set(dict, name, value, value_len) dictionary_set_advanced(dict, name, -1, value, value_len, NULL) +void *dictionary_set_advanced(DICTIONARY *dict, const char *name, ssize_t name_len, void *value, size_t value_len, void *constructor_data); + +#define dictionary_set_and_acquire_item(dict, name, value, value_len) dictionary_set_and_acquire_item_advanced(dict, name, -1, value, value_len, NULL) +DICT_ITEM_CONST DICTIONARY_ITEM *dictionary_set_and_acquire_item_advanced(DICTIONARY *dict, const char *name, ssize_t name_len, void *value, size_t value_len, void *constructor_data); + +// set an item in a dictionary view +#define dictionary_view_set_and_acquire_item(dict, name, master_item) dictionary_view_set_and_acquire_item_advanced(dict, name, -1, master_item) +DICT_ITEM_CONST DICTIONARY_ITEM *dictionary_view_set_and_acquire_item_advanced(DICTIONARY *dict, const char *name, ssize_t name_len, DICTIONARY_ITEM *master_item); +#define dictionary_view_set(dict, name, master_item) dictionary_view_set_advanced(dict, name, -1, master_item) +void *dictionary_view_set_advanced(DICTIONARY *dict, const char *name, ssize_t name_len, DICT_ITEM_CONST DICTIONARY_ITEM *master_item); + +// ---------------------------------------------------------------------------- +// Get an item from the dictionary +// If it returns NULL, the item is not found + +#define dictionary_get(dict, name) dictionary_get_advanced(dict, name, -1) +void *dictionary_get_advanced(DICTIONARY *dict, const char *name, ssize_t name_len); + +#define dictionary_get_and_acquire_item(dict, name) dictionary_get_and_acquire_item_advanced(dict, name, -1) +DICT_ITEM_CONST DICTIONARY_ITEM *dictionary_get_and_acquire_item_advanced(DICTIONARY *dict, const char *name, ssize_t name_len); + + +// ---------------------------------------------------------------------------- +// Delete an item from the dictionary +// returns true if the item was found and has been deleted +// returns false if the item was not found in the index + +#define dictionary_del(dict, name) dictionary_del_advanced(dict, name, -1) +bool dictionary_del_advanced(DICTIONARY *dict, const char *name, ssize_t name_len); + +// ---------------------------------------------------------------------------- +// reference counters management + +void dictionary_acquired_item_release(DICTIONARY *dict, DICT_ITEM_CONST DICTIONARY_ITEM *item); + +DICT_ITEM_CONST DICTIONARY_ITEM *dictionary_acquired_item_dup(DICTIONARY *dict, DICT_ITEM_CONST DICTIONARY_ITEM *item); + +const char *dictionary_acquired_item_name(DICT_ITEM_CONST DICTIONARY_ITEM *item); +void *dictionary_acquired_item_value(DICT_ITEM_CONST DICTIONARY_ITEM *item); + +size_t dictionary_acquired_item_references(DICT_ITEM_CONST DICTIONARY_ITEM *item); + +// ---------------------------------------------------------------------------- +// Traverse (walk through) the items of the dictionary. +// The order of traversal is currently the order of insertion. +// +// The callback function may return a negative number to stop the traversal, +// in which case that negative value is returned to the caller. +// +// If all callback calls return zero or positive numbers, the sum of all of +// them is returned to the caller. +// +// You cannot alter the dictionary from inside a dictionary_walkthrough_read() - deadlock! +// You can only delete the current item from inside a dictionary_walkthrough_write() - you can add as many as you want. 
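+// Use like this (a minimal sketch - sum_cb and the int values are illustrative):
+//
+//     static int sum_cb(const DICTIONARY_ITEM *item, void *value, void *data) {
+//         (void)item; (void)data;
+//         return *(int *)value;          // zero/positive returns are summed for the caller
+//     }
+//
+//     int total = dictionary_walkthrough_read(dict, sum_cb, NULL);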
+// +typedef int (*dict_walkthrough_callback_t)(const DICTIONARY_ITEM *item, void *value, void *data); + +#define dictionary_walkthrough_read(dict, callback, data) dictionary_walkthrough_rw(dict, 'r', callback, data) +#define dictionary_walkthrough_write(dict, callback, data) dictionary_walkthrough_rw(dict, 'w', callback, data) +int dictionary_walkthrough_rw(DICTIONARY *dict, char rw, dict_walkthrough_callback_t walkthrough_callback, void *data); + +typedef int (*dict_item_comparator_t)(const DICTIONARY_ITEM **item1, const DICTIONARY_ITEM **item2); + +#define dictionary_sorted_walkthrough_read(dict, callback, data) dictionary_sorted_walkthrough_rw(dict, 'r', callback, data, NULL) +#define dictionary_sorted_walkthrough_write(dict, callback, data) dictionary_sorted_walkthrough_rw(dict, 'w', callback, data, NULL) +int dictionary_sorted_walkthrough_rw(DICTIONARY *dict, char rw, dict_walkthrough_callback_t walkthrough_callback, void *data, dict_item_comparator_t item_comparator_callback); + +// ---------------------------------------------------------------------------- +// Traverse with foreach +// +// Use like this: +// +// MY_ITEM *item; +// dfe_start_read(dict, item) { +// // do things with the item and its name, item_dfe.name +// } +// dfe_done(item); +// +// You cannot alter the dictionary from within a dfe_start_read() - deadlock! +// You can only delete the current item from inside a dfe_start_write() - you can add as many as you want. +// + +#define DICTIONARY_LOCK_READ 'r' +#define DICTIONARY_LOCK_WRITE 'w' +#define DICTIONARY_LOCK_REENTRANT 'z' + +void dictionary_write_lock(DICTIONARY *dict); +void dictionary_write_unlock(DICTIONARY *dict); + +typedef DICTFE_CONST struct dictionary_foreach { + DICTIONARY *dict; // the dictionary we work on + + DICTIONARY_ITEM *item; // the item we work on, to remember the position we are at + // this can be used with dictionary_acquired_item_dup() to + // acquire the currently working item. 
+ + const char *name; // the dictionary name of the last item used + void *value; // the dictionary value of the last item used + // same as the return value of dictfe_start() and dictfe_next() + + size_t counter; // counts the number of iterations made, starting from zero + + char rw; // the lock mode 'r' or 'w' + bool locked; // true when the dictionary is locked +} DICTFE; + +#define dfe_start_read(dict, value) dfe_start_rw(dict, value, DICTIONARY_LOCK_READ) +#define dfe_start_write(dict, value) dfe_start_rw(dict, value, DICTIONARY_LOCK_WRITE) +#define dfe_start_reentrant(dict, value) dfe_start_rw(dict, value, DICTIONARY_LOCK_REENTRANT) + +#define dfe_start_rw(dict, value, mode) \ + do { \ + DICTFE value ## _dfe = {}; \ + (void)(value); /* needed to avoid warning when looping without using this */ \ + for((value) = dictionary_foreach_start_rw(&value ## _dfe, (dict), (mode)); \ + (value ## _dfe.item) || (value) ; \ + (value) = dictionary_foreach_next(&value ## _dfe)) \ + { + +#define dfe_done(value) \ + } \ + dictionary_foreach_done(&value ## _dfe); \ + } while(0) + +#define dfe_unlock(value) dictionary_foreach_unlock(&value ## _dfe) + +void *dictionary_foreach_start_rw(DICTFE *dfe, DICTIONARY *dict, char rw); +void *dictionary_foreach_next(DICTFE *dfe); +void dictionary_foreach_done(DICTFE *dfe); +void dictionary_foreach_unlock(DICTFE *dfe); + +// ---------------------------------------------------------------------------- +// Get statistics about the dictionary + +size_t dictionary_version(DICTIONARY *dict); +size_t dictionary_entries(DICTIONARY *dict); +size_t dictionary_referenced_items(DICTIONARY *dict); + +// for all cases that the caller does not provide a stats structure, this is where they are accumulated. +extern struct dictionary_stats dictionary_stats_category_other; + +int dictionary_unittest(size_t entries); + +#endif /* NETDATA_DICTIONARY_H */ diff --git a/src/libnetdata/dictionary/thread-cache.c b/src/libnetdata/dictionary/thread-cache.c new file mode 100644 index 00000000..9dc3de81 --- /dev/null +++ b/src/libnetdata/dictionary/thread-cache.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "thread-cache.h" + +static __thread Pvoid_t thread_cache_judy_array = NULL; + +void *thread_cache_entry_get_or_set(void *key, + ssize_t key_length, + void *value, + void *(*transform_the_value_before_insert)(void *key, size_t key_length, void *value) +) { + if(unlikely(!key || !key_length)) return NULL; + + if(key_length == -1) + key_length = (ssize_t)strlen((char *)key); + + JError_t J_Error; + Pvoid_t *Rc = JudyHSIns(&thread_cache_judy_array, key, key_length, &J_Error); + if (unlikely(Rc == PJERR)) { + fatal("THREAD_CACHE: Cannot insert entry to JudyHS, JU_ERRNO_* == %u, ID == %d", + JU_ERRNO(&J_Error), JU_ERRID(&J_Error)); + } + + if(*Rc == 0) { + // new item added + + *Rc = (transform_the_value_before_insert) ? 
transform_the_value_before_insert(key, key_length, value) : value; + } + + return *Rc; +} + +void thread_cache_destroy(void) { + if(unlikely(!thread_cache_judy_array)) return; + + JError_t J_Error; + Word_t ret = JudyHSFreeArray(&thread_cache_judy_array, &J_Error); + if(unlikely(ret == (Word_t) JERR)) { + netdata_log_error("THREAD_CACHE: Cannot destroy JudyHS, JU_ERRNO_* == %u, ID == %d", + JU_ERRNO(&J_Error), JU_ERRID(&J_Error)); + } + + internal_error(true, "THREAD_CACHE: hash table freed %lu bytes", ret); + + thread_cache_judy_array = NULL; +} + diff --git a/src/libnetdata/dictionary/thread-cache.h b/src/libnetdata/dictionary/thread-cache.h new file mode 100644 index 00000000..4495ad7d --- /dev/null +++ b/src/libnetdata/dictionary/thread-cache.h @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_THREAD_CACHE_H +#define NETDATA_THREAD_CACHE_H + +#include "../libnetdata.h" + +void *thread_cache_entry_get_or_set(void *key, + ssize_t key_length, + void *value, + void *(*transform_the_value_before_insert)(void *key, size_t key_length, void *value)); + +void thread_cache_destroy(void); + +#endif //NETDATA_THREAD_CACHE_H diff --git a/src/libnetdata/ebpf/README.md b/src/libnetdata/ebpf/README.md new file mode 100644 index 00000000..8d9edb07 --- /dev/null +++ b/src/libnetdata/ebpf/README.md @@ -0,0 +1,13 @@ +<!-- +title: "eBPF" +custom_edit_url: https://github.com/netdata/netdata/edit/master/src/libnetdata/ebpf/README.md +sidebar_label: "eBPF" +learn_status: "Published" +learn_topic_type: "Tasks" +learn_rel_path: "Developers/libnetdata" +--> + +# eBPF library + +Netdata's eBPF library supports the [eBPF collector](/src/collectors/ebpf.plugin/README.md). + diff --git a/src/libnetdata/ebpf/ebpf.c b/src/libnetdata/ebpf/ebpf.c new file mode 100644 index 00000000..4e7c8594 --- /dev/null +++ b/src/libnetdata/ebpf/ebpf.c @@ -0,0 +1,1687 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <dlfcn.h> +#include <sys/utsname.h> + +#include "ebpf.h" +#include "../libnetdata.h" + +char *ebpf_user_config_dir = CONFIG_DIR; +char *ebpf_stock_config_dir = LIBCONFIG_DIR; + +/* +static int clean_kprobe_event(FILE *out, char *filename, char *father_pid, netdata_ebpf_events_t *ptr) +{ + int fd = open(filename, O_WRONLY | O_APPEND, 0); + if (fd < 0) { + if (out) { + fprintf(out, "Cannot open %s : %s\n", filename, strerror(errno)); + } + return 1; + } + + char cmd[1024]; + int length = snprintf(cmd, 1023, "-:kprobes/%c_netdata_%s_%s", ptr->type, ptr->name, father_pid); + int ret = 0; + if (length > 0) { + ssize_t written = write(fd, cmd, strlen(cmd)); + if (written < 0) { + if (out) { + fprintf( + out, "Cannot remove the event (%d, %d) '%s' from %s : %s\n", getppid(), getpid(), cmd, filename, + strerror((int)errno)); + } + ret = 1; + } + } + + close(fd); + + return ret; +} + +int clean_kprobe_events(FILE *out, int pid, netdata_ebpf_events_t *ptr) +{ + char filename[FILENAME_MAX + 1]; + snprintf(filename, FILENAME_MAX, "%s%s", NETDATA_DEBUGFS, "kprobe_events"); + + char removeme[16]; + snprintf(removeme, 15, "%d", pid); + + int i; + for (i = 0; ptr[i].name; i++) { + if (clean_kprobe_event(out, filename, removeme, &ptr[i])) { + break; + } + } + + return 0; +} +*/ + +//---------------------------------------------------------------------------------------------------------------------- + +/** + * Get Kernel version + * + * Get the current kernel from /proc and returns an integer value representing it + * + * @return 
it returns a value representing the kernel version. + */ +int ebpf_get_kernel_version() +{ + char major[16], minor[16], patch[16]; + char ver[VERSION_STRING_LEN]; + char *version = ver; + + int fd = open("/proc/sys/kernel/osrelease", O_RDONLY | O_CLOEXEC); + if (fd < 0) + return -1; + + ssize_t len = read(fd, ver, sizeof(ver)); + if (len < 0) { + close(fd); + return -1; + } + + close(fd); + + char *move = major; + while (*version && *version != '.') + *move++ = *version++; + *move = '\0'; + + version++; + move = minor; + while (*version && *version != '.') + *move++ = *version++; + *move = '\0'; + + if (*version) + version++; + else + return -1; + + move = patch; + while (*version && *version != '\n' && *version != '-') + *move++ = *version++; + *move = '\0'; + + // This new rule is fixing kernel version according the formula: + // KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + ((c) > 255 ? 255 : (c))) + // that was extracted from /usr/include/linux/version.h + int ipatch = (int)str2l(patch); + if (ipatch > 255) + ipatch = 255; + + return ((int)(str2l(major) * 65536) + (int)(str2l(minor) * 256) + ipatch); +} + +/** + * Get RH release + * + * Read Red Hat release from /etc/redhat-release + * + * @return It returns RH release on success and -1 otherwise + */ +int get_redhat_release() +{ + char buffer[VERSION_STRING_LEN + 1]; + int major, minor; + FILE *fp = fopen("/etc/redhat-release", "r"); + + if (fp) { + major = 0; + minor = -1; + size_t length = fread(buffer, sizeof(char), VERSION_STRING_LEN, fp); + if (length > 4) { + buffer[length] = '\0'; + char *end = strchr(buffer, '.'); + char *start; + if (end) { + *end = 0x0; + + if (end > buffer) { + start = end - 1; + + major = strtol(start, NULL, 10); + start = ++end; + + end++; + if (end) { + end = 0x00; + minor = strtol(start, NULL, 10); + } else { + minor = -1; + } + } + } + } + + fclose(fp); + return ((major * 256) + minor); + } else { + return -1; + } +} + +/** + * Check if the kernel is in a list of rejected ones + * + * @return Returns 1 if the kernel is rejected, 0 otherwise. 
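 + *
 + * Example (illustrative): if ebpf_kernel_reject_list.txt contains the line
 + * "5.4.0-87", any running kernel whose version string begins with that text is
 + * rejected, because each list entry is compared as a prefix with strncmp().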
 + */
 +static int kernel_is_rejected()
 +{
 +    // Get kernel version from system
 +    char version_string[VERSION_STRING_LEN + 1];
 +    int version_string_len = 0;
 +
 +    if (read_txt_file("/proc/version_signature", version_string, sizeof(version_string))) {
 +        if (read_txt_file("/proc/version", version_string, sizeof(version_string))) {
 +            struct utsname uname_buf;
 +            if (uname(&uname_buf)) {
 +                netdata_log_info("Cannot check kernel version");
 +                return 0;
 +            }
 +            version_string_len =
 +                snprintfz(version_string, VERSION_STRING_LEN, "%s %s", uname_buf.release, uname_buf.version);
 +        }
 +    }
 +
 +    if (!version_string_len)
 +        version_string_len = strlen(version_string);
 +
 +    // Open a file with a list of rejected kernels
 +    char *config_dir = getenv("NETDATA_USER_CONFIG_DIR");
 +    if (config_dir == NULL) {
 +        config_dir = CONFIG_DIR;
 +    }
 +
 +    char filename[FILENAME_MAX + 1];
 +    snprintfz(filename, FILENAME_MAX, "%s/ebpf.d/%s", config_dir, EBPF_KERNEL_REJECT_LIST_FILE);
 +    FILE *kernel_reject_list = fopen(filename, "r");
 +
 +    if (!kernel_reject_list) {
 +        // Keep this to have compatibility with old versions
 +        snprintfz(filename, FILENAME_MAX, "%s/%s", config_dir, EBPF_KERNEL_REJECT_LIST_FILE);
 +        kernel_reject_list = fopen(filename, "r");
 +
 +        if (!kernel_reject_list) {
 +            config_dir = getenv("NETDATA_STOCK_CONFIG_DIR");
 +            if (config_dir == NULL) {
 +                config_dir = LIBCONFIG_DIR;
 +            }
 +
 +            snprintfz(filename, FILENAME_MAX, "%s/ebpf.d/%s", config_dir, EBPF_KERNEL_REJECT_LIST_FILE);
 +            kernel_reject_list = fopen(filename, "r");
 +
 +            if (!kernel_reject_list)
 +                return 0;
 +        }
 +    }
 +
 +    // Find out whether the kernel is in the reject list
 +    char *reject_string = NULL;
 +    size_t buf_len = 0;
 +    ssize_t reject_string_len;
 +    while ((reject_string_len = getline(&reject_string, &buf_len, kernel_reject_list) - 1) > 0) {
 +        if (version_string_len >= reject_string_len) {
 +            if (!strncmp(version_string, reject_string, reject_string_len)) {
 +                netdata_log_info("A buggy kernel is detected");
 +                fclose(kernel_reject_list);
 +                freez(reject_string);
 +                return 1;
 +            }
 +        }
 +    }
 +
 +    fclose(kernel_reject_list);
 +    freez(reject_string);
 +
 +    return 0;
 +}
 +
 +/**
 + * Check Kernel Version
 + *
 + * Test whether the current kernel version is supported.
 + *
 + * @param version current kernel version
 + *
 + * @return It returns 1 when the kernel is supported and 0 otherwise
 + */
 +int ebpf_check_kernel_version(int version)
 +{
 +    if (kernel_is_rejected())
 +        return 0;
 +
 +    // Kernel 4.11.0 or RH > 7.5
 +    return (version >= NETDATA_MINIMUM_EBPF_KERNEL || get_redhat_release() >= NETDATA_MINIMUM_RH_VERSION);
 +}
 +
 +/**
 + * Am I running as Root
 + *
 + * Verify whether the user running the collector is root.
 + *
 + * @return It returns 1 for root and 0 otherwise.
 + */
 +int is_ebpf_plugin_running_as_root()
 +{
 +    uid_t uid = getuid(), euid = geteuid();
 +
 +    if (uid == 0 || euid == 0) {
 +        return 1;
 +    }
 +
 +    return 0;
 +}
 +
 +/**
 + * Can the plugin run eBPF code
 + *
 + * This function checks kernel version and permissions.
 + *
 + * @param kver        the kernel version
 + * @param plugin_name the plugin name.
 + *
 + * @return It returns 0 on success and -1 otherwise
 + */
 +int ebpf_can_plugin_load_code(int kver, char *plugin_name)
 +{
 +    if (!ebpf_check_kernel_version(kver)) {
 +        netdata_log_error("The current collector cannot run on this kernel.");
 +        return -1;
 +    }
 +
 +    if (!is_ebpf_plugin_running_as_root()) {
 +        netdata_log_error(
 +            "%s should either run as root (now running with uid %u, euid %u) or have special capabilities.",
 +            plugin_name, (unsigned int)getuid(), (unsigned int)geteuid());
 +        return -1;
 +    }
 +
 +    return 0;
 +}
 +
 +/**
 + * Adjust memory
 + *
 + * Adjust memory limits to load eBPF programs.
 + *
 + * @return It returns 0 on success and -1 otherwise
 + */
 +int ebpf_adjust_memory_limit()
 +{
 +    struct rlimit r = { RLIM_INFINITY, RLIM_INFINITY };
 +    if (setrlimit(RLIMIT_MEMLOCK, &r)) {
 +        netdata_log_error("setrlimit(RLIMIT_MEMLOCK) failed");
 +        return -1;
 +    }
 +
 +    return 0;
 +}
 +
 +//----------------------------------------------------------------------------------------------------------------------
 +
 +/**
 + * Kernel Name
 + *
 + * Select the kernel name used by eBPF programs.
 + *
 + * Netdata ships eBPF programs with specific suffixes that represent the kernels they were
 + * compiled for. When we load an eBPF program, the suffix must be the nearest possible one
 + * to the kernel that is running.
 + *
 + * @param selector select the kernel version.
 + *
 + * @return It returns the kernel string used to load the program.
 + */
 +static char *ebpf_select_kernel_name(uint32_t selector)
 +{
 +    static char *kernel_names[] = { NETDATA_IDX_STR_V3_10, NETDATA_IDX_STR_V4_14, NETDATA_IDX_STR_V4_16,
 +                                    NETDATA_IDX_STR_V4_18, NETDATA_IDX_STR_V5_4, NETDATA_IDX_STR_V5_10,
 +                                    NETDATA_IDX_STR_V5_11, NETDATA_IDX_STR_V5_14, NETDATA_IDX_STR_V5_15,
 +                                    NETDATA_IDX_STR_V5_16, NETDATA_IDX_STR_V6_8
 +    };
 +
 +    return kernel_names[selector];
 +}
 +
 +/**
 + * Select Max Index
 + *
 + * Select the last index that will be tested on the host.
 + *
 + * @param is_rhf is the host from the Red Hat family?
 + * @param kver   the kernel version
 + *
 + * @return it returns the index to access the kernel string.
 + */
 +static int ebpf_select_max_index(int is_rhf, uint32_t kver)
 +{
 +    if (is_rhf > 0) { // Is Red Hat family
 +        if (kver >= NETDATA_EBPF_KERNEL_5_14)
 +            return NETDATA_IDX_V5_14;
 +        else if (kver >= NETDATA_EBPF_KERNEL_5_4 && kver < NETDATA_EBPF_KERNEL_5_5) // For Oracle Linux
 +            return NETDATA_IDX_V5_4;
 +        else if (kver >= NETDATA_EBPF_KERNEL_4_11)
 +            return NETDATA_IDX_V4_18;
 +    } else { // Kernels from kernel.org
 +        if (kver >= NETDATA_EBPF_KERNEL_6_8)
 +            return NETDATA_IDX_V6_8;
 +        else if (kver >= NETDATA_EBPF_KERNEL_5_16)
 +            return NETDATA_IDX_V5_16;
 +        else if (kver >= NETDATA_EBPF_KERNEL_5_15)
 +            return NETDATA_IDX_V5_15;
 +        else if (kver >= NETDATA_EBPF_KERNEL_5_11)
 +            return NETDATA_IDX_V5_11;
 +        else if (kver >= NETDATA_EBPF_KERNEL_5_10)
 +            return NETDATA_IDX_V5_10;
 +        else if (kver >= NETDATA_EBPF_KERNEL_4_17)
 +            return NETDATA_IDX_V5_4;
 +        else if (kver >= NETDATA_EBPF_KERNEL_4_15)
 +            return NETDATA_IDX_V4_16;
 +        else if (kver >= NETDATA_EBPF_KERNEL_4_11)
 +            return NETDATA_IDX_V4_14;
 +    }
 +
 +    return NETDATA_IDX_V3_10;
 +}
 +
 +/**
 + * Select Index
 + *
 + * Select the index used to load data.
 + *
 + * @param kernels the variable with the kernel versions.
 + * @param is_rhf  is the host from the Red Hat family?
 + * @param kver    the kernel version
 + */
 +static uint32_t ebpf_select_index(uint32_t kernels, int is_rhf, uint32_t kver)
 +{
 +    uint32_t start = ebpf_select_max_index(is_rhf, kver);
 +    uint32_t idx;
 +
 +    if (is_rhf == -1)
 +        kernels &= ~NETDATA_V5_14;
 +
 +    for (idx = start; idx; idx--) {
 +        if (kernels & 1 << idx)
 +            break;
 +    }
 +
 +    return idx;
 +}
 +
 +/**
 + * Mount Name
 + *
 + * Mount the name of the eBPF program to be loaded.
 + *
 + * Netdata eBPF programs have the following format:
 + *
 + *     Tnetdata_ebpf_N.V.o
 + *
 + * where:
 + *     T - Is the eBPF type. When it starts with 'p', this means we are only adding probes,
 + *         and when it starts with 'r' we are using retprobes.
 + *     N - The eBPF program name.
 + *     V - The kernel version in string format.
 + *
 + * @param out       the vector where the name will be stored
 + * @param len       the size of the out vector.
 + * @param path      where the binaries are stored
 + * @param kver      the kernel version
 + * @param name      the eBPF program name.
 + * @param is_return is it a return (retprobe) or entry (probe) program?
 + * @param is_rhf    is it a kernel from the Red Hat family (-1 when it is not)?
 + */
 +static void ebpf_mount_name(char *out, size_t len, char *path, uint32_t kver, const char *name,
 +                            int is_return, int is_rhf)
 +{
 +    char *version = ebpf_select_kernel_name(kver);
 +    snprintfz(out, len, "%s/ebpf.d/%cnetdata_ebpf_%s.%s%s.o",
 +              path,
 +              (is_return) ? 'r' : 'p',
 +              name,
 +              version,
 +              (is_rhf != -1) ? ".rhf" : "");
 +}
 +
 +//----------------------------------------------------------------------------------------------------------------------
 +
 +/**
 + * Statistics from targets
 + *
 + * Count the information from targets.
 + *
 + * @param report  the output structure
 + * @param targets vector with information about the eBPF plugin.
 + * @param value   factor used to update the counters
 + */
 +static void ebpf_stats_targets(ebpf_plugin_stats_t *report, netdata_ebpf_targets_t *targets, int value)
 +{
 +    if (!targets) {
 +        report->probes = report->tracepoints = report->trampolines = 0;
 +        return;
 +    }
 +
 +    int i = 0;
 +    while (targets[i].name) {
 +        switch (targets[i].mode) {
 +            case EBPF_LOAD_PROBE: {
 +                report->probes += value;
 +                break;
 +            }
 +            case EBPF_LOAD_RETPROBE: {
 +                report->retprobes += value;
 +                break;
 +            }
 +            case EBPF_LOAD_TRACEPOINT: {
 +                report->tracepoints += value;
 +                break;
 +            }
 +            case EBPF_LOAD_TRAMPOLINE: {
 +                report->trampolines += value;
 +                break;
 +            }
 +        }
 +
 +        i++;
 +    }
 +}
 +
 +/**
 + * Update General stats
 + *
 + * Update the eBPF plugin statistics that are related to the thread.
 + *
 + * This function must be called with the mutex associated to the charts locked.
 + *
 + * @param report the output structure
 + * @param em     the structure with information about how the module/thread is working.
 + */
 +void ebpf_update_stats(ebpf_plugin_stats_t *report, ebpf_module_t *em)
 +{
 +    int value;
 +
 +    // It is not necessary to report more information.
 +    if (em->enabled > NETDATA_THREAD_EBPF_FUNCTION_RUNNING)
 +        value = -1;
 +    else
 +        value = 1;
 +
 +    report->threads += value;
 +    report->running += value;
 +
 +    // In theory the `else if` is useless, because when this function is called, the module should not stay in
 +    // EBPF_LOAD_PLAY_DICE. We keep this additional condition to detect errors from developers.
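 +    //
 +    // For example (illustrative): a thread that starts with legacy code on a
 +    // host using per-CPU maps adds 1 to report->legacy and report->hash_percpu,
 +    // and the same counters receive -1 when the thread stops.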
+ if (em->load & EBPF_LOAD_LEGACY) + report->legacy += value; + else if (em->load & EBPF_LOAD_CORE) + report->core += value; + + if (em->maps_per_core) + report->hash_percpu += value; + else + report->hash_unique += value; + + ebpf_stats_targets(report, em->targets, value); +} + +/** + * Update Kernel memory with memory + * + * This algorithm is an adaptation of https://elixir.bootlin.com/linux/v6.1.14/source/tools/bpf/bpftool/common.c#L402 + * to get 'memlock' data and update report. + * + * @param report the output structure + * @param map pointer to a map. + * @param action What action will be done with this map. + */ +void ebpf_update_kernel_memory(ebpf_plugin_stats_t *report, ebpf_local_maps_t *map, ebpf_stats_action_t action) +{ + char filename[FILENAME_MAX+1]; + snprintfz(filename, FILENAME_MAX, "/proc/self/fdinfo/%d", map->map_fd); + procfile *ff = procfile_open(filename, " \t", PROCFILE_FLAG_DEFAULT); + if(unlikely(!ff)) { + netdata_log_error("Cannot open %s", filename); + return; + } + + ff = procfile_readall(ff); + if(unlikely(!ff)) + return; + + unsigned long j, lines = procfile_lines(ff); + char *memlock = { "memlock" }; + for (j = 0; j < lines ; j++) { + char *cmp = procfile_lineword(ff, j,0); + if (!strncmp(memlock, cmp, 7)) { + uint64_t memsize = (uint64_t) str2l(procfile_lineword(ff, j,1)); + switch (action) { + case EBPF_ACTION_STAT_ADD: { + report->memlock_kern += memsize; + report->hash_tables += 1; +#ifdef NETDATA_DEV_MODE + netdata_log_info("Hash table %u: %s (FD = %d) is consuming %lu bytes totalizing %lu bytes", + report->hash_tables, map->name, map->map_fd, memsize, report->memlock_kern); +#endif + break; + } + case EBPF_ACTION_STAT_REMOVE: { + report->memlock_kern -= memsize; + report->hash_tables -= 1; +#ifdef NETDATA_DEV_MODE + netdata_log_info("Hash table %s (FD = %d) was removed releasing %lu bytes, now we have %u tables loaded totalizing %lu bytes.", + map->name, map->map_fd, memsize, report->hash_tables, report->memlock_kern); +#endif + break; + } + default: { + break; + } + } + break; + } + } + + procfile_close(ff); +} + +/** + * Update Kernel memory with memory + * + * This algorithm is an adaptation of https://elixir.bootlin.com/linux/v6.1.14/source/tools/bpf/bpftool/common.c#L402 + * to get 'memlock' data and update report. + * + * @param report the output structure + * @param map pointer to a map. Last map must fish with name = NULL + * @param action should plugin add or remove values from amount. + */ +void ebpf_update_kernel_memory_with_vector(ebpf_plugin_stats_t *report, + ebpf_local_maps_t *maps, + ebpf_stats_action_t action) +{ + if (!maps) + return; + + ebpf_local_maps_t *map; + int i = 0; + for (map = &maps[i]; maps[i].name; i++, map = &maps[i]) { + int fd = map->map_fd; + if (fd == ND_EBPF_MAP_FD_NOT_INITIALIZED) + continue; + + ebpf_update_kernel_memory(report, map, action); + } +} + +//---------------------------------------------------------------------------------------------------------------------- + +void ebpf_update_pid_table(ebpf_local_maps_t *pid, ebpf_module_t *em) +{ + pid->user_input = em->pid_map_size; +} + +/** + * Update map size + * + * Update map size with information read from configuration files. + * + * @param map the structure with file descriptor to update. + * @param lmap the structure with information from configuration files. + * @param em the structure with information about how the module/thread is working. + * @param map_name the name of the file used to log. 
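 + *
 + * Example (illustrative): with apps or cgroup charts enabled and
 + * em->apps_level == NETDATA_APPS_LEVEL_PARENT, a resizable PID map is set to
 + * ND_EBPF_DEFAULT_PID_SIZE / 2 = 16384 entries before the object is loaded.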
+ */ +void ebpf_update_map_size(struct bpf_map *map, ebpf_local_maps_t *lmap, ebpf_module_t *em, const char *map_name __maybe_unused) +{ + uint32_t define_size = 0; + uint32_t apps_type = NETDATA_EBPF_MAP_PID | NETDATA_EBPF_MAP_RESIZABLE; + if (lmap->user_input && lmap->user_input != lmap->internal_input) { + define_size = lmap->internal_input; +#ifdef NETDATA_INTERNAL_CHECKS + netdata_log_info("Changing map %s from size %u to %u ", map_name, lmap->internal_input, lmap->user_input); +#endif + } else if (((lmap->type & apps_type) == apps_type) && (!em->apps_charts) && (!em->cgroup_charts)) { + lmap->user_input = ND_EBPF_DEFAULT_MIN_PID; + } else if (((em->apps_charts) || (em->cgroup_charts)) && (em->apps_level != NETDATA_APPS_NOT_SET)) { + switch (em->apps_level) { + case NETDATA_APPS_LEVEL_ALL: { + define_size = lmap->user_input; + break; + } + case NETDATA_APPS_LEVEL_PARENT: { + define_size = ND_EBPF_DEFAULT_PID_SIZE / 2; + break; + } + case NETDATA_APPS_LEVEL_REAL_PARENT: + default: { + define_size = ND_EBPF_DEFAULT_PID_SIZE / 3; + } + } + } + + if (!define_size) + return; + +#ifdef LIBBPF_MAJOR_VERSION + bpf_map__set_max_entries(map, define_size); +#else + bpf_map__resize(map, define_size); +#endif +} + +#ifdef LIBBPF_MAJOR_VERSION +/** + * Update map type + * + * Update map type with information given. + * + * @param map the map we want to modify + * @param w a structure with user input + */ +void ebpf_update_map_type(struct bpf_map *map, ebpf_local_maps_t *w) +{ + if (bpf_map__set_type(map, w->map_type)) { + netdata_log_error("Cannot modify map type for %s", w->name); + } +} + +/** + * Define map type + * + * This PR defines the type used by hash tables according user input. + * + * @param maps the list of maps used with a hash table. + * @param maps_per_core define if map type according user specification. + * @param kver kernel version host is running. + */ +void ebpf_define_map_type(ebpf_local_maps_t *maps, int maps_per_core, int kver) +{ + if (!maps) + return; + + // Before kernel 4.06 there was not percpu hash tables + if (kver < NETDATA_EBPF_KERNEL_4_06) + maps_per_core = CONFIG_BOOLEAN_NO; + + int i = 0; + while (maps[i].name) { + ebpf_local_maps_t *map = &maps[i]; + // maps_per_core is a boolean value in configuration files. + if (maps_per_core) { + if (map->map_type == BPF_MAP_TYPE_HASH) + map->map_type = BPF_MAP_TYPE_PERCPU_HASH; + else if (map->map_type == BPF_MAP_TYPE_ARRAY) + map->map_type = BPF_MAP_TYPE_PERCPU_ARRAY; + } else { + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) + map->map_type = BPF_MAP_TYPE_HASH; + else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + map->map_type = BPF_MAP_TYPE_ARRAY; + } + + i++; + } +} +#endif + +/** + * Update Legacy map + * + * Update map for eBPF legacy code. + * + * @param program the structure with values read from binary. + * @param em the structure with information about how the module/thread is working. 
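 + *
 + * Example (hypothetical map name, for illustration): if the object defines a
 + * map "tbl_pid_stats" flagged NETDATA_EBPF_MAP_RESIZABLE, its size is updated
 + * from the configuration here, before bpf_object__load() is called.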
+ */ +static void ebpf_update_legacy_map(struct bpf_object *program, ebpf_module_t *em) +{ + struct bpf_map *map; + ebpf_local_maps_t *maps = em->maps; + if (!maps) + return; + + bpf_map__for_each(map, program) + { + const char *map_name = bpf_map__name(map); + int i = 0; + while (maps[i].name) { + ebpf_local_maps_t *w = &maps[i]; + + if (!strcmp(w->name, map_name)) { + // Modify size + if (w->type & NETDATA_EBPF_MAP_RESIZABLE) { + ebpf_update_map_size(map, w, em, map_name); + } + +#ifdef LIBBPF_MAJOR_VERSION + ebpf_update_map_type(map, w); +#endif + } + + i++; + } + } +} + +size_t ebpf_count_programs(struct bpf_object *obj) +{ + size_t tot = 0; + struct bpf_program *prog; + bpf_object__for_each_program(prog, obj) + { + tot++; + } + + return tot; +} + +static ebpf_specify_name_t *ebpf_find_names(ebpf_specify_name_t *names, const char *prog_name) +{ + size_t i = 0; + while (names[i].program_name) { + if (!strcmp(prog_name, names[i].program_name)) + return &names[i]; + + i++; + } + + return NULL; +} + +static struct bpf_link **ebpf_attach_programs(struct bpf_object *obj, size_t length, ebpf_specify_name_t *names) +{ + struct bpf_link **links = callocz(length , sizeof(struct bpf_link *)); + size_t i = 0; + struct bpf_program *prog; + ebpf_specify_name_t *w; + bpf_object__for_each_program(prog, obj) + { + if (names) { + const char *name = bpf_program__name(prog); + w = ebpf_find_names(names, name); + } else + w = NULL; + + if (w) { + enum bpf_prog_type type = bpf_program__get_type(prog); + if (type == BPF_PROG_TYPE_KPROBE) + links[i] = bpf_program__attach_kprobe(prog, w->retprobe, w->optional); + } else + links[i] = bpf_program__attach(prog); + + if (libbpf_get_error(links[i])) { + links[i] = NULL; + } + + i++; + } + + return links; +} + +static void ebpf_update_maps(ebpf_module_t *em, struct bpf_object *obj) +{ + if (!em->maps) + return; + + ebpf_local_maps_t *maps = em->maps; + struct bpf_map *map; + bpf_map__for_each(map, obj) + { + int fd = bpf_map__fd(map); + if (maps) { + const char *map_name = bpf_map__name(map); + int j = 0; + while (maps[j].name) { + ebpf_local_maps_t *w = &maps[j]; + if (w->map_fd == ND_EBPF_MAP_FD_NOT_INITIALIZED && !strcmp(map_name, w->name)) + w->map_fd = fd; + + j++; + } + } + } +} + +/** + * Update Controller + * + * Update controller value with user input. + * + * @param fd the table file descriptor + * @param em structure with information about eBPF program we will load. + */ +void ebpf_update_controller(int fd, ebpf_module_t *em) +{ + uint32_t values[NETDATA_CONTROLLER_END] = { + (em->apps_charts & NETDATA_EBPF_APPS_FLAG_YES) | em->cgroup_charts, + em->apps_level, 0, 0, 0, 0 + }; + uint32_t key; + uint32_t end = NETDATA_CONTROLLER_PID_TABLE_ADD; + + for (key = NETDATA_CONTROLLER_APPS_ENABLED; key < end; key++) { + int ret = bpf_map_update_elem(fd, &key, &values[key], BPF_ANY); + if (ret) + netdata_log_error("Add key(%u) for controller table failed.", key); + } +} + +/** + * Update Legacy controller + * + * Update legacy controller table when eBPF program has it. + * + * @param em structure with information about eBPF program we will load. + * @param obj bpf object with tables. 
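 + *
 + * Example (illustrative): for each controller map found, ebpf_update_controller()
 + * writes whether apps charts are enabled to key NETDATA_CONTROLLER_APPS_ENABLED
 + * and the PID level to key NETDATA_CONTROLLER_APPS_LEVEL.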
+ */ +static void ebpf_update_legacy_controller(ebpf_module_t *em, struct bpf_object *obj) +{ + ebpf_local_maps_t *maps = em->maps; + if (!maps) + return; + + struct bpf_map *map; + bpf_map__for_each(map, obj) + { + size_t i = 0; + while (maps[i].name) { + ebpf_local_maps_t *w = &maps[i]; + if (w->map_fd != ND_EBPF_MAP_FD_NOT_INITIALIZED && (w->type & NETDATA_EBPF_MAP_CONTROLLER)) { + w->type &= ~NETDATA_EBPF_MAP_CONTROLLER; + w->type |= NETDATA_EBPF_MAP_CONTROLLER_UPDATED; + + ebpf_update_controller(w->map_fd, em); + } + i++; + } + } +} + +/** + * Load Program + * + * Load eBPF program into kernel + * + * @param plugins_dir directory where binary are stored + * @param em structure with information about eBPF program we will load. + * @param kver the kernel version according /usr/include/linux/version.h + * @param is_rhf is a kernel from Red Hat Family? + * @param obj structure where we will store object loaded. + * + * @return it returns a link for each target we associated an eBPF program. + */ +struct bpf_link **ebpf_load_program(char *plugins_dir, ebpf_module_t *em, int kver, int is_rhf, + struct bpf_object **obj) +{ + char lpath[4096]; + + uint32_t idx = ebpf_select_index(em->kernels, is_rhf, kver); + + ebpf_mount_name(lpath, 4095, plugins_dir, idx, em->info.thread_name, em->mode, is_rhf); + + // When this function is called ebpf.plugin is using legacy code, so we should reset the variable + em->load &= ~ NETDATA_EBPF_LOAD_METHODS; + em->load |= EBPF_LOAD_LEGACY; + + *obj = bpf_object__open_file(lpath, NULL); + if (!*obj) + return NULL; + + if (libbpf_get_error(obj)) { + bpf_object__close(*obj); + return NULL; + } + + ebpf_update_legacy_map(*obj, em); + + if (bpf_object__load(*obj)) { + netdata_log_error("ERROR: loading BPF object file failed %s\n", lpath); + bpf_object__close(*obj); + return NULL; + } + + ebpf_update_maps(em, *obj); + ebpf_update_legacy_controller(em, *obj); + + size_t count_programs = ebpf_count_programs(*obj); + +#ifdef NETDATA_INTERNAL_CHECKS + netdata_log_info("eBPF program %s loaded with success!", lpath); +#endif + + return ebpf_attach_programs(*obj, count_programs, em->names); +} + +char *ebpf_find_symbol(char *search) +{ + char filename[FILENAME_MAX + 1]; + char *ret = NULL; + snprintfz(filename, FILENAME_MAX, "%s%s", netdata_configured_host_prefix, NETDATA_KALLSYMS); + procfile *ff = procfile_open(filename, " \t", PROCFILE_FLAG_DEFAULT); + if(unlikely(!ff)) { + netdata_log_error("Cannot open %s%s", netdata_configured_host_prefix, NETDATA_KALLSYMS); + return ret; + } + + ff = procfile_readall(ff); + if(unlikely(!ff)) + return ret; + + unsigned long i, lines = procfile_lines(ff); + size_t length = strlen(search); + for(i = 0; i < lines ; i++) { + char *cmp = procfile_lineword(ff, i,2); + if (!strncmp(search, cmp, length)) { + ret = strdupz(cmp); + break; + } + } + + procfile_close(ff); + + return ret; +} + +void ebpf_update_names(ebpf_specify_name_t *opt, ebpf_module_t *em) +{ + int mode = em->mode; + em->names = opt; + + size_t i = 0; + while (opt[i].program_name) { + opt[i].retprobe = (mode == MODE_RETURN); + opt[i].optional = ebpf_find_symbol(opt[i].function_to_attach); + + i++; + } +} + +//---------------------------------------------------------------------------------------------------------------------- + +void ebpf_mount_config_name(char *filename, size_t length, char *path, const char *config) +{ + snprintf(filename, length, "%s/ebpf.d/%s", path, config); +} + +int ebpf_load_config(struct config *config, char *filename) +{ + return 
appconfig_load(config, filename, 0, NULL); +} + + +static netdata_run_mode_t ebpf_select_mode(char *mode) +{ + if (!strcasecmp(mode,EBPF_CFG_LOAD_MODE_RETURN )) + return MODE_RETURN; + else if (!strcasecmp(mode, "dev")) + return MODE_DEVMODE; + + return MODE_ENTRY; +} + +static void ebpf_select_mode_string(char *output, size_t len, netdata_run_mode_t sel) +{ + if (sel == MODE_RETURN) + strncpyz(output, EBPF_CFG_LOAD_MODE_RETURN, len); + else + strncpyz(output, EBPF_CFG_LOAD_MODE_DEFAULT, len); +} + +/** + * Convert string to load mode + * + * Convert the string given as argument to value present in enum. + * + * @param str value read from configuration file. + * + * @return It returns the value to be used. + */ +netdata_ebpf_load_mode_t epbf_convert_string_to_load_mode(char *str) +{ + if (!strcasecmp(str, EBPF_CFG_CORE_PROGRAM)) + return EBPF_LOAD_CORE; + else if (!strcasecmp(str, EBPF_CFG_LEGACY_PROGRAM)) + return EBPF_LOAD_LEGACY; + + return EBPF_LOAD_PLAY_DICE; +} + +/** + * Convert load mode to string + * + * @param mode value that will select the string + * + * @return It returns the string associated to mode. + */ +static char *ebpf_convert_load_mode_to_string(netdata_ebpf_load_mode_t mode) +{ + if (mode & EBPF_LOAD_CORE) + return EBPF_CFG_CORE_PROGRAM; + else if (mode & EBPF_LOAD_LEGACY) + return EBPF_CFG_LEGACY_PROGRAM; + + return EBPF_CFG_DEFAULT_PROGRAM; +} + +/** + * Convert collect pid to string + * + * @param level value that will select the string + * + * @return It returns the string associated to level. + */ +static char *ebpf_convert_collect_pid_to_string(netdata_apps_level_t level) +{ + if (level == NETDATA_APPS_LEVEL_REAL_PARENT) + return EBPF_CFG_PID_REAL_PARENT; + else if (level == NETDATA_APPS_LEVEL_PARENT) + return EBPF_CFG_PID_PARENT; + else if (level == NETDATA_APPS_LEVEL_ALL) + return EBPF_CFG_PID_ALL; + + return EBPF_CFG_PID_INTERNAL_USAGE; +} + +/** + * Convert string to apps level + * + * @param str the argument read from config files + * + * @return it returns the level associated to the string or default when it is a wrong value + */ +netdata_apps_level_t ebpf_convert_string_to_apps_level(char *str) +{ + if (!strcasecmp(str, EBPF_CFG_PID_REAL_PARENT)) + return NETDATA_APPS_LEVEL_REAL_PARENT; + else if (!strcasecmp(str, EBPF_CFG_PID_PARENT)) + return NETDATA_APPS_LEVEL_PARENT; + else if (!strcasecmp(str, EBPF_CFG_PID_ALL)) + return NETDATA_APPS_LEVEL_ALL; + + return NETDATA_APPS_NOT_SET; +} + +/** + * CO-RE type + * + * Select the preferential type of CO-RE + * + * @param str value read from configuration file. + * @param lmode load mode used by collector. + */ +netdata_ebpf_program_loaded_t ebpf_convert_core_type(char *str, netdata_run_mode_t lmode) +{ + if (!strcasecmp(str, EBPF_CFG_ATTACH_TRACEPOINT)) + return EBPF_LOAD_TRACEPOINT; + else if (!strcasecmp(str, EBPF_CFG_ATTACH_PROBE)) { + return (lmode == MODE_ENTRY) ? EBPF_LOAD_PROBE : EBPF_LOAD_RETPROBE; + } + + return EBPF_LOAD_TRAMPOLINE; +} + +#ifdef LIBBPF_MAJOR_VERSION +/** + * Adjust Thread Load + * + * Adjust thread configuration according specified load. + * + * @param mod the main structure that will be adjusted. + * @param file the btf file used with thread. 
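 + *
 + * Example (illustrative): when the host has no usable BTF data, `file` is NULL
 + * and the thread is forced to EBPF_LOAD_LEGACY; when a BTF file is present and
 + * the mode is still EBPF_LOAD_PLAY_DICE, the thread is switched to EBPF_LOAD_CORE.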
 + */
 +void ebpf_adjust_thread_load(ebpf_module_t *mod, struct btf *file)
 +{
 +    if (!file) {
 +        mod->load &= ~EBPF_LOAD_CORE;
 +        mod->load |= EBPF_LOAD_LEGACY;
 +    } else if (mod->load == EBPF_LOAD_PLAY_DICE && file) {
 +        mod->load &= ~EBPF_LOAD_LEGACY;
 +        mod->load |= EBPF_LOAD_CORE;
 +    }
 +}
 +
 +/**
 + * Parse BTF file
 + *
 + * Parse a specific BTF file present on the filesystem.
 + *
 + * @param filename the file that will be parsed.
 + *
 + * @return It returns a pointer for the file on success and NULL otherwise.
 + */
 +struct btf *ebpf_parse_btf_file(const char *filename)
 +{
 +    struct btf *bf = btf__parse(filename, NULL);
 +    if (libbpf_get_error(bf)) {
 +        fprintf(stderr, "Cannot parse btf file");
 +        btf__free(bf);
 +        return NULL;
 +    }
 +
 +    return bf;
 +}
 +
 +/**
 + * Load default btf file
 + *
 + * Load the default BTF file from the environment.
 + *
 + * @param path     is the full path
 + * @param filename is the file inside the BTF path.
 + */
 +struct btf *ebpf_load_btf_file(char *path, char *filename)
 +{
 +    char fullpath[PATH_MAX + 1];
 +    snprintfz(fullpath, PATH_MAX, "%s/%s", path, filename);
 +    struct btf *ret = ebpf_parse_btf_file(fullpath);
 +    if (!ret)
 +        netdata_log_info("Your environment does not have BTF file %s/%s. The plugin will work with 'legacy' code.",
 +                         path, filename);
 +
 +    return ret;
 +}
 +
 +/**
 + * Find BTF attach type
 + *
 + * Search the attach type in the current btf file.
 + *
 + * @param file is the structure for the btf file already parsed.
 + */
 +static inline const struct btf_type *ebpf_find_btf_attach_type(struct btf *file)
 +{
 +    int id = btf__find_by_name_kind(file, "bpf_attach_type", BTF_KIND_ENUM);
 +    if (id < 0) {
 +        fprintf(stderr, "Cannot find 'bpf_attach_type'");
 +
 +        return NULL;
 +    }
 +
 +    return btf__type_by_id(file, id);
 +}
 +
 +/**
 + * Is function inside BTF
 + *
 + * Look for a specific function inside the given BTF file.
 + *
 + * @param file     is the structure for the btf file already parsed.
 + * @param function is the function that we want to find.
 + */
 +int ebpf_is_function_inside_btf(struct btf *file, char *function)
 +{
 +    const struct btf_type *type = ebpf_find_btf_attach_type(file);
 +    if (!type)
 +        return -1;
 +
 +    const struct btf_enum *e = btf_enum(type);
 +    int i, id;
 +    for (id = -1, i = 0; i < btf_vlen(type); i++, e++) {
 +        if (!strcmp(btf__name_by_offset(file, e->name_off), "BPF_TRACE_FENTRY")) {
 +            id = btf__find_by_name_kind(file, function, BTF_KIND_FUNC);
 +            break;
 +        }
 +    }
 +
 +    return (id > 0) ? 1 : 0;
 +}
 +#endif
 +
 +/**
 + * Update target with configuration
 + *
 + * Update the target load mode with the given value.
 + *
 + * @param em    the module structure
 + * @param value value used to update.
 + */
 +static void ebpf_update_target_with_conf(ebpf_module_t *em, netdata_ebpf_program_loaded_t value)
 +{
 +    netdata_ebpf_targets_t *targets = em->targets;
 +    if (!targets) {
 +        return;
 +    }
 +
 +    int i = 0;
 +    while (targets[i].name) {
 +        targets[i].mode = value;
 +        i++;
 +    }
 +}
 +
 +/**
 + * Select Load Mode
 + *
 + * Select the load mode according to the given inputs.
 + *
 + * @param btf_file a pointer to the loaded btf file.
 + * @param load     the current load mode value.
 + * @param kver     the kernel version.
 + * @param is_rh    is the host from the Red Hat family?
 + *
 + * @return it returns the new load mode.
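 + *
 + * Example (illustrative): on Oracle Linux with a 5.4 kernel (Red Hat family,
 + * NETDATA_EBPF_KERNEL_5_4 <= kver < NETDATA_EBPF_KERNEL_5_5) the function
 + * returns EBPF_LOAD_LEGACY even when a BTF file is available.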
+ */ +static netdata_ebpf_load_mode_t ebpf_select_load_mode(struct btf *btf_file __maybe_unused, + netdata_ebpf_load_mode_t load, + int kver __maybe_unused, + int is_rh __maybe_unused) +{ +#ifdef LIBBPF_MAJOR_VERSION + if ((load & EBPF_LOAD_CORE) || (load & EBPF_LOAD_PLAY_DICE)) { + // Quick fix for Oracle linux 8.x + load = (!btf_file || (is_rh && (kver >= NETDATA_EBPF_KERNEL_5_4 && kver < NETDATA_EBPF_KERNEL_5_5))) ? + EBPF_LOAD_LEGACY : EBPF_LOAD_CORE; + } +#else + load = EBPF_LOAD_LEGACY; +#endif + + return load; +} + +/** + * Update Module using config + * + * Update configuration for a specific thread. + * + * @param modules structure that will be updated + * @param origin specify the configuration file loaded + * @param btf_file a pointer to the loaded btf file. + * @param is_rhf is Red Hat family? + */ +void ebpf_update_module_using_config(ebpf_module_t *modules, netdata_ebpf_load_mode_t origin, struct btf *btf_file, + int kver, int is_rh) +{ + char default_value[EBPF_MAX_MODE_LENGTH + 1]; + ebpf_select_mode_string(default_value, EBPF_MAX_MODE_LENGTH, modules->mode); + char *load_mode = appconfig_get(modules->cfg, EBPF_GLOBAL_SECTION, EBPF_CFG_LOAD_MODE, default_value); + modules->mode = ebpf_select_mode(load_mode); + + modules->update_every = (int)appconfig_get_number(modules->cfg, EBPF_GLOBAL_SECTION, + EBPF_CFG_UPDATE_EVERY, modules->update_every); + + modules->apps_charts = appconfig_get_boolean(modules->cfg, EBPF_GLOBAL_SECTION, EBPF_CFG_APPLICATION, + (int) (modules->apps_charts & NETDATA_EBPF_APPS_FLAG_YES)); + + modules->cgroup_charts = appconfig_get_boolean(modules->cfg, EBPF_GLOBAL_SECTION, EBPF_CFG_CGROUP, + modules->cgroup_charts); + + modules->pid_map_size = (uint32_t)appconfig_get_number(modules->cfg, EBPF_GLOBAL_SECTION, EBPF_CFG_PID_SIZE, + modules->pid_map_size); + + modules->lifetime = (uint32_t) appconfig_get_number(modules->cfg, EBPF_GLOBAL_SECTION, + EBPF_CFG_LIFETIME, EBPF_DEFAULT_LIFETIME); + + char *value = ebpf_convert_load_mode_to_string(modules->load & NETDATA_EBPF_LOAD_METHODS); + char *type_format = appconfig_get(modules->cfg, EBPF_GLOBAL_SECTION, EBPF_CFG_TYPE_FORMAT, value); + netdata_ebpf_load_mode_t load = epbf_convert_string_to_load_mode(type_format); + load = ebpf_select_load_mode(btf_file, load, kver, is_rh); + modules->load = origin | load; + + char *core_attach = appconfig_get(modules->cfg, EBPF_GLOBAL_SECTION, EBPF_CFG_CORE_ATTACH, EBPF_CFG_ATTACH_TRAMPOLINE); + netdata_ebpf_program_loaded_t fill_lm = ebpf_convert_core_type(core_attach, modules->mode); + ebpf_update_target_with_conf(modules, fill_lm); + + value = ebpf_convert_collect_pid_to_string(modules->apps_level); + char *collect_pid = appconfig_get(modules->cfg, EBPF_GLOBAL_SECTION, EBPF_CFG_COLLECT_PID, value); + modules->apps_level = ebpf_convert_string_to_apps_level(collect_pid); + + modules->maps_per_core = appconfig_get_boolean(modules->cfg, EBPF_GLOBAL_SECTION, EBPF_CFG_MAPS_PER_CORE, + modules->maps_per_core); + if (kver < NETDATA_EBPF_KERNEL_4_06) + modules->maps_per_core = CONFIG_BOOLEAN_NO; + +#ifdef NETDATA_DEV_MODE + netdata_log_info("The thread %s was configured with: mode = %s; update every = %d; apps = %s; cgroup = %s; ebpf type format = %s; ebpf co-re tracing = %s; collect pid = %s; maps per core = %s, lifetime=%u", + modules->info.thread_name, + load_mode, + modules->update_every, + (modules->apps_charts)?"enabled":"disabled", + (modules->cgroup_charts)?"enabled":"disabled", + type_format, + core_attach, + collect_pid, + (modules->maps_per_core)?"enabled":"disabled", + 
modules->lifetime + ); +#endif +} + +/** + * Update module + * + * When this function is called, it will load the configuration file and after this + * it updates the global information of ebpf_module. + * If the module has specific configuration, this function will load it, but it will not + * update the variables. + * + * @param em the module structure + * @param btf_file a pointer to the loaded btf file. + * @param is_rhf is Red Hat family? + * @param kver the kernel version + */ +void ebpf_update_module(ebpf_module_t *em, struct btf *btf_file, int kver, int is_rh) +{ + char filename[FILENAME_MAX+1]; + netdata_ebpf_load_mode_t origin; + + ebpf_mount_config_name(filename, FILENAME_MAX, ebpf_user_config_dir, em->config_file); + if (!ebpf_load_config(em->cfg, filename)) { + ebpf_mount_config_name(filename, FILENAME_MAX, ebpf_stock_config_dir, em->config_file); + if (!ebpf_load_config(em->cfg, filename)) { + netdata_log_error("Cannot load the ebpf configuration file %s", em->config_file); + return; + } + // If user defined data globally, we will have here EBPF_LOADED_FROM_USER, we need to consider this, to avoid + // forcing users to configure thread by thread. + origin = (!(em->load & NETDATA_EBPF_LOAD_SOURCE)) ? EBPF_LOADED_FROM_STOCK : em->load & NETDATA_EBPF_LOAD_SOURCE; + } else + origin = EBPF_LOADED_FROM_USER; + + ebpf_update_module_using_config(em, origin, btf_file, kver, is_rh); +} + +/** + * Adjust Apps Cgroup + * + * Apps and cgroup has internal cleanup that needs attaching tracers to release_task, to avoid overload the function + * we will enable this integration by default, if and only if, we are running with trampolines. + * + * @param em a pointer to the main thread structure. + * @param mode is the mode used with different + */ +void ebpf_adjust_apps_cgroup(ebpf_module_t *em, netdata_ebpf_program_loaded_t mode) +{ + if ((em->load & EBPF_LOADED_FROM_STOCK) && + (em->apps_charts || em->cgroup_charts) && + mode != EBPF_LOAD_TRAMPOLINE) { + em->apps_charts = NETDATA_EBPF_APPS_FLAG_NO; + em->cgroup_charts = 0; + } +} + +//---------------------------------------------------------------------------------------------------------------------- + +/** + * Load Address + * + * Helper used to get address from /proc/kallsym + * + * @param fa address structure + * @param fd file descriptor loaded inside kernel. If a negative value is given + * the function will load address and it won't update hash table. + */ +void ebpf_load_addresses(ebpf_addresses_t *fa, int fd) +{ + if (fa->addr) + return ; + + procfile *ff = procfile_open("/proc/kallsyms", " \t:", PROCFILE_FLAG_DEFAULT); + if (!ff) + return; + + ff = procfile_readall(ff); + if (!ff) + return; + + fa->hash = simple_hash(fa->function); + + size_t lines = procfile_lines(ff), l; + for(l = 0; l < lines ;l++) { + char *fcnt = procfile_lineword(ff, l, 2); + uint32_t hash = simple_hash(fcnt); + if (fa->hash == hash && !strcmp(fcnt, fa->function)) { + char *type = procfile_lineword(ff, l, 2); + fa->type = type[0]; + if (fd > 0) { + char addr[128]; + snprintf(addr, 127, "0x%s", procfile_lineword(ff, l, 0)); + fa->addr = (unsigned long) strtoul(addr, NULL, 16); + uint32_t key = 0; + bpf_map_update_elem(fd, &key, &fa->addr, BPF_ANY); + } else + fa->addr = 1; + break; + } + } + + procfile_close(ff); +} + +//---------------------------------------------------------------------------------------------------------------------- + +/** + * Fill Algorithms + * + * Set one unique dimension for all vector position. 
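 + *
 + * Example (illustrative, inside this doc comment; `my_algorithm` is a
 + * hypothetical caller-defined constant):
 + *
 + *     int algorithms[NETDATA_EBPF_HIST_MAX_BINS];
 + *     ebpf_fill_algorithms(algorithms, NETDATA_EBPF_HIST_MAX_BINS, my_algorithm);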
+ * + * @param algorithms the output vector + * @param length number of elements of algorithms vector + * @param algorithm algorithm used on charts. +*/ +void ebpf_fill_algorithms(int *algorithms, size_t length, int algorithm) +{ + size_t i; + for (i = 0; i < length; i++) { + algorithms[i] = algorithm; + } +} + +/** + * Fill Histogram dimension + * + * Fill the histogram dimension with the specified ranges + */ +char **ebpf_fill_histogram_dimension(size_t maximum) +{ + char *dimensions[] = { "us", "ms", "s"}; + int previous_dim = 0, current_dim = 0; + uint32_t previous_level = 1000, current_level = 1000; + uint32_t previous_divisor = 1, current_divisor = 1; + uint32_t current = 1, previous = 0; + uint32_t selector; + char **out = callocz(maximum, sizeof(char *)); + char range[128]; + size_t end = maximum - 1; + for (selector = 0; selector < end; selector++) { + snprintf(range, 127, "%u%s->%u%s", previous/previous_divisor, dimensions[previous_dim], + current/current_divisor, dimensions[current_dim]); + out[selector] = strdupz(range); + previous = current; + current <<= 1; + + if (previous_dim != 2 && previous > previous_level) { + previous_dim++; + + previous_divisor *= 1000; + previous_level *= 1000; + } + + if (current_dim != 2 && current > current_level) { + current_dim++; + + current_divisor *= 1000; + current_level *= 1000; + } + } + snprintf(range, 127, "%u%s->+Inf", previous/previous_divisor, dimensions[previous_dim]); + out[selector] = strdupz(range); + + return out; +} + +/** + * Histogram dimension cleanup + * + * Cleanup dimensions allocated with function ebpf_fill_histogram_dimension + * + * @param ptr + * @param length + */ +void ebpf_histogram_dimension_cleanup(char **ptr, size_t length) +{ + size_t i; + for (i = 0; i < length; i++) { + freez(ptr[i]); + } + freez(ptr); +} + +//---------------------------------------------------------------------------------------------------------------------- + +/** + * Open tracepoint path + * + * @param filename pointer to store the path + * @param length file length + * @param subsys is the name of your subsystem. + * @param eventname is the name of the event to trace. + * @param flags flags used with syscall open + * + * @return it returns a positive value on success and a negative otherwise. + */ +static inline int ebpf_open_tracepoint_path(char *filename, size_t length, char *subsys, char *eventname, int flags) +{ + snprintfz(filename, length, "%s/events/%s/%s/enable", NETDATA_DEBUGFS, subsys, eventname); + return open(filename, flags | O_CLOEXEC, 0); +} + +/** + * Is tracepoint enabled + * + * Check whether the tracepoint is enabled. + * + * @param subsys is the name of your subsystem. + * @param eventname is the name of the event to trace. + * + * @return it returns 1 when it is enabled, 0 when it is disabled and -1 on error. + */ +int ebpf_is_tracepoint_enabled(char *subsys, char *eventname) +{ + char text[FILENAME_MAX + 1]; + int fd = ebpf_open_tracepoint_path(text, FILENAME_MAX, subsys, eventname, O_RDONLY); + if (fd < 0) { + return -1; + } + + ssize_t length = read(fd, text, 1); + if (length != 1) { + close(fd); + return -1; + } + close(fd); + + return (text[0] == '1') ? CONFIG_BOOLEAN_YES : CONFIG_BOOLEAN_NO; +} + +/** + * Change Tracing values + * + * Change value for specific tracepoint enabling or disabling it according value given. + * + * @param subsys is the name of your subsystem. + * @param eventname is the name of the event to trace. + * @param value a value to enable (1) or disable (0) a tracepoint. 
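 + *
 + * Example (illustrative): ebpf_change_tracing_values("sched", "sched_process_exit", "1")
 + * writes '1' to /sys/kernel/debug/tracing/events/sched/sched_process_exit/enable.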
+ * + * @return It returns 0 on success and -1 otherwise + */ +static int ebpf_change_tracing_values(char *subsys, char *eventname, char *value) +{ + if (strcmp("0", value) && strcmp("1", value)) { + netdata_log_error("Invalid value given to either enable or disable a tracepoint."); + return -1; + } + + char filename[1024]; + int fd = ebpf_open_tracepoint_path(filename, 1023, subsys, eventname, O_WRONLY); + if (fd < 0) { + return -1; + } + + ssize_t written = write(fd, value, strlen(value)); + if (written < 0) { + close(fd); + return -1; + } + + close(fd); + return 0; +} + +/** + * Enable tracing values + * + * Enable a tracepoint on a system + * + * @param subsys is the name of your subsystem. + * @param eventname is the name of the event to trace. + * + * @return It returns 0 on success and -1 otherwise + */ +int ebpf_enable_tracing_values(char *subsys, char *eventname) +{ + return ebpf_change_tracing_values(subsys, eventname, "1"); +} + +/** + * Disable tracing values + * + * Disable tracing points enabled by collector + * + * @param subsys is the name of your subsystem. + * @param eventname is the name of the event to trace. + * + * @return It returns 0 on success and -1 otherwise + */ +int ebpf_disable_tracing_values(char *subsys, char *eventname) +{ + return ebpf_change_tracing_values(subsys, eventname, "0"); +} + +/** + * Select PC prefix + * + * Identify the prefix to run on PC architecture. + * + * @return It returns 32 or 64 according to host arch. + */ +static uint32_t ebpf_select_pc_prefix() +{ + long counter = 1; + uint32_t i; + for (i = 0; i < 128; i++) { + counter <<= 1; + if (counter < 0) + break; + } + + return counter; +} + +/** + * Select Host Prefix + * + * Select prefix to syscall when host is running a kernel newer than 4.17.0 + * + * @param output the vector to store data. + * @param length length of output vector. + * @param syscall the syscall that prefix will be attached; + * @param kver the current kernel version in format MAJOR*65536 + MINOR*256 + PATCH + */ +void ebpf_select_host_prefix(char *output, size_t length, char *syscall, int kver) +{ + if (kver < NETDATA_EBPF_KERNEL_4_17) + snprintfz(output, length, "sys_%s", syscall); + else { + uint32_t arch = ebpf_select_pc_prefix(); + // Prefix selected according https://www.kernel.org/doc/html/latest/process/adding-syscalls.html + char *prefix = (arch == 32) ? 
"__ia32" : "__x64"; + snprintfz(output, length, "%s_sys_%s", prefix, syscall); + } +} + diff --git a/src/libnetdata/ebpf/ebpf.h b/src/libnetdata/ebpf/ebpf.h new file mode 100644 index 00000000..1c612ad3 --- /dev/null +++ b/src/libnetdata/ebpf/ebpf.h @@ -0,0 +1,495 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_EBPF_H +#define NETDATA_EBPF_H 1 + +#define NETDATA_EBPF_PLUGIN_NAME "ebpf.plugin" + +#include <bpf/bpf.h> +#include <bpf/libbpf.h> +#ifdef LIBBPF_DEPRECATED +#include <bpf/btf.h> +#include <linux/btf.h> +#endif +#include <stdlib.h> // Necessary for stdtoul +#include "libnetdata/aral/aral.h" + +#define NETDATA_DEBUGFS "/sys/kernel/debug/tracing/" +#define NETDATA_KALLSYMS "/proc/kallsyms" + +// Config files +#define EBPF_GLOBAL_SECTION "global" +#define EBPF_CFG_LOAD_MODE "ebpf load mode" +#define EBPF_CFG_LOAD_MODE_DEFAULT "entry" +#define EBPF_CFG_LOAD_MODE_RETURN "return" +#define EBPF_MAX_MODE_LENGTH 6 + +#define EBPF_CFG_TYPE_FORMAT "ebpf type format" +#define EBPF_CFG_DEFAULT_PROGRAM "auto" +#define EBPF_CFG_CORE_PROGRAM "CO-RE" +#define EBPF_CFG_LEGACY_PROGRAM "legacy" + +#define EBPF_CFG_COLLECT_PID "collect pid" +#define EBPF_CFG_PID_REAL_PARENT "real parent" +#define EBPF_CFG_PID_PARENT "parent" +#define EBPF_CFG_PID_ALL "all" +#define EBPF_CFG_PID_INTERNAL_USAGE "not used" + +#define EBPF_CFG_CORE_ATTACH "ebpf co-re tracing" +#define EBPF_CFG_ATTACH_TRAMPOLINE "trampoline" +#define EBPF_CFG_ATTACH_TRACEPOINT "tracepoint" +#define EBPF_CFG_ATTACH_PROBE "probe" + +#define EBPF_CFG_PROGRAM_PATH "btf path" + +#define EBPF_CFG_MAPS_PER_CORE "maps per core" + +#define EBPF_CFG_UPDATE_EVERY "update every" +#define EBPF_CFG_LIFETIME "lifetime" +#define EBPF_CFG_UPDATE_APPS_EVERY_DEFAULT 10 +#define EBPF_CFG_PID_SIZE "pid table size" +#define EBPF_CFG_APPLICATION "apps" +#define EBPF_CFG_CGROUP "cgroups" + +#define EBPF_COMMON_FNCT_CLEAN_UP "release_task" + +/** + * The RedHat magic number was got doing: + * + * 1797 = 7*256 + 5 + * + * For more details, please, read /usr/include/linux/version.h + * in any Red Hat installation. + */ +#define NETDATA_MINIMUM_RH_VERSION 1797 + +/** + * 2048 = 8*256 + 0 + */ +#define NETDATA_RH_8 2048 + +/** + * Kernel Version + * + * Kernel versions are calculated using the following formula: + * + * VERSION = LINUX_VERSION_MAJOR*65536 + LINUX_VERSION_PATCHLEVEL*256 + LINUX_VERSION_SUBLEVEL + * + * Where LINUX_VERSION_MAJOR, LINUX_VERSION_PATCHLEVEL, and LINUX_VERSION_SUBLEVEL are extracted + * from /usr/include/linux/version.h. + * + * LINUX_VERSION_SUBLEVEL has the maximum value 255, but linux can have more SUBLEVELS. 
 + *
 + */
 +enum netdata_ebpf_kernel_versions {
 +    NETDATA_EBPF_KERNEL_4_06 = 263680,  //  263680 = 4 * 65536 +  6 * 256
 +    NETDATA_EBPF_KERNEL_4_11 = 264960,  //  264960 = 4 * 65536 + 11 * 256
 +    NETDATA_EBPF_KERNEL_4_14 = 265728,  //  265728 = 4 * 65536 + 14 * 256
 +    NETDATA_EBPF_KERNEL_4_15 = 265984,  //  265984 = 4 * 65536 + 15 * 256
 +    NETDATA_EBPF_KERNEL_4_17 = 266496,  //  266496 = 4 * 65536 + 17 * 256
 +    NETDATA_EBPF_KERNEL_5_0  = 327680,  //  327680 = 5 * 65536 +  0 * 256
 +    NETDATA_EBPF_KERNEL_5_4  = 328704,  //  328704 = 5 * 65536 +  4 * 256
 +    NETDATA_EBPF_KERNEL_5_5  = 328960,  //  328960 = 5 * 65536 +  5 * 256
 +    NETDATA_EBPF_KERNEL_5_10 = 330240,  //  330240 = 5 * 65536 + 10 * 256
 +    NETDATA_EBPF_KERNEL_5_11 = 330496,  //  330496 = 5 * 65536 + 11 * 256
 +    NETDATA_EBPF_KERNEL_5_14 = 331264,  //  331264 = 5 * 65536 + 14 * 256
 +    NETDATA_EBPF_KERNEL_5_15 = 331520,  //  331520 = 5 * 65536 + 15 * 256
 +    NETDATA_EBPF_KERNEL_5_16 = 331776,  //  331776 = 5 * 65536 + 16 * 256
 +    NETDATA_EBPF_KERNEL_6_8  = 395264   //  395264 = 6 * 65536 +  8 * 256
 +};
 +
 +enum netdata_kernel_flag {
 +    NETDATA_V3_10 = 1 << 0,
 +    NETDATA_V4_14 = 1 << 1,
 +    NETDATA_V4_16 = 1 << 2,
 +    NETDATA_V4_18 = 1 << 3,
 +    NETDATA_V5_4  = 1 << 4,
 +    NETDATA_V5_10 = 1 << 5,
 +    NETDATA_V5_11 = 1 << 6,
 +    NETDATA_V5_14 = 1 << 7,
 +    NETDATA_V5_15 = 1 << 8,
 +    NETDATA_V5_16 = 1 << 9,
 +    NETDATA_V6_8  = 1 << 10
 +};
 +
 +enum netdata_kernel_idx {
 +    NETDATA_IDX_V3_10,
 +    NETDATA_IDX_V4_14,
 +    NETDATA_IDX_V4_16,
 +    NETDATA_IDX_V4_18,
 +    NETDATA_IDX_V5_4,
 +    NETDATA_IDX_V5_10,
 +    NETDATA_IDX_V5_11,
 +    NETDATA_IDX_V5_14,
 +    NETDATA_IDX_V5_15,
 +    NETDATA_IDX_V5_16,
 +    NETDATA_IDX_V6_8
 +};
 +
 +#define NETDATA_IDX_STR_V3_10 "3.10"
 +#define NETDATA_IDX_STR_V4_14 "4.14"
 +#define NETDATA_IDX_STR_V4_16 "4.16"
 +#define NETDATA_IDX_STR_V4_18 "4.18"
 +#define NETDATA_IDX_STR_V5_4  "5.4"
 +#define NETDATA_IDX_STR_V5_10 "5.10"
 +#define NETDATA_IDX_STR_V5_11 "5.11"
 +#define NETDATA_IDX_STR_V5_14 "5.14"
 +#define NETDATA_IDX_STR_V5_15 "5.15"
 +#define NETDATA_IDX_STR_V5_16 "5.16"
 +#define NETDATA_IDX_STR_V6_8  "6.8"
 +
 +/**
 + * The minimum supported value is tied to libbpf support.
 + */
 +#define NETDATA_MINIMUM_EBPF_KERNEL NETDATA_EBPF_KERNEL_4_11
 +
 +#define VERSION_STRING_LEN 256
 +#define EBPF_KERNEL_REJECT_LIST_FILE "ebpf_kernel_reject_list.txt"
 +
 +#define ND_EBPF_DEFAULT_MIN_PID 1U
 +#define ND_EBPF_MAP_FD_NOT_INITIALIZED ((int)-1)
 +
 +typedef struct ebpf_addresses {
 +    char *function;
 +    uint32_t hash;
 +    // We use long as the address type, because it matches the register size of the system
 +    unsigned long addr;
 +    uint32_t type;
 +} ebpf_addresses_t;
 +
 +extern char *ebpf_user_config_dir;
 +extern char *ebpf_stock_config_dir;
 +
 +typedef struct ebpf_data {
 +    int *map_fd;
 +
 +    char *kernel_string;
 +    uint32_t running_on_kernel;
 +    int isrh;
 +} ebpf_data_t;
 +
 +typedef enum {
 +    MODE_RETURN = 0, // This attaches a kprobe when the function returns
 +    MODE_DEVMODE,    // This stores logs with descriptions about the errors raised
 +    MODE_ENTRY       // This attaches a kprobe when the function is called
 +} netdata_run_mode_t;
 +
 +#define ND_EBPF_DEFAULT_PID_SIZE 32768U
 +
 +enum netdata_ebpf_map_type {
 +    NETDATA_EBPF_MAP_STATIC = 0,
 +    NETDATA_EBPF_MAP_RESIZABLE = 1,
 +    NETDATA_EBPF_MAP_CONTROLLER = 2,
 +    NETDATA_EBPF_MAP_CONTROLLER_UPDATED = 4,
 +    NETDATA_EBPF_MAP_PID = 8
 +};
 +
 +enum netdata_controller {
 +    NETDATA_CONTROLLER_APPS_ENABLED,
 +    NETDATA_CONTROLLER_APPS_LEVEL,
 +
 +    // These indexes show the number of elements
 +    // stored inside the hash tables.
 +    //
 +    // We have separate indexes to count increase and
 +    // decrease events, because __sync_fetch_and_sub
 +    // generates compilation errors.
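 +    //
 +    // For example (illustrative): the kernel side increments the *_ADD
 +    // counters on every hash table insertion and the *_DEL counters on every
 +    // removal, so user space can chart the current number of stored elements.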
 +    NETDATA_CONTROLLER_PID_TABLE_ADD,
 +    NETDATA_CONTROLLER_PID_TABLE_DEL,
 +    NETDATA_CONTROLLER_TEMP_TABLE_ADD,
 +    NETDATA_CONTROLLER_TEMP_TABLE_DEL,
 +
 +    NETDATA_CONTROLLER_END
 +};
 +
 +// Control how Netdata will monitor PIDs (apps and cgroups)
 +typedef enum netdata_apps_level {
 +    NETDATA_APPS_LEVEL_REAL_PARENT,
 +    NETDATA_APPS_LEVEL_PARENT,
 +    NETDATA_APPS_LEVEL_ALL,
 +
 +    // Present only in user ring
 +    NETDATA_APPS_NOT_SET
 +} netdata_apps_level_t;
 +
 +typedef struct ebpf_local_maps {
 +    char *name;
 +    uint32_t internal_input;
 +    uint32_t user_input;
 +    uint32_t type;
 +    int map_fd;
 +#ifdef LIBBPF_MAJOR_VERSION
 +    enum bpf_map_type map_type;
 +#endif
 +} ebpf_local_maps_t;
 +
 +typedef struct ebpf_specify_name {
 +    char *program_name;
 +    char *function_to_attach;
 +    char *optional;
 +    bool retprobe;
 +} ebpf_specify_name_t;
 +
 +typedef enum netdata_ebpf_load_mode {
 +    EBPF_LOAD_LEGACY = 1<<0,        // Select legacy mode, this means we will load binaries
 +    EBPF_LOAD_CORE = 1<<1,          // When CO-RE is used, it is necessary to use the source code
 +    EBPF_LOAD_PLAY_DICE = 1<<2,     // Take a look at the environment and choose the best option
 +    EBPF_LOADED_FROM_STOCK = 1<<3,  // Configuration loaded from the stock file
 +    EBPF_LOADED_FROM_USER = 1<<4    // Configuration loaded from the user
 +} netdata_ebpf_load_mode_t;
 +#define NETDATA_EBPF_LOAD_METHODS (EBPF_LOAD_LEGACY|EBPF_LOAD_CORE|EBPF_LOAD_PLAY_DICE)
 +#define NETDATA_EBPF_LOAD_SOURCE (EBPF_LOADED_FROM_STOCK|EBPF_LOADED_FROM_USER)
 +
 +typedef enum netdata_ebpf_program_loaded {
 +    EBPF_LOAD_PROBE,       // Attach probes on targets
 +    EBPF_LOAD_RETPROBE,    // Attach retprobes on targets
 +    EBPF_LOAD_TRACEPOINT,  // Attach tracepoints on targets
 +    EBPF_LOAD_TRAMPOLINE,  // Attach trampolines (fentry/fexit) on targets
 +} netdata_ebpf_program_loaded_t;
 +
 +typedef struct netdata_ebpf_targets {
 +    char *name;
 +    netdata_ebpf_program_loaded_t mode;
 +} netdata_ebpf_targets_t;
 +
 +typedef struct ebpf_plugin_stats {
 +    // Load options
 +    uint32_t legacy;      // Legacy codes
 +    uint32_t core;        // CO-RE codes, this means we are using source code compiled.
 +
 +    uint32_t threads;     // Total number of threads
 +    uint32_t running;     // Total number of threads running
 +
 +    uint32_t probes;      // Number of kprobes loaded
 +    uint32_t retprobes;   // Number of kretprobes loaded
 +    uint32_t tracepoints; // Number of tracepoints used
 +    uint32_t trampolines; // Number of trampolines used
 +
 +    uint64_t memlock_kern; // The same information reported by bpftool, but it is not accurate
 +                           // https://lore.kernel.org/linux-mm/20230112155326.26902-5-laoar.shao@gmail.com/T/
 +    uint32_t hash_tables; // Number of hash tables used on the system.
 +
 +    uint32_t hash_percpu; // Number of threads running per-CPU maps
 +    uint32_t hash_unique; // Number of threads running a unique map for all cores.
+} ebpf_plugin_stats_t; + +typedef enum ebpf_stats_action { + EBPF_ACTION_STAT_ADD, + EBPF_ACTION_STAT_REMOVE, +} ebpf_stats_action_t; + +typedef enum netdata_apps_integration_flags { + NETDATA_EBPF_APPS_FLAG_NO, + NETDATA_EBPF_APPS_FLAG_YES, + NETDATA_EBPF_APPS_FLAG_CHART_CREATED +} netdata_apps_integration_flags_t; + +#define NETDATA_EBPF_CHART_MEM_LENGTH 48 +#define NETDATA_EBPF_STAT_DIMENSION_MEMORY "memory" +#define NETDATA_EBPF_STAT_DIMENSION_ARAL "aral" + +enum ebpf_threads_status { + NETDATA_THREAD_EBPF_RUNNING, // started by plugin + NETDATA_THREAD_EBPF_FUNCTION_RUNNING, // started by function + NETDATA_THREAD_EBPF_STOPPING, // stopping thread + NETDATA_THREAD_EBPF_STOPPED, // thread stopped + NETDATA_THREAD_EBPF_NOT_RUNNING // thread was never started +}; + +enum ebpf_global_table_values { + NETDATA_EBPF_GLOBAL_TABLE_PID_TABLE_ADD, // Count elements added inside PID table + NETDATA_EBPF_GLOBAL_TABLE_PID_TABLE_DEL, // Count elements removed from PID table + NETDATA_EBPF_GLOBAL_TABLE_TEMP_TABLE_ADD, // Count elements added inside TEMP table + NETDATA_EBPF_GLOBAL_TABLE_TEMP_TABLE_DEL, // Count elements removed from TEMP table + + NETDATA_EBPF_GLOBAL_TABLE_STATUS_END +}; + +typedef uint64_t netdata_idx_t; + +typedef struct ebpf_module { + // Constants used with module + struct { + const char *thread_name; + const char *config_name; + const char *thread_description; + } info; + + // Helpers used with plugin + struct { + void *(*start_routine)(void *); // the thread function + void (*apps_routine)(struct ebpf_module *em, void *ptr); // the apps charts + void (*fnct_routine)(BUFFER *bf, struct ebpf_module *em); // the function used for exteernal requests + const char *fcnt_name; // name given to cloud + const char *fcnt_desc; // description given about function + const char *fcnt_thread_chart_name; + int order_thread_chart; + const char *fcnt_thread_lifetime_name; + int order_thread_lifetime; + } functions; + + enum ebpf_threads_status enabled; + int update_every; + int global_charts; + netdata_apps_integration_flags_t apps_charts; + netdata_apps_level_t apps_level; + int cgroup_charts; + netdata_run_mode_t mode; + uint32_t thread_id; + int optional; + ebpf_local_maps_t *maps; + ebpf_specify_name_t *names; + uint32_t pid_map_size; + struct config *cfg; + const char *config_file; + uint64_t kernels; + netdata_ebpf_load_mode_t load; + netdata_ebpf_targets_t *targets; + struct bpf_link **probe_links; + struct bpf_object *objects; + struct netdata_static_thread *thread; + + // charts + char memory_usage[NETDATA_EBPF_CHART_MEM_LENGTH]; + char memory_allocations[NETDATA_EBPF_CHART_MEM_LENGTH]; + int maps_per_core; + + // period to run + uint32_t running_time; // internal usage, this is used to reset a value when a new request happens. + uint32_t lifetime; + + netdata_idx_t hash_table_stats[NETDATA_EBPF_GLOBAL_TABLE_STATUS_END]; +} ebpf_module_t; + +#define EBPF_DEFAULT_LIFETIME 300 +// This will be present until all functions are merged. 
The deadline is planned for 68 years since plugin start +#define EBPF_NON_FUNCTION_LIFE_TIME UINT_MAX + +int ebpf_get_kernel_version(); +int get_redhat_release(); +char *ebpf_kernel_suffix(int version, int isrh); +struct bpf_link **ebpf_load_program(char *plugins_dir, ebpf_module_t *em, int kver, int is_rhf, + struct bpf_object **obj); + +void ebpf_mount_config_name(char *filename, size_t length, char *path, const char *config); +int ebpf_load_config(struct config *config, char *filename); +void ebpf_update_module(ebpf_module_t *em, struct btf *btf_file, int kver, int is_rh); +void ebpf_update_names(ebpf_specify_name_t *opt, ebpf_module_t *em); +void ebpf_adjust_apps_cgroup(ebpf_module_t *em, netdata_ebpf_program_loaded_t mode); +char *ebpf_find_symbol(char *search); +void ebpf_load_addresses(ebpf_addresses_t *fa, int fd); +void ebpf_fill_algorithms(int *algorithms, size_t length, int algorithm); +char **ebpf_fill_histogram_dimension(size_t maximum); +void ebpf_update_stats(ebpf_plugin_stats_t *report, ebpf_module_t *em); +void ebpf_update_controller(int fd, ebpf_module_t *em); +void ebpf_update_map_size(struct bpf_map *map, ebpf_local_maps_t *lmap, ebpf_module_t *em, const char *map_name); + +// Histogram +#define NETDATA_EBPF_HIST_MAX_BINS 24UL +#define NETDATA_DISK_MAX 256U +#define NETDATA_DISK_HISTOGRAM_LENGTH (NETDATA_DISK_MAX * NETDATA_EBPF_HIST_MAX_BINS) + +typedef struct netdata_ebpf_histogram { + char *name; + char *title; + char *ctx; + int order; + uint64_t histogram[NETDATA_EBPF_HIST_MAX_BINS]; +} netdata_ebpf_histogram_t; + +enum fs_btf_counters { + NETDATA_KEY_BTF_READ, + NETDATA_KEY_BTF_WRITE, + NETDATA_KEY_BTF_OPEN, + NETDATA_KEY_BTF_SYNC_ATTR, + NETDATA_KEY_BTF_OPEN2, + + NETDATA_FS_BTF_END +}; + +typedef struct ebpf_filesystem_partitions { + char *filesystem; + char *optional_filesystem; + char *family; + char *family_name; + struct bpf_object *objects; + struct bpf_link **probe_links; + + netdata_ebpf_histogram_t hread; + netdata_ebpf_histogram_t hwrite; + netdata_ebpf_histogram_t hopen; + netdata_ebpf_histogram_t hadditional; + + uint32_t flags; + uint32_t enabled; + + ebpf_addresses_t addresses; + uint64_t kernels; + ebpf_local_maps_t *fs_maps; + + // BPF structure +#ifdef LIBBPF_MAJOR_VERSION + struct filesystem_bpf *fs_obj; +#else + void *fs_obj; +#endif + const char *functions[NETDATA_FS_BTF_END]; +} ebpf_filesystem_partitions_t; + +typedef struct ebpf_sync_syscalls { + char *syscall; + int enabled; + uint32_t flags; + + // BTF structure + struct bpf_object *objects; + struct bpf_link **probe_links; + + // BPF structure +#ifdef LIBBPF_MAJOR_VERSION + struct sync_bpf *sync_obj; +#else + void *sync_obj; +#endif + ebpf_local_maps_t *sync_maps; +} ebpf_sync_syscalls_t; + +void ebpf_histogram_dimension_cleanup(char **ptr, size_t length); + +// Tracepoint helpers +// For more information related to tracepoints read https://www.kernel.org/doc/html/latest/trace/tracepoints.html +int ebpf_is_tracepoint_enabled(char *subsys, char *eventname); +int ebpf_enable_tracing_values(char *subsys, char *eventname); +int ebpf_disable_tracing_values(char *subsys, char *eventname); + +// BTF Section +#define EBPF_DEFAULT_BTF_FILE "vmlinux" +#define EBPF_DEFAULT_BTF_PATH "/sys/kernel/btf" +#define EBPF_DEFAULT_ERROR_MSG "Cannot open or load BPF file for thread" + +// BTF helpers +#define NETDATA_EBPF_MAX_SYSCALL_LENGTH 255 + +netdata_ebpf_load_mode_t epbf_convert_string_to_load_mode(char *str); +netdata_ebpf_program_loaded_t ebpf_convert_core_type(char *str, netdata_run_mode_t lmode); 
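 +
 +/*
 + * Usage sketch (illustrative only, kept out of compilation; the function name
 + * example_load_module() is hypothetical): the typical sequence a collector
 + * thread follows to load a legacy eBPF program with the helpers declared here.
 + */
 +#if 0
 +static int example_load_module(ebpf_module_t *em, char *plugins_dir)
 +{
 +    int kver = ebpf_get_kernel_version();
 +    if (ebpf_can_plugin_load_code(kver, NETDATA_EBPF_PLUGIN_NAME))
 +        return -1;
 +
 +    if (ebpf_adjust_memory_limit())
 +        return -1;
 +
 +    em->probe_links = ebpf_load_program(plugins_dir, em, kver, get_redhat_release(), &em->objects);
 +    return (em->probe_links) ? 0 : -1;
 +}
 +#endif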
+void ebpf_select_host_prefix(char *output, size_t length, char *syscall, int kver); +#ifdef LIBBPF_MAJOR_VERSION +void ebpf_adjust_thread_load(ebpf_module_t *mod, struct btf *file); +struct btf *ebpf_parse_btf_file(const char *filename); +struct btf *ebpf_load_btf_file(char *path, char *filename); +int ebpf_is_function_inside_btf(struct btf *file, char *function); +void ebpf_update_map_type(struct bpf_map *map, ebpf_local_maps_t *w); +void ebpf_define_map_type(ebpf_local_maps_t *maps, int maps_per_core, int kver); +#endif + +void ebpf_update_kernel_memory_with_vector(ebpf_plugin_stats_t *report, ebpf_local_maps_t *maps, + ebpf_stats_action_t action); +void ebpf_update_kernel_memory(ebpf_plugin_stats_t *report, ebpf_local_maps_t *map, ebpf_stats_action_t action); +int ebpf_statistic_create_aral_chart(char *name, ebpf_module_t *em); +void ebpf_statistic_obsolete_aral_chart(ebpf_module_t *em, int prio); +void ebpf_send_data_aral_chart(ARAL *memory, ebpf_module_t *em); + +int ebpf_can_plugin_load_code(int kver, char *plugin_name); +int ebpf_adjust_memory_limit(); + +#endif /* NETDATA_EBPF_H */ diff --git a/src/libnetdata/eval/README.md b/src/libnetdata/eval/README.md new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/src/libnetdata/eval/README.md @@ -0,0 +1 @@ + diff --git a/src/libnetdata/eval/eval.c b/src/libnetdata/eval/eval.c new file mode 100644 index 00000000..bacac9c1 --- /dev/null +++ b/src/libnetdata/eval/eval.c @@ -0,0 +1,1251 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +typedef enum __attribute__((packed)) { + EVAL_VALUE_INVALID = 0, + EVAL_VALUE_NUMBER, + EVAL_VALUE_VARIABLE, + EVAL_VALUE_EXPRESSION +} EVAL_VALUE_TYPE; + +// ---------------------------------------------------------------------------- +// data structures for storing the parsed expression in memory + +typedef struct eval_variable { + STRING *name; + struct eval_variable *next; +} EVAL_VARIABLE; + +typedef struct eval_value { + EVAL_VALUE_TYPE type; + + union { + NETDATA_DOUBLE number; + EVAL_VARIABLE *variable; + struct eval_node *expression; + }; +} EVAL_VALUE; + +typedef struct eval_node { + int id; + unsigned char operator; + int precedence; + + int count; + EVAL_VALUE ops[]; +} EVAL_NODE; + +struct eval_expression { + STRING *source; + STRING *parsed_as; + + NETDATA_DOUBLE result; + + int error; + BUFFER *error_msg; + + EVAL_NODE *nodes; + + void *variable_lookup_cb_data; + eval_expression_variable_lookup_t variable_lookup_cb; +}; + +// these are used for EVAL_NODE.operator +// they are used as internal IDs to identify an operator +// THEY ARE NOT USED FOR PARSING OPERATORS LIKE THAT +#define EVAL_OPERATOR_NOP '\0' +#define EVAL_OPERATOR_EXPRESSION_OPEN '(' +#define EVAL_OPERATOR_EXPRESSION_CLOSE ')' +#define EVAL_OPERATOR_NOT '!' +#define EVAL_OPERATOR_PLUS '+' +#define EVAL_OPERATOR_MINUS '-' +#define EVAL_OPERATOR_AND '&' +#define EVAL_OPERATOR_OR '|' +#define EVAL_OPERATOR_GREATER_THAN_OR_EQUAL 'G' +#define EVAL_OPERATOR_LESS_THAN_OR_EQUAL 'L' +#define EVAL_OPERATOR_NOT_EQUAL '~' +#define EVAL_OPERATOR_EQUAL '=' +#define EVAL_OPERATOR_LESS '<' +#define EVAL_OPERATOR_GREATER '>' +#define EVAL_OPERATOR_MULTIPLY '*' +#define EVAL_OPERATOR_DIVIDE '/' +#define EVAL_OPERATOR_SIGN_PLUS 'P' +#define EVAL_OPERATOR_SIGN_MINUS 'M' +#define EVAL_OPERATOR_ABS 'A' +#define EVAL_OPERATOR_IF_THEN_ELSE '?' 
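+// Typical use of this evaluator, as a sketch (my_lookup, my_data and
+// use_result are placeholders; the public entry points are declared in
+// eval.h):
+//
+//     int error = 0;
+//     const char *failed_at = NULL;
+//     EVAL_EXPRESSION *exp = expression_parse("($x > 0) ? $x : -$x", &failed_at, &error);
+//     if(exp) {
+//         expression_set_variable_lookup_callback(exp, my_lookup, my_data);
+//         if(expression_evaluate(exp))
+//             use_result(expression_result(exp));
+//         expression_free(exp);
+//     }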
+ +// ---------------------------------------------------------------------------- +// forward function definitions + +static inline void eval_node_free(EVAL_NODE *op); +static inline EVAL_NODE *parse_full_expression(const char **string, int *error); +static inline EVAL_NODE *parse_one_full_operand(const char **string, int *error); +static inline NETDATA_DOUBLE eval_node(EVAL_EXPRESSION *exp, EVAL_NODE *op, int *error); +static inline void print_parsed_as_node(BUFFER *out, EVAL_NODE *op, int *error); +static inline void print_parsed_as_constant(BUFFER *out, NETDATA_DOUBLE n); + +// ---------------------------------------------------------------------------- +// evaluation of expressions + +static inline NETDATA_DOUBLE eval_variable(EVAL_EXPRESSION *exp, EVAL_VARIABLE *v, int *error) { + NETDATA_DOUBLE n; + + if(exp->variable_lookup_cb && exp->variable_lookup_cb(v->name, exp->variable_lookup_cb_data, &n)) { + buffer_sprintf(exp->error_msg, "[ ${%s} = ", string2str(v->name)); + print_parsed_as_constant(exp->error_msg, n); + buffer_strcat(exp->error_msg, " ] "); + return n; + } + + *error = EVAL_ERROR_UNKNOWN_VARIABLE; + buffer_sprintf(exp->error_msg, "[ undefined variable '%s' ] ", string2str(v->name)); + return NAN; +} + +static inline NETDATA_DOUBLE eval_value(EVAL_EXPRESSION *exp, EVAL_VALUE *v, int *error) { + NETDATA_DOUBLE n; + + switch(v->type) { + case EVAL_VALUE_EXPRESSION: + n = eval_node(exp, v->expression, error); + break; + + case EVAL_VALUE_NUMBER: + n = v->number; + break; + + case EVAL_VALUE_VARIABLE: + n = eval_variable(exp, v->variable, error); + break; + + default: + *error = EVAL_ERROR_INVALID_VALUE; + n = 0; + break; + } + + return n; +} + +static inline int is_true(NETDATA_DOUBLE n) { + if(isnan(n)) return 0; + if(isinf(n)) return 1; + if(n == 0) return 0; + return 1; +} + +NETDATA_DOUBLE eval_and(EVAL_EXPRESSION *exp, EVAL_NODE *op, int *error) { + return is_true(eval_value(exp, &op->ops[0], error)) && is_true(eval_value(exp, &op->ops[1], error)); +} +NETDATA_DOUBLE eval_or(EVAL_EXPRESSION *exp, EVAL_NODE *op, int *error) { + return is_true(eval_value(exp, &op->ops[0], error)) || is_true(eval_value(exp, &op->ops[1], error)); +} +NETDATA_DOUBLE eval_greater_than_or_equal(EVAL_EXPRESSION *exp, EVAL_NODE *op, int *error) { + NETDATA_DOUBLE n1 = eval_value(exp, &op->ops[0], error); + NETDATA_DOUBLE n2 = eval_value(exp, &op->ops[1], error); + return isgreaterequal(n1, n2); +} +NETDATA_DOUBLE eval_less_than_or_equal(EVAL_EXPRESSION *exp, EVAL_NODE *op, int *error) { + NETDATA_DOUBLE n1 = eval_value(exp, &op->ops[0], error); + NETDATA_DOUBLE n2 = eval_value(exp, &op->ops[1], error); + return islessequal(n1, n2); +} +NETDATA_DOUBLE eval_equal(EVAL_EXPRESSION *exp, EVAL_NODE *op, int *error) { + NETDATA_DOUBLE n1 = eval_value(exp, &op->ops[0], error); + NETDATA_DOUBLE n2 = eval_value(exp, &op->ops[1], error); + if(isnan(n1) && isnan(n2)) return 1; + if(isinf(n1) && isinf(n2)) return 1; + if(isnan(n1) || isnan(n2)) return 0; + if(isinf(n1) || isinf(n2)) return 0; + return considered_equal_ndd(n1, n2); +} +NETDATA_DOUBLE eval_not_equal(EVAL_EXPRESSION *exp, EVAL_NODE *op, int *error) { + return !eval_equal(exp, op, error); +} +NETDATA_DOUBLE eval_less(EVAL_EXPRESSION *exp, EVAL_NODE *op, int *error) { + NETDATA_DOUBLE n1 = eval_value(exp, &op->ops[0], error); + NETDATA_DOUBLE n2 = eval_value(exp, &op->ops[1], error); + return isless(n1, n2); +} +NETDATA_DOUBLE eval_greater(EVAL_EXPRESSION *exp, EVAL_NODE *op, int *error) { + NETDATA_DOUBLE n1 = eval_value(exp, &op->ops[0], 
error); + NETDATA_DOUBLE n2 = eval_value(exp, &op->ops[1], error); + return isgreater(n1, n2); +} +NETDATA_DOUBLE eval_plus(EVAL_EXPRESSION *exp, EVAL_NODE *op, int *error) { + NETDATA_DOUBLE n1 = eval_value(exp, &op->ops[0], error); + NETDATA_DOUBLE n2 = eval_value(exp, &op->ops[1], error); + if(isnan(n1) || isnan(n2)) return NAN; + if(isinf(n1) || isinf(n2)) return INFINITY; + return n1 + n2; +} +NETDATA_DOUBLE eval_minus(EVAL_EXPRESSION *exp, EVAL_NODE *op, int *error) { + NETDATA_DOUBLE n1 = eval_value(exp, &op->ops[0], error); + NETDATA_DOUBLE n2 = eval_value(exp, &op->ops[1], error); + if(isnan(n1) || isnan(n2)) return NAN; + if(isinf(n1) || isinf(n2)) return INFINITY; + return n1 - n2; +} +NETDATA_DOUBLE eval_multiply(EVAL_EXPRESSION *exp, EVAL_NODE *op, int *error) { + NETDATA_DOUBLE n1 = eval_value(exp, &op->ops[0], error); + NETDATA_DOUBLE n2 = eval_value(exp, &op->ops[1], error); + if(isnan(n1) || isnan(n2)) return NAN; + if(isinf(n1) || isinf(n2)) return INFINITY; + return n1 * n2; +} +NETDATA_DOUBLE eval_divide(EVAL_EXPRESSION *exp, EVAL_NODE *op, int *error) { + NETDATA_DOUBLE n1 = eval_value(exp, &op->ops[0], error); + NETDATA_DOUBLE n2 = eval_value(exp, &op->ops[1], error); + if(isnan(n1) || isnan(n2)) return NAN; + if(isinf(n1) || isinf(n2)) return INFINITY; + return n1 / n2; +} +NETDATA_DOUBLE eval_nop(EVAL_EXPRESSION *exp, EVAL_NODE *op, int *error) { + return eval_value(exp, &op->ops[0], error); +} +NETDATA_DOUBLE eval_not(EVAL_EXPRESSION *exp, EVAL_NODE *op, int *error) { + return !is_true(eval_value(exp, &op->ops[0], error)); +} +NETDATA_DOUBLE eval_sign_plus(EVAL_EXPRESSION *exp, EVAL_NODE *op, int *error) { + return eval_value(exp, &op->ops[0], error); +} +NETDATA_DOUBLE eval_sign_minus(EVAL_EXPRESSION *exp, EVAL_NODE *op, int *error) { + NETDATA_DOUBLE n1 = eval_value(exp, &op->ops[0], error); + if(isnan(n1)) return NAN; + if(isinf(n1)) return INFINITY; + return -n1; +} +NETDATA_DOUBLE eval_abs(EVAL_EXPRESSION *exp, EVAL_NODE *op, int *error) { + NETDATA_DOUBLE n1 = eval_value(exp, &op->ops[0], error); + if(isnan(n1)) return NAN; + if(isinf(n1)) return INFINITY; + return ABS(n1); +} +NETDATA_DOUBLE eval_if_then_else(EVAL_EXPRESSION *exp, EVAL_NODE *op, int *error) { + if(is_true(eval_value(exp, &op->ops[0], error))) + return eval_value(exp, &op->ops[1], error); + else + return eval_value(exp, &op->ops[2], error); +} + +static struct operator { + const char *print_as; + char precedence; + char parameters; + char isfunction; + NETDATA_DOUBLE (*eval)(EVAL_EXPRESSION *exp, EVAL_NODE *op, int *error); +} operators[256] = { + // this is a random access array + // we always access it with a known EVAL_OPERATOR_X + + [EVAL_OPERATOR_AND] = { "&&", 2, 2, 0, eval_and }, + [EVAL_OPERATOR_OR] = { "||", 2, 2, 0, eval_or }, + [EVAL_OPERATOR_GREATER_THAN_OR_EQUAL] = { ">=", 3, 2, 0, eval_greater_than_or_equal }, + [EVAL_OPERATOR_LESS_THAN_OR_EQUAL] = { "<=", 3, 2, 0, eval_less_than_or_equal }, + [EVAL_OPERATOR_NOT_EQUAL] = { "!=", 3, 2, 0, eval_not_equal }, + [EVAL_OPERATOR_EQUAL] = { "==", 3, 2, 0, eval_equal }, + [EVAL_OPERATOR_LESS] = { "<", 3, 2, 0, eval_less }, + [EVAL_OPERATOR_GREATER] = { ">", 3, 2, 0, eval_greater }, + [EVAL_OPERATOR_PLUS] = { "+", 4, 2, 0, eval_plus }, + [EVAL_OPERATOR_MINUS] = { "-", 4, 2, 0, eval_minus }, + [EVAL_OPERATOR_MULTIPLY] = { "*", 5, 2, 0, eval_multiply }, + [EVAL_OPERATOR_DIVIDE] = { "/", 5, 2, 0, eval_divide }, + [EVAL_OPERATOR_NOT] = { "!", 6, 1, 0, eval_not }, + [EVAL_OPERATOR_SIGN_PLUS] = { "+", 6, 1, 0, eval_sign_plus }, + 
[EVAL_OPERATOR_SIGN_MINUS] = { "-", 6, 1, 0, eval_sign_minus }, + [EVAL_OPERATOR_ABS] = { "abs(",6,1, 1, eval_abs }, + [EVAL_OPERATOR_IF_THEN_ELSE] = { "?", 7, 3, 0, eval_if_then_else }, + [EVAL_OPERATOR_NOP] = { NULL, 8, 1, 0, eval_nop }, + [EVAL_OPERATOR_EXPRESSION_OPEN] = { NULL, 8, 1, 0, eval_nop }, + + // this should exist in our evaluation list + [EVAL_OPERATOR_EXPRESSION_CLOSE] = { NULL, 99, 1, 0, eval_nop } +}; + +#define eval_precedence(operator) (operators[(unsigned char)(operator)].precedence) + +static inline NETDATA_DOUBLE eval_node(EVAL_EXPRESSION *exp, EVAL_NODE *op, int *error) { + if(unlikely(op->count != operators[op->operator].parameters)) { + *error = EVAL_ERROR_INVALID_NUMBER_OF_OPERANDS; + return 0; + } + + NETDATA_DOUBLE n = operators[op->operator].eval(exp, op, error); + + return n; +} + +// ---------------------------------------------------------------------------- +// parsed-as generation + +static inline void print_parsed_as_variable(BUFFER *out, EVAL_VARIABLE *v, int *error) { + (void)error; + buffer_sprintf(out, "${%s}", string2str(v->name)); +} + +static inline void print_parsed_as_constant(BUFFER *out, NETDATA_DOUBLE n) { + if(unlikely(isnan(n))) { + buffer_strcat(out, "nan"); + return; + } + + if(unlikely(isinf(n))) { + buffer_strcat(out, "inf"); + return; + } + + char b[100+1], *s; + snprintfz(b, sizeof(b) - 1, NETDATA_DOUBLE_FORMAT, n); + + s = &b[strlen(b) - 1]; + while(s > b && *s == '0') { + *s ='\0'; + s--; + } + + if(s > b && *s == '.') + *s = '\0'; + + buffer_strcat(out, b); +} + +static inline void print_parsed_as_value(BUFFER *out, EVAL_VALUE *v, int *error) { + switch(v->type) { + case EVAL_VALUE_EXPRESSION: + print_parsed_as_node(out, v->expression, error); + break; + + case EVAL_VALUE_NUMBER: + print_parsed_as_constant(out, v->number); + break; + + case EVAL_VALUE_VARIABLE: + print_parsed_as_variable(out, v->variable, error); + break; + + default: + *error = EVAL_ERROR_INVALID_VALUE; + break; + } +} + +static inline void print_parsed_as_node(BUFFER *out, EVAL_NODE *op, int *error) { + if(unlikely(op->count != operators[op->operator].parameters)) { + *error = EVAL_ERROR_INVALID_NUMBER_OF_OPERANDS; + return; + } + + if(operators[op->operator].parameters == 1) { + + if(operators[op->operator].print_as) + buffer_sprintf(out, "%s", operators[op->operator].print_as); + + //if(op->operator == EVAL_OPERATOR_EXPRESSION_OPEN) + // buffer_strcat(out, "("); + + print_parsed_as_value(out, &op->ops[0], error); + + //if(op->operator == EVAL_OPERATOR_EXPRESSION_OPEN) + // buffer_strcat(out, ")"); + } + + else if(operators[op->operator].parameters == 2) { + buffer_strcat(out, "("); + print_parsed_as_value(out, &op->ops[0], error); + + if(operators[op->operator].print_as) + buffer_sprintf(out, " %s ", operators[op->operator].print_as); + + print_parsed_as_value(out, &op->ops[1], error); + buffer_strcat(out, ")"); + } + else if(op->operator == EVAL_OPERATOR_IF_THEN_ELSE && operators[op->operator].parameters == 3) { + buffer_strcat(out, "("); + print_parsed_as_value(out, &op->ops[0], error); + + if(operators[op->operator].print_as) + buffer_sprintf(out, " %s ", operators[op->operator].print_as); + + print_parsed_as_value(out, &op->ops[1], error); + buffer_strcat(out, " : "); + print_parsed_as_value(out, &op->ops[2], error); + buffer_strcat(out, ")"); + } + + if(operators[op->operator].isfunction) + buffer_strcat(out, ")"); +} + +// ---------------------------------------------------------------------------- +// parsing expressions + +// skip spaces +static inline 
void skip_spaces(const char **string) { + const char *s = *string; + while(isspace((uint8_t)*s)) s++; + *string = s; +} + +// what character can appear just after an operator keyword +// like NOT AND OR ? +static inline int isoperatorterm_word(const char s) { + if(isspace(s) || s == '(' || s == '$' || s == '!' || s == '-' || s == '+' || isdigit(s) || !s) + return 1; + + return 0; +} + +// what character can appear just after an operator symbol? +static inline int isoperatorterm_symbol(const char s) { + if(isoperatorterm_word(s) || isalpha(s)) + return 1; + + return 0; +} + +// return 1 if the character should never appear in a variable +static inline int isvariableterm(const char s) { + if(isalnum(s) || s == '.' || s == '_') + return 0; + + return 1; +} + +// ---------------------------------------------------------------------------- +// parse operators + +static inline int parse_and(const char **string) { + const char *s = *string; + + // AND + if((s[0] == 'A' || s[0] == 'a') && (s[1] == 'N' || s[1] == 'n') && (s[2] == 'D' || s[2] == 'd') && isoperatorterm_word(s[3])) { + *string = &s[4]; + return 1; + } + + // && + if(s[0] == '&' && s[1] == '&' && isoperatorterm_symbol(s[2])) { + *string = &s[2]; + return 1; + } + + return 0; +} + +static inline int parse_or(const char **string) { + const char *s = *string; + + // OR + if((s[0] == 'O' || s[0] == 'o') && (s[1] == 'R' || s[1] == 'r') && isoperatorterm_word(s[2])) { + *string = &s[3]; + return 1; + } + + // || + if(s[0] == '|' && s[1] == '|' && isoperatorterm_symbol(s[2])) { + *string = &s[2]; + return 1; + } + + return 0; +} + +static inline int parse_greater_than_or_equal(const char **string) { + const char *s = *string; + + // >= + if(s[0] == '>' && s[1] == '=' && isoperatorterm_symbol(s[2])) { + *string = &s[2]; + return 1; + } + + return 0; +} + +static inline int parse_less_than_or_equal(const char **string) { + const char *s = *string; + + // <= + if (s[0] == '<' && s[1] == '=' && isoperatorterm_symbol(s[2])) { + *string = &s[2]; + return 1; + } + + return 0; +} + +static inline int parse_greater(const char **string) { + const char *s = *string; + + // > + if(s[0] == '>' && isoperatorterm_symbol(s[1])) { + *string = &s[1]; + return 1; + } + + return 0; +} + +static inline int parse_less(const char **string) { + const char *s = *string; + + // < + if(s[0] == '<' && isoperatorterm_symbol(s[1])) { + *string = &s[1]; + return 1; + } + + return 0; +} + +static inline int parse_equal(const char **string) { + const char *s = *string; + + // == + if(s[0] == '=' && s[1] == '=' && isoperatorterm_symbol(s[2])) { + *string = &s[2]; + return 1; + } + + // = + if(s[0] == '=' && isoperatorterm_symbol(s[1])) { + *string = &s[1]; + return 1; + } + + return 0; +} + +static inline int parse_not_equal(const char **string) { + const char *s = *string; + + // != + if(s[0] == '!' 
&& s[1] == '=' && isoperatorterm_symbol(s[2])) {
+        *string = &s[2];
+        return 1;
+    }
+
+    // <>
+    if(s[0] == '<' && s[1] == '>' && isoperatorterm_symbol(s[2])) {
+        *string = &s[2];
+        return 1;
+    }
+
+    return 0;
+}
+
+static inline int parse_not(const char **string) {
+    const char *s = *string;
+
+    // NOT
+    if((s[0] == 'N' || s[0] == 'n') && (s[1] == 'O' || s[1] == 'o') && (s[2] == 'T' || s[2] == 't') && isoperatorterm_word(s[3])) {
+        *string = &s[3];
+        return 1;
+    }
+
+    if(s[0] == '!') {
+        *string = &s[1];
+        return 1;
+    }
+
+    return 0;
+}
+
+static inline int parse_multiply(const char **string) {
+    const char *s = *string;
+
+    // *
+    if(s[0] == '*' && isoperatorterm_symbol(s[1])) {
+        *string = &s[1];
+        return 1;
+    }
+
+    return 0;
+}
+
+static inline int parse_divide(const char **string) {
+    const char *s = *string;
+
+    // /
+    if(s[0] == '/' && isoperatorterm_symbol(s[1])) {
+        *string = &s[1];
+        return 1;
+    }
+
+    return 0;
+}
+
+static inline int parse_minus(const char **string) {
+    const char *s = *string;
+
+    // -
+    if(s[0] == '-' && isoperatorterm_symbol(s[1])) {
+        *string = &s[1];
+        return 1;
+    }
+
+    return 0;
+}
+
+static inline int parse_plus(const char **string) {
+    const char *s = *string;
+
+    // +
+    if(s[0] == '+' && isoperatorterm_symbol(s[1])) {
+        *string = &s[1];
+        return 1;
+    }
+
+    return 0;
+}
+
+static inline int parse_open_subexpression(const char **string) {
+    const char *s = *string;
+
+    // (
+    if(s[0] == '(') {
+        *string = &s[1];
+        return 1;
+    }
+
+    return 0;
+}
+
+#define parse_close_function(x) parse_close_subexpression(x)
+
+static inline int parse_close_subexpression(const char **string) {
+    const char *s = *string;
+
+    // )
+    if(s[0] == ')') {
+        *string = &s[1];
+        return 1;
+    }
+
+    return 0;
+}
+
+static inline int parse_variable(const char **string, char *buffer, size_t len) {
+    const char *s = *string;
+
+    // $
+    if(*s == '$') {
+        size_t i = 0;
+        s++;
+
+        if(*s == '{') {
+            // ${variable_name}
+
+            s++;
+            while (*s && *s != '}' && i < len)
+                buffer[i++] = *s++;
+
+            if(*s == '}')
+                s++;
+        }
+        else {
+            // $variable_name
+
+            while (*s && !isvariableterm(*s) && i < len)
+                buffer[i++] = *s++;
+        }
+
+        buffer[i] = '\0';
+
+        if (buffer[0]) {
+            *string = s;
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
+static inline int parse_constant(const char **string, NETDATA_DOUBLE *number) {
+    char *end = NULL;
+    NETDATA_DOUBLE n = str2ndd(*string, &end);
+    if(unlikely(!end || *string == end)) {
+        *number = 0;
+        return 0;
+    }
+    *number = n;
+    *string = end;
+    return 1;
+}
+
+static inline int parse_abs(const char **string) {
+    const char *s = *string;
+
+    // ABS
+    if((s[0] == 'A' || s[0] == 'a') && (s[1] == 'B' || s[1] == 'b') && (s[2] == 'S' || s[2] == 's') && s[3] == '(') {
+        *string = &s[3];
+        return 1;
+    }
+
+    return 0;
+}
+
+static inline int parse_if_then_else(const char **string) {
+    const char *s = *string;
+
+    // ?
+    if(s[0] == '?') {
+        *string = &s[1];
+        return 1;
+    }
+
+    return 0;
+}
+
+static struct operator_parser {
+    unsigned char id;
+    int (*parse)(const char **);
+} operator_parsers[] = {
+    // the order in this list is important!
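+    // (e.g. ">=" must be tried before ">", and "<=" and "<>" before "<")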
+    // the first matching will be used
+    // so place the longer of overlapping ones
+    // at the top
+
+    { EVAL_OPERATOR_AND, parse_and },
+    { EVAL_OPERATOR_OR, parse_or },
+    { EVAL_OPERATOR_GREATER_THAN_OR_EQUAL, parse_greater_than_or_equal },
+    { EVAL_OPERATOR_LESS_THAN_OR_EQUAL, parse_less_than_or_equal },
+    { EVAL_OPERATOR_NOT_EQUAL, parse_not_equal },
+    { EVAL_OPERATOR_EQUAL, parse_equal },
+    { EVAL_OPERATOR_LESS, parse_less },
+    { EVAL_OPERATOR_GREATER, parse_greater },
+    { EVAL_OPERATOR_PLUS, parse_plus },
+    { EVAL_OPERATOR_MINUS, parse_minus },
+    { EVAL_OPERATOR_MULTIPLY, parse_multiply },
+    { EVAL_OPERATOR_DIVIDE, parse_divide },
+    { EVAL_OPERATOR_IF_THEN_ELSE, parse_if_then_else },
+
+    /* we should not put in this list the following:
+     *
+     *  - NOT
+     *  - (
+     *  - )
+     *
+     * these are handled in code
+     */
+
+    // termination
+    { EVAL_OPERATOR_NOP, NULL }
+};
+
+static inline unsigned char parse_operator(const char **string, int *precedence) {
+    skip_spaces(string);
+
+    int i;
+    for(i = 0 ; operator_parsers[i].parse != NULL ; i++)
+        if(operator_parsers[i].parse(string)) {
+            if(precedence) *precedence = eval_precedence(operator_parsers[i].id);
+            return operator_parsers[i].id;
+        }
+
+    return EVAL_OPERATOR_NOP;
+}
+
+// ----------------------------------------------------------------------------
+// memory management
+
+static inline EVAL_NODE *eval_node_alloc(int count) {
+    static int id = 1;
+
+    EVAL_NODE *op = callocz(1, sizeof(EVAL_NODE) + (sizeof(EVAL_VALUE) * count));
+
+    op->id = id++;
+    op->operator = EVAL_OPERATOR_NOP;
+    op->precedence = eval_precedence(EVAL_OPERATOR_NOP);
+    op->count = count;
+    return op;
+}
+
+static inline void eval_node_set_value_to_node(EVAL_NODE *op, int pos, EVAL_NODE *value) {
+    if(pos >= op->count)
+        fatal("Invalid request to set position %d of OPERAND that has only %d values", pos + 1, op->count);
+
+    op->ops[pos].type = EVAL_VALUE_EXPRESSION;
+    op->ops[pos].expression = value;
+}
+
+static inline void eval_node_set_value_to_constant(EVAL_NODE *op, int pos, NETDATA_DOUBLE value) {
+    if(pos >= op->count)
+        fatal("Invalid request to set position %d of OPERAND that has only %d values", pos + 1, op->count);
+
+    op->ops[pos].type = EVAL_VALUE_NUMBER;
+    op->ops[pos].number = value;
+}
+
+static inline void eval_node_set_value_to_variable(EVAL_NODE *op, int pos, const char *variable) {
+    if(pos >= op->count)
+        fatal("Invalid request to set position %d of OPERAND that has only %d values", pos + 1, op->count);
+
+    op->ops[pos].type = EVAL_VALUE_VARIABLE;
+    op->ops[pos].variable = callocz(1, sizeof(EVAL_VARIABLE));
+    op->ops[pos].variable->name = string_strdupz(variable);
+}
+
+static inline void eval_variable_free(EVAL_VARIABLE *v) {
+    string_freez(v->name);
+    freez(v);
+}
+
+static inline void eval_value_free(EVAL_VALUE *v) {
+    switch(v->type) {
+        case EVAL_VALUE_EXPRESSION:
+            eval_node_free(v->expression);
+            break;
+
+        case EVAL_VALUE_VARIABLE:
+            eval_variable_free(v->variable);
+            break;
+
+        default:
+            break;
+    }
+}
+
+static inline void eval_node_free(EVAL_NODE *op) {
+    if(op->count) {
+        int i;
+        for(i = op->count - 1; i >= 0 ;i--)
+            eval_value_free(&op->ops[i]);
+    }
+
+    freez(op);
+}
+
+// ----------------------------------------------------------------------------
+// the parsing logic
+
+// helper function to avoid allocations all over the place
+static inline EVAL_NODE *parse_next_operand_given_its_operator(const char **string, unsigned char operator_type, int *error) {
+    EVAL_NODE *sub = parse_one_full_operand(string, error);
+    if(!sub) return NULL;
+
+    EVAL_NODE *op = eval_node_alloc(1);
+    op->operator = operator_type;
+    eval_node_set_value_to_node(op, 0, sub);
+    return op;
+}
+
+// parse a full operand, including its sign or other associative operator (e.g. NOT)
+static inline EVAL_NODE *parse_one_full_operand(const char **string, int *error) {
+    char variable_buffer[EVAL_MAX_VARIABLE_NAME_LENGTH + 1];
+    EVAL_NODE *op1 = NULL;
+    NETDATA_DOUBLE number;
+
+    *error = EVAL_ERROR_OK;
+
+    skip_spaces(string);
+    if(!(**string)) {
+        *error = EVAL_ERROR_MISSING_OPERAND;
+        return NULL;
+    }
+
+    if(parse_not(string)) {
+        op1 = parse_next_operand_given_its_operator(string, EVAL_OPERATOR_NOT, error);
+        if(op1) // the operand may have failed to parse
+            op1->precedence = eval_precedence(EVAL_OPERATOR_NOT);
+    }
+    else if(parse_plus(string)) {
+        op1 = parse_next_operand_given_its_operator(string, EVAL_OPERATOR_SIGN_PLUS, error);
+        if(op1)
+            op1->precedence = eval_precedence(EVAL_OPERATOR_SIGN_PLUS);
+    }
+    else if(parse_minus(string)) {
+        op1 = parse_next_operand_given_its_operator(string, EVAL_OPERATOR_SIGN_MINUS, error);
+        if(op1)
+            op1->precedence = eval_precedence(EVAL_OPERATOR_SIGN_MINUS);
+    }
+    else if(parse_abs(string)) {
+        op1 = parse_next_operand_given_its_operator(string, EVAL_OPERATOR_ABS, error);
+        if(op1)
+            op1->precedence = eval_precedence(EVAL_OPERATOR_ABS);
+    }
+    else if(parse_open_subexpression(string)) {
+        EVAL_NODE *sub = parse_full_expression(string, error);
+        if(sub) {
+            op1 = eval_node_alloc(1);
+            op1->operator = EVAL_OPERATOR_EXPRESSION_OPEN;
+            op1->precedence = eval_precedence(EVAL_OPERATOR_EXPRESSION_OPEN);
+            eval_node_set_value_to_node(op1, 0, sub);
+            if(!parse_close_subexpression(string)) {
+                *error = EVAL_ERROR_MISSING_CLOSE_SUBEXPRESSION;
+                eval_node_free(op1);
+                return NULL;
+            }
+        }
+    }
+    else if(parse_variable(string, variable_buffer, EVAL_MAX_VARIABLE_NAME_LENGTH)) {
+        op1 = eval_node_alloc(1);
+        op1->operator = EVAL_OPERATOR_NOP;
+        eval_node_set_value_to_variable(op1, 0, variable_buffer);
+    }
+    else if(parse_constant(string, &number)) {
+        op1 = eval_node_alloc(1);
+        op1->operator = EVAL_OPERATOR_NOP;
+        eval_node_set_value_to_constant(op1, 0, number);
+    }
+    else if(**string)
+        *error = EVAL_ERROR_UNKNOWN_OPERAND;
+    else
+        *error = EVAL_ERROR_MISSING_OPERAND;
+
+    return op1;
+}
+
+// parse an operator and the rest of the expression
+// precedence processing is handled here
+static inline EVAL_NODE *parse_rest_of_expression(const char **string, int *error, EVAL_NODE *op1) {
+    EVAL_NODE *op2 = NULL;
+    unsigned char operator;
+    int precedence;
+
+    operator = parse_operator(string, &precedence);
+    skip_spaces(string);
+
+    if(operator != EVAL_OPERATOR_NOP) {
+        op2 = parse_one_full_operand(string, error);
+        if(!op2) {
+            // error is already reported
+            eval_node_free(op1);
+            return NULL;
+        }
+
+        EVAL_NODE *op = eval_node_alloc(operators[operator].parameters);
+        op->operator = operator;
+        op->precedence = precedence;
+
+        if(operator == EVAL_OPERATOR_IF_THEN_ELSE && op->count == 3) {
+            skip_spaces(string);
+
+            if(**string != ':') {
+                eval_node_free(op);
+                eval_node_free(op1);
+                eval_node_free(op2);
+                *error = EVAL_ERROR_IF_THEN_ELSE_MISSING_ELSE;
+                return NULL;
+            }
+            (*string)++;
+
+            skip_spaces(string);
+
+            EVAL_NODE *op3 = parse_one_full_operand(string, error);
+            if(!op3) {
+                eval_node_free(op);
+                eval_node_free(op1);
+                eval_node_free(op2);
+                // error is already reported
+                return NULL;
+            }
+
+            eval_node_set_value_to_node(op, 2, op3);
+        }
+
+        eval_node_set_value_to_node(op, 1, op2);
+
+        // precedence processing
+        // if this operator has a higher precedence compared to its next
+        // put the next
operator on top of us (top = evaluated later) + // function recursion does the rest... + if(op->precedence > op1->precedence && op1->count == 2 && op1->operator != '(' && op1->ops[1].type == EVAL_VALUE_EXPRESSION) { + eval_node_set_value_to_node(op, 0, op1->ops[1].expression); + op1->ops[1].expression = op; + op = op1; + } + else + eval_node_set_value_to_node(op, 0, op1); + + return parse_rest_of_expression(string, error, op); + } + else if(**string == ')') { + ; + } + else if(**string) { + eval_node_free(op1); + op1 = NULL; + *error = EVAL_ERROR_MISSING_OPERATOR; + } + + return op1; +} + +// high level function to parse an expression or a sub-expression +static inline EVAL_NODE *parse_full_expression(const char **string, int *error) { + EVAL_NODE *op1 = parse_one_full_operand(string, error); + if(!op1) { + *error = EVAL_ERROR_MISSING_OPERAND; + return NULL; + } + + return parse_rest_of_expression(string, error, op1); +} + +// ---------------------------------------------------------------------------- +// public API + +int expression_evaluate(EVAL_EXPRESSION *expression) { + expression->error = EVAL_ERROR_OK; + + buffer_reset(expression->error_msg); + expression->result = eval_node(expression, expression->nodes, &expression->error); + + if(unlikely(isnan(expression->result))) { + if(expression->error == EVAL_ERROR_OK) + expression->error = EVAL_ERROR_VALUE_IS_NAN; + } + else if(unlikely(isinf(expression->result))) { + if(expression->error == EVAL_ERROR_OK) + expression->error = EVAL_ERROR_VALUE_IS_INFINITE; + } + else if(unlikely(expression->error == EVAL_ERROR_UNKNOWN_VARIABLE)) { + // although there is an unknown variable + // the expression was evaluated successfully + expression->error = EVAL_ERROR_OK; + } + + if(expression->error != EVAL_ERROR_OK) { + expression->result = NAN; + + if(buffer_strlen(expression->error_msg)) + buffer_strcat(expression->error_msg, "; "); + + buffer_sprintf(expression->error_msg, "failed to evaluate expression with error %d (%s)", expression->error, expression_strerror(expression->error)); + return 0; + } + + return 1; +} + +EVAL_EXPRESSION *expression_parse(const char *string, const char **failed_at, int *error) { + if(!string || !*string) + return NULL; + + const char *s = string; + int err = EVAL_ERROR_OK; + + EVAL_NODE *op = parse_full_expression(&s, &err); + + if(*s) { + if(op) { + eval_node_free(op); + op = NULL; + } + err = EVAL_ERROR_REMAINING_GARBAGE; + } + + if (failed_at) *failed_at = s; + if (error) *error = err; + + if(!op) { + unsigned long pos = s - string + 1; + netdata_log_error("failed to parse expression '%s': %s at character %lu (i.e.: '%s').", string, expression_strerror(err), pos, s); + return NULL; + } + + BUFFER *out = buffer_create(1024, NULL); + print_parsed_as_node(out, op, &err); + if(err != EVAL_ERROR_OK) { + netdata_log_error("failed to re-generate expression '%s' with reason: %s", string, expression_strerror(err)); + eval_node_free(op); + buffer_free(out); + return NULL; + } + + EVAL_EXPRESSION *exp = callocz(1, sizeof(EVAL_EXPRESSION)); + + exp->source = string_strdupz(string); + exp->parsed_as = string_strdupz(buffer_tostring(out)); + buffer_free(out); + + exp->error_msg = buffer_create(100, NULL); + exp->nodes = op; + + return exp; +} + +void expression_free(EVAL_EXPRESSION *expression) { + if(!expression) return; + + if(expression->nodes) eval_node_free(expression->nodes); + string_freez((void *)expression->source); + string_freez((void *)expression->parsed_as); + buffer_free(expression->error_msg); + freez(expression); 
+} + +const char *expression_strerror(int error) { + switch(error) { + case EVAL_ERROR_OK: + return "success"; + + case EVAL_ERROR_MISSING_CLOSE_SUBEXPRESSION: + return "missing closing parenthesis"; + + case EVAL_ERROR_UNKNOWN_OPERAND: + return "unknown operand"; + + case EVAL_ERROR_MISSING_OPERAND: + return "expected operand"; + + case EVAL_ERROR_MISSING_OPERATOR: + return "expected operator"; + + case EVAL_ERROR_REMAINING_GARBAGE: + return "remaining characters after expression"; + + case EVAL_ERROR_INVALID_VALUE: + return "invalid value structure - internal error"; + + case EVAL_ERROR_INVALID_NUMBER_OF_OPERANDS: + return "wrong number of operands for operation - internal error"; + + case EVAL_ERROR_VALUE_IS_NAN: + return "value is unset"; + + case EVAL_ERROR_VALUE_IS_INFINITE: + return "computed value is infinite"; + + case EVAL_ERROR_UNKNOWN_VARIABLE: + return "undefined variable"; + + case EVAL_ERROR_IF_THEN_ELSE_MISSING_ELSE: + return "missing second sub-expression of inline conditional"; + + default: + return "unknown error"; + } +} + +const char *expression_source(EVAL_EXPRESSION *expression) { + if(!expression) + return string2str(NULL); + + return string2str(expression->source); +} + +const char *expression_parsed_as(EVAL_EXPRESSION *expression) { + if(!expression) + return string2str(NULL); + + return string2str(expression->parsed_as); +} + +const char *expression_error_msg(EVAL_EXPRESSION *expression) { + if(!expression || !expression->error_msg) + return ""; + + return buffer_tostring(expression->error_msg); +} + +NETDATA_DOUBLE expression_result(EVAL_EXPRESSION *expression) { + if(!expression) + return NAN; + + return expression->result; +} + +void expression_set_variable_lookup_callback(EVAL_EXPRESSION *expression, eval_expression_variable_lookup_t cb, void *data) { + if(!expression) + return; + + expression->variable_lookup_cb = cb; + expression->variable_lookup_cb_data = data; +} + +static size_t expression_hardcode_node_variable(EVAL_NODE *node, STRING *variable, NETDATA_DOUBLE value) { + size_t matches = 0; + + for(int i = 0; i < node->count; i++) { + switch(node->ops[i].type) { + case EVAL_VALUE_NUMBER: + case EVAL_VALUE_INVALID: + break; + + case EVAL_VALUE_VARIABLE: + if(node->ops[i].variable->name == variable) { + string_freez(node->ops[i].variable->name); + freez(node->ops[i].variable); + node->ops[i].type = EVAL_VALUE_NUMBER; + node->ops[i].number = value; + matches++; + } + break; + + case EVAL_VALUE_EXPRESSION: + matches += expression_hardcode_node_variable(node->ops[i].expression, variable, value); + break; + } + } + + return matches; +} + +void expression_hardcode_variable(EVAL_EXPRESSION *expression, STRING *variable, NETDATA_DOUBLE value) { + if (!expression || !variable || isnan(value)) + return; + + size_t matches = expression_hardcode_node_variable(expression->nodes, variable, value); + if (matches) { + char replace[1024]; + snprintfz(replace, sizeof(replace), NETDATA_DOUBLE_FORMAT_AUTO, value); + size_t replace_len = strlen(replace); + + size_t source_len = string_strlen(expression->source); + const char *source_str = string2str(expression->source); + + // Allocate enough space to accommodate all replacements. 
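+        // (every occurrence being replaced is at least 2 bytes long ("$x"),
+        // so the result can grow by at most matches * replace_len bytes; the
+        // extra byte per match and the final +1 leave room for the '\0')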
+        char buf[source_len + 1 + matches * (replace_len + 1)];
+
+        char find1[string_strlen(variable) + 1 + 1];
+        snprintfz(find1, sizeof(find1), "$%s", string2str(variable));
+        size_t find1_len = strlen(find1);
+
+        char find2[string_strlen(variable) + 1 + 3];
+        snprintfz(find2, sizeof(find2), "${%s}", string2str(variable));
+        size_t find2_len = strlen(find2);
+
+        size_t found = 0;
+        char *buf_ptr = buf;
+        const char *source_ptr = source_str;
+
+        while (*source_ptr) {
+            char *s1 = strstr(source_ptr, find1);
+            char *s2 = strstr(source_ptr, find2);
+
+            char *s = s1;
+            size_t len = find1_len;
+            if (s2 && (!s1 || s2 < s1)) {
+                s = s2;
+                len = find2_len;
+            }
+
+            if (s) {
+                if (s == s1 && (isalnum((uint8_t)s[len]) || s[len] == '_')) {
+                    // Move past the variable if it's part of a larger word.
+                    source_ptr = s + len;
+                    continue;
+                }
+
+                // Copy the part before the variable.
+                memcpy(buf_ptr, source_ptr, s - source_ptr);
+                buf_ptr += (s - source_ptr);
+
+                // Copy the replacement.
+                memcpy(buf_ptr, replace, replace_len);
+                buf_ptr += replace_len;
+                *buf_ptr = '\0';
+
+                // Move the source pointer past the replaced variable.
+                source_ptr = s + len;
+                found++;
+            } else {
+                // Copy the rest of the string if no more variables are found.
+                strcpy(buf_ptr, source_ptr);
+                break;
+            }
+        }
+
+        // Update the expression source with the new string.
+        string_freez(expression->source);
+        expression->source = string_strdupz(buf);
+    }
+}
diff --git a/src/libnetdata/eval/eval.h b/src/libnetdata/eval/eval.h
new file mode 100644
index 00000000..48a3b073
--- /dev/null
+++ b/src/libnetdata/eval/eval.h
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef NETDATA_EVAL_H
+#define NETDATA_EVAL_H 1
+
+#include "../libnetdata.h"
+
+#define EVAL_MAX_VARIABLE_NAME_LENGTH 300
+
+struct eval_expression;
+typedef struct eval_expression EVAL_EXPRESSION;
+typedef bool (*eval_expression_variable_lookup_t)(STRING *variable, void *data, NETDATA_DOUBLE *result);
+
+// parsing and evaluation
+#define EVAL_ERROR_OK 0
+
+// parsing errors
+#define EVAL_ERROR_MISSING_CLOSE_SUBEXPRESSION 1
+#define EVAL_ERROR_UNKNOWN_OPERAND 2
+#define EVAL_ERROR_MISSING_OPERAND 3
+#define EVAL_ERROR_MISSING_OPERATOR 4
+#define EVAL_ERROR_REMAINING_GARBAGE 5
+#define EVAL_ERROR_IF_THEN_ELSE_MISSING_ELSE 6
+
+// evaluation errors
+#define EVAL_ERROR_INVALID_VALUE 101
+#define EVAL_ERROR_INVALID_NUMBER_OF_OPERANDS 102
+#define EVAL_ERROR_VALUE_IS_NAN 103
+#define EVAL_ERROR_VALUE_IS_INFINITE 104
+#define EVAL_ERROR_UNKNOWN_VARIABLE 105
+
+// parse the given string as an expression and return:
+// a pointer to an expression if it parsed OK
+// NULL on failure, in which case *error holds the error code
+EVAL_EXPRESSION *expression_parse(const char *string, const char **failed_at, int *error);
+
+// free all resources allocated for an expression
+void expression_free(EVAL_EXPRESSION *expression);
+
+// convert an error code to a message
+const char *expression_strerror(int error);
+
+// evaluate an expression and return
+// 1 = OK, the result is in: expression->result
+// 0 = FAILED, the error message is in: buffer_tostring(expression->error_msg)
+int expression_evaluate(EVAL_EXPRESSION *expression);
+
+const char *expression_source(EVAL_EXPRESSION *expression);
+const char *expression_parsed_as(EVAL_EXPRESSION *expression);
+const char *expression_error_msg(EVAL_EXPRESSION *expression);
+NETDATA_DOUBLE expression_result(EVAL_EXPRESSION *expression);
+void expression_set_variable_lookup_callback(EVAL_EXPRESSION *expression,
eval_expression_variable_lookup_t cb, void *data); + +void expression_hardcode_variable(EVAL_EXPRESSION *expression, STRING *variable, NETDATA_DOUBLE value); + +#endif //NETDATA_EVAL_H diff --git a/src/libnetdata/facets/README.md b/src/libnetdata/facets/README.md new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/src/libnetdata/facets/README.md diff --git a/src/libnetdata/facets/facets.c b/src/libnetdata/facets/facets.c new file mode 100644 index 00000000..3c746cbc --- /dev/null +++ b/src/libnetdata/facets/facets.c @@ -0,0 +1,2781 @@ +// SPDX-License-Identifier: GPL-3.0-or-later +#include "facets.h" + +#define FACETS_HISTOGRAM_COLUMNS 150 // the target number of points in a histogram +#define FACETS_KEYS_WITH_VALUES_MAX 200 // the max number of keys that can be facets +#define FACETS_KEYS_IN_ROW_MAX 500 // the max number of keys in a row + +#define FACETS_KEYS_HASHTABLE_ENTRIES 15 +#define FACETS_VALUES_HASHTABLE_ENTRIES 15 + +static inline void facets_reset_key(FACET_KEY *k); + +// ---------------------------------------------------------------------------- + +static const char id_encoding_characters[64 + 1] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ.abcdefghijklmnopqrstuvwxyz_0123456789"; +static const uint8_t id_encoding_characters_reverse[256] = { + ['A'] = 0, ['B'] = 1, ['C'] = 2, ['D'] = 3, + ['E'] = 4, ['F'] = 5, ['G'] = 6, ['H'] = 7, + ['I'] = 8, ['J'] = 9, ['K'] = 10, ['L'] = 11, + ['M'] = 12, ['N'] = 13, ['O'] = 14, ['P'] = 15, + ['Q'] = 16, ['R'] = 17, ['S'] = 18, ['T'] = 19, + ['U'] = 20, ['V'] = 21, ['W'] = 22, ['X'] = 23, + ['Y'] = 24, ['Z'] = 25, ['.'] = 26, ['a'] = 27, + ['b'] = 28, ['c'] = 29, ['d'] = 30, ['e'] = 31, + ['f'] = 32, ['g'] = 33, ['h'] = 34, ['i'] = 35, + ['j'] = 36, ['k'] = 37, ['l'] = 38, ['m'] = 39, + ['n'] = 40, ['o'] = 41, ['p'] = 42, ['q'] = 43, + ['r'] = 44, ['s'] = 45, ['t'] = 46, ['u'] = 47, + ['v'] = 48, ['w'] = 49, ['x'] = 50, ['y'] = 51, + ['z'] = 52, ['_'] = 53, ['0'] = 54, ['1'] = 55, + ['2'] = 56, ['3'] = 57, ['4'] = 58, ['5'] = 59, + ['6'] = 60, ['7'] = 61, ['8'] = 62, ['9'] = 63 +}; + +#define FACET_STRING_HASH_SIZE 12 +#define FACETS_HASH XXH64_hash_t +#define FACETS_HASH_FUNCTION(src, len) XXH3_64bits(src, len) +#define FACETS_HASH_ZERO (FACETS_HASH)0 +#define FACETS_HASH_UNSAMPLED (FACETS_HASH)(UINT64_MAX - 1) +#define FACETS_HASH_ESTIMATED (FACETS_HASH)UINT64_MAX + +static inline void facets_hash_to_str(FACETS_HASH num, char *out) { + out[11] = '\0'; + out[10] = id_encoding_characters[num & 63]; num >>= 6; + out[9] = id_encoding_characters[num & 63]; num >>= 6; + out[8] = id_encoding_characters[num & 63]; num >>= 6; + out[7] = id_encoding_characters[num & 63]; num >>= 6; + out[6] = id_encoding_characters[num & 63]; num >>= 6; + out[5] = id_encoding_characters[num & 63]; num >>= 6; + out[4] = id_encoding_characters[num & 63]; num >>= 6; + out[3] = id_encoding_characters[num & 63]; num >>= 6; + out[2] = id_encoding_characters[num & 63]; num >>= 6; + out[1] = id_encoding_characters[num & 63]; num >>= 6; + out[0] = id_encoding_characters[num & 63]; +} + +static inline FACETS_HASH str_to_facets_hash(const char *str) { + FACETS_HASH num = 0; + int shifts = 6 * (FACET_STRING_HASH_SIZE - 2); + + num |= ((FACETS_HASH)(id_encoding_characters_reverse[(uint8_t)(str[0])])) << shifts; shifts -= 6; + num |= ((FACETS_HASH)(id_encoding_characters_reverse[(uint8_t)(str[1])])) << shifts; shifts -= 6; + num |= ((FACETS_HASH)(id_encoding_characters_reverse[(uint8_t)(str[2])])) << shifts; shifts -= 6; + num |= 
((FACETS_HASH)(id_encoding_characters_reverse[(uint8_t)(str[3])])) << shifts; shifts -= 6; + num |= ((FACETS_HASH)(id_encoding_characters_reverse[(uint8_t)(str[4])])) << shifts; shifts -= 6; + num |= ((FACETS_HASH)(id_encoding_characters_reverse[(uint8_t)(str[5])])) << shifts; shifts -= 6; + num |= ((FACETS_HASH)(id_encoding_characters_reverse[(uint8_t)(str[6])])) << shifts; shifts -= 6; + num |= ((FACETS_HASH)(id_encoding_characters_reverse[(uint8_t)(str[7])])) << shifts; shifts -= 6; + num |= ((FACETS_HASH)(id_encoding_characters_reverse[(uint8_t)(str[8])])) << shifts; shifts -= 6; + num |= ((FACETS_HASH)(id_encoding_characters_reverse[(uint8_t)(str[9])])) << shifts; shifts -= 6; + num |= ((FACETS_HASH)(id_encoding_characters_reverse[(uint8_t)(str[10])])) << shifts; + + return num; +} + +static const char *hash_to_static_string(FACETS_HASH hash) { + static __thread char hash_str[FACET_STRING_HASH_SIZE]; + facets_hash_to_str(hash, hash_str); + return hash_str; +} + +static inline bool is_valid_string_hash(const char *s) { + if(strlen(s) != FACET_STRING_HASH_SIZE - 1) { + netdata_log_error("The user supplied key '%s' does not have the right length for a facets hash.", s); + return false; + } + + uint8_t *t = (uint8_t *)s; + while(*t) { + if(id_encoding_characters_reverse[*t] == 0 && *t != id_encoding_characters[0]) { + netdata_log_error("The user supplied key '%s' contains invalid characters for a facets hash.", s); + return false; + } + + t++; + } + + return true; +} + +// ---------------------------------------------------------------------------- +// hashtable for FACET_VALUE + +// cleanup hashtable defines +#include "../../libnetdata/simple_hashtable_undef.h" + +struct facet_value; +// #define SIMPLE_HASHTABLE_SORT_FUNCTION compare_facet_value +#define SIMPLE_HASHTABLE_VALUE_TYPE struct facet_value +#define SIMPLE_HASHTABLE_NAME _VALUE +#include "../simple_hashtable.h" + +// ---------------------------------------------------------------------------- +// hashtable for FACET_KEY + +// cleanup hashtable defines +#include "../../libnetdata/simple_hashtable_undef.h" + +struct facet_key; +// #define SIMPLE_HASHTABLE_SORT_FUNCTION compare_facet_key +#define SIMPLE_HASHTABLE_VALUE_TYPE struct facet_key +#define SIMPLE_HASHTABLE_NAME _KEY +#include "../simple_hashtable.h" + +// ---------------------------------------------------------------------------- + +typedef struct facet_value { + FACETS_HASH hash; + const char *name; + const char *color; + uint32_t name_len; + + bool selected; + bool empty; + bool unsampled; + bool estimated; + + uint32_t rows_matching_facet_value; + uint32_t final_facet_value_counter; + uint32_t order; + + uint32_t *histogram; + uint32_t min, max, sum; + + struct facet_value *prev, *next; +} FACET_VALUE; + +typedef enum { + FACET_KEY_VALUE_NONE = 0, + FACET_KEY_VALUE_UPDATED = (1 << 0), + FACET_KEY_VALUE_EMPTY = (1 << 1), + FACET_KEY_VALUE_UNSAMPLED = (1 << 2), + FACET_KEY_VALUE_ESTIMATED = (1 << 3), + FACET_KEY_VALUE_COPIED = (1 << 4), +} FACET_KEY_VALUE_FLAGS; + +#define facet_key_value_updated(k) ((k)->current_value.flags & FACET_KEY_VALUE_UPDATED) +#define facet_key_value_empty(k) ((k)->current_value.flags & FACET_KEY_VALUE_EMPTY) +#define facet_key_value_unsampled(k) ((k)->current_value.flags & FACET_KEY_VALUE_UNSAMPLED) +#define facet_key_value_estimated(k) ((k)->current_value.flags & FACET_KEY_VALUE_ESTIMATED) +#define facet_key_value_empty_or_unsampled_or_estimated(k) ((k)->current_value.flags & 
(FACET_KEY_VALUE_EMPTY|FACET_KEY_VALUE_UNSAMPLED|FACET_KEY_VALUE_ESTIMATED)) +#define facet_key_value_copied(k) ((k)->current_value.flags & FACET_KEY_VALUE_COPIED) + +struct facet_key { + FACETS *facets; + + FACETS_HASH hash; + const char *name; + + FACET_KEY_OPTIONS options; + + bool default_selected_for_values; // the default "selected" for all values in the dictionary + + // members about the current row + uint32_t key_found_in_row; + uint32_t key_values_selected_in_row; + uint32_t order; + + struct { + bool enabled; + uint32_t used; + FACET_VALUE *ll; + SIMPLE_HASHTABLE_VALUE ht; + } values; + + struct { + FACETS_HASH hash; + FACET_KEY_VALUE_FLAGS flags; + const char *raw; + uint32_t raw_len; + BUFFER *b; + FACET_VALUE *v; + } current_value; + + struct { + FACET_VALUE *v; + } empty_value; + + struct { + FACET_VALUE *v; + } unsampled_value; + + struct { + FACET_VALUE *v; + } estimated_value; + + struct { + facet_dynamic_row_t cb; + void *data; + } dynamic; + + struct { + bool view_only; + facets_key_transformer_t cb; + void *data; + } transform; + + struct facet_key *prev, *next; +}; + +struct facets { + SIMPLE_PATTERN *visible_keys; + SIMPLE_PATTERN *excluded_keys; + SIMPLE_PATTERN *included_keys; + + FACETS_OPTIONS options; + + struct { + usec_t start_ut; + usec_t stop_ut; + FACETS_ANCHOR_DIRECTION direction; + } anchor; + + SIMPLE_PATTERN *query; // the full text search pattern + size_t keys_filtered_by_query; // the number of fields we do full text search (constant) + + DICTIONARY *accepted_params; + + struct { + size_t count; + FACET_KEY *ll; + SIMPLE_HASHTABLE_KEY ht; + } keys; + + struct { + // this is like a stack, of the keys that are used as facets + size_t used; + FACET_KEY *array[FACETS_KEYS_WITH_VALUES_MAX]; + } keys_with_values; + + struct { + // this is like a stack, of the keys that need to clean up between each row + size_t used; + FACET_KEY *array[FACETS_KEYS_IN_ROW_MAX]; + } keys_in_row; + + FACET_ROW *base; // double linked list of the selected facets rows + + uint32_t items_to_return; + uint32_t max_items_to_return; + uint32_t order; + + struct { + FACET_ROW_SEVERITY severity; + size_t keys_matched_by_query_positive; // the number of fields matched the full text search (per row) + size_t keys_matched_by_query_negative; // the number of fields matched the full text search (per row) + } current_row; + + struct { + usec_t after_ut; + usec_t before_ut; + } timeframe; + + struct { + FACET_KEY *key; + FACETS_HASH hash; + char *chart; + bool enabled; + uint32_t slots; + usec_t slot_width_ut; + usec_t after_ut; + usec_t before_ut; + } histogram; + + struct { + facet_row_severity_t cb; + void *data; + } severity; + + struct { + FACET_ROW *last_added; + + size_t first; + size_t forwards; + size_t backwards; + size_t skips_before; + size_t skips_after; + size_t prepends; + size_t appends; + size_t shifts; + + struct { + size_t evaluated; + size_t matched; + size_t unsampled; + size_t estimated; + size_t created; + size_t reused; + } rows; + + struct { + size_t registered; + size_t unique; + } keys; + + struct { + size_t registered; + size_t transformed; + size_t dynamic; + size_t empty; + size_t unsampled; + size_t estimated; + size_t indexed; + size_t inserts; + size_t conflicts; + } values; + + struct { + size_t searches; + } fts; + } operations; + + struct { + DICTIONARY *used_hashes_registry; + } report; +}; + +usec_t facets_row_oldest_ut(FACETS *facets) { + if(facets->base) + return facets->base->prev->usec; + + return 0; +} + +usec_t facets_row_newest_ut(FACETS *facets) 
{ + if(facets->base) + return facets->base->usec; + + return 0; +} + +uint32_t facets_rows(FACETS *facets) { + return facets->items_to_return; +} + +// ---------------------------------------------------------------------------- + +static void facets_row_free(FACETS *facets __maybe_unused, FACET_ROW *row); +static inline void facet_value_is_used(FACET_KEY *k, FACET_VALUE *v); +static inline bool facets_key_is_facet(FACETS *facets, FACET_KEY *k); + +// ---------------------------------------------------------------------------- +// The FACET_VALUE index within each FACET_KEY + +#define foreach_value_in_key(k, v) \ + for((v) = (k)->values.ll; (v) ;(v) = (v)->next) + +#define foreach_value_in_key_done(v) do { ; } while(0) + +static inline void FACETS_VALUES_INDEX_CREATE(FACET_KEY *k) { + k->values.ll = NULL; + k->values.used = 0; + simple_hashtable_init_VALUE(&k->values.ht, FACETS_VALUES_HASHTABLE_ENTRIES); +} + +static inline void FACETS_VALUES_INDEX_DESTROY(FACET_KEY *k) { + FACET_VALUE *v = k->values.ll; + while(v) { + FACET_VALUE *next = v->next; + freez(v->histogram); + freez((void *)v->name); + freez(v); + v = next; + } + k->values.ll = NULL; + k->values.used = 0; + k->values.enabled = false; + + simple_hashtable_destroy_VALUE(&k->values.ht); +} + +static inline const char *facets_key_get_value(FACET_KEY *k) { + return facet_key_value_copied(k) ? buffer_tostring(k->current_value.b) : k->current_value.raw; +} + +static inline uint32_t facets_key_get_value_length(FACET_KEY *k) { + return facet_key_value_copied(k) ? buffer_strlen(k->current_value.b) : k->current_value.raw_len; +} + +static inline void facets_key_value_copy_to_buffer(FACET_KEY *k) { + if(!facet_key_value_copied(k)) { + buffer_contents_replace(k->current_value.b, k->current_value.raw, k->current_value.raw_len); + k->current_value.flags |= FACET_KEY_VALUE_COPIED; + } +} + +static const char *facets_value_dup(const char *s, uint32_t len) { + char *d = mallocz(len + 1); + + if(len) + memcpy(d, s, len); + + d[len] = '\0'; + + return d; +} + +static inline void FACET_VALUE_ADD_CONFLICT(FACET_KEY *k, FACET_VALUE *v, const FACET_VALUE * const nv) { + if(!v->name && !v->name_len && nv->name && nv->name_len) { + // an actual value, not a filter + v->name = facets_value_dup(nv->name, nv->name_len); + v->name_len = nv->name_len; + } + + if(v->name && v->name_len) + facet_value_is_used(k, v); + + internal_fatal(v->name && nv->name && v->name_len == nv->name_len && memcmp(v->name, nv->name, v->name_len) != 0, + "value hash conflict: '%s' and '%s' have the same hash '%s'", + v->name, nv->name, hash_to_static_string(v->hash)); + + k->facets->operations.values.conflicts++; +} + +static inline FACET_VALUE *FACET_VALUE_GET_FROM_INDEX(FACET_KEY *k, FACETS_HASH hash) { + SIMPLE_HASHTABLE_SLOT_VALUE *slot = simple_hashtable_get_slot_VALUE(&k->values.ht, hash, NULL, true); + return SIMPLE_HASHTABLE_SLOT_DATA(slot); +} + +static inline FACET_VALUE *FACET_VALUE_ADD_TO_INDEX(FACET_KEY *k, const FACET_VALUE * const tv) { + SIMPLE_HASHTABLE_SLOT_VALUE *slot = simple_hashtable_get_slot_VALUE(&k->values.ht, tv->hash, NULL, true); + + if(SIMPLE_HASHTABLE_SLOT_DATA(slot)) { + // already exists + + FACET_VALUE *v = SIMPLE_HASHTABLE_SLOT_DATA(slot); + FACET_VALUE_ADD_CONFLICT(k, v, tv); + return v; + } + + // we have to add it + + FACET_VALUE *v = mallocz(sizeof(*v)); + simple_hashtable_set_slot_VALUE(&k->values.ht, slot, tv->hash, v); + + memcpy(v, tv, sizeof(*v)); + + if(v->estimated || v->unsampled) { + if(k->values.ll && k->values.ll->estimated) { + 
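+            // an "estimated" pseudo-value is already at the head of the list;
+            // keep it first and link the new special value right after it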
FACET_VALUE *estimated = k->values.ll; + DOUBLE_LINKED_LIST_INSERT_ITEM_AFTER_UNSAFE(k->values.ll, estimated, v, prev, next); + } + else + DOUBLE_LINKED_LIST_PREPEND_ITEM_UNSAFE(k->values.ll, v, prev, next); + } + else + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(k->values.ll, v, prev, next); + + k->values.used++; + + if(!v->selected) + v->selected = k->default_selected_for_values; + + if(v->name && v->name_len) { + // an actual value, not a filter + v->name = facets_value_dup(v->name, v->name_len); + facet_value_is_used(k, v); + } + else { + v->name = NULL; + v->name_len = 0; + } + + k->facets->operations.values.inserts++; + + return v; +} + +static inline void FACET_VALUE_ADD_UNSAMPLED_VALUE_TO_INDEX(FACET_KEY *k) { + static const FACET_VALUE tv = { + .hash = FACETS_HASH_UNSAMPLED, + .name = FACET_VALUE_UNSAMPLED, + .name_len = sizeof(FACET_VALUE_UNSAMPLED) - 1, + .unsampled = true, + .color = "offline", + }; + + k->current_value.hash = FACETS_HASH_UNSAMPLED; + + if(k->unsampled_value.v) { + FACET_VALUE_ADD_CONFLICT(k, k->unsampled_value.v, &tv); + k->current_value.v = k->unsampled_value.v; + } + else { + FACET_VALUE *v = FACET_VALUE_ADD_TO_INDEX(k, &tv); + v->unsampled = true; + k->unsampled_value.v = v; + k->current_value.v = v; + } +} + +static inline void FACET_VALUE_ADD_ESTIMATED_VALUE_TO_INDEX(FACET_KEY *k) { + static const FACET_VALUE tv = { + .hash = FACETS_HASH_ESTIMATED, + .name = FACET_VALUE_ESTIMATED, + .name_len = sizeof(FACET_VALUE_ESTIMATED) - 1, + .estimated = true, + .color = "generic", + }; + + k->current_value.hash = FACETS_HASH_ESTIMATED; + + if(k->estimated_value.v) { + FACET_VALUE_ADD_CONFLICT(k, k->estimated_value.v, &tv); + k->current_value.v = k->estimated_value.v; + } + else { + FACET_VALUE *v = FACET_VALUE_ADD_TO_INDEX(k, &tv); + v->estimated = true; + k->estimated_value.v = v; + k->current_value.v = v; + } +} + +static inline void FACET_VALUE_ADD_EMPTY_VALUE_TO_INDEX(FACET_KEY *k) { + static const FACET_VALUE tv = { + .hash = FACETS_HASH_ZERO, + .name = FACET_VALUE_UNSET, + .name_len = sizeof(FACET_VALUE_UNSET) - 1, + .empty = true, + }; + + k->current_value.hash = FACETS_HASH_ZERO; + + if(k->empty_value.v) { + FACET_VALUE_ADD_CONFLICT(k, k->empty_value.v, &tv); + k->current_value.v = k->empty_value.v; + } + else { + FACET_VALUE *v = FACET_VALUE_ADD_TO_INDEX(k, &tv); + v->empty = true; + k->empty_value.v = v; + k->current_value.v = v; + } +} + +static inline void FACET_VALUE_ADD_CURRENT_VALUE_TO_INDEX(FACET_KEY *k) { + static __thread FACET_VALUE tv = { 0 }; + + internal_fatal(!facet_key_value_updated(k), "trying to add a non-updated value to the index"); + + tv.name = facets_key_get_value(k); + tv.name_len = facets_key_get_value_length(k); + tv.hash = FACETS_HASH_FUNCTION(tv.name, tv.name_len); + tv.empty = false; + tv.estimated = false; + tv.unsampled = false; + + k->current_value.v = FACET_VALUE_ADD_TO_INDEX(k, &tv); + k->facets->operations.values.indexed++; +} + +static inline void FACET_VALUE_ADD_OR_UPDATE_SELECTED(FACET_KEY *k, FACETS_HASH hash) { + FACET_VALUE tv = { + .hash = hash, + .selected = true, + .name = NULL, + .name_len = 0, + }; + FACET_VALUE_ADD_TO_INDEX(k, &tv); +} + +// ---------------------------------------------------------------------------- +// The FACET_KEY index within each FACET + +#define foreach_key_in_facets(facets, k) \ + for((k) = (facets)->keys.ll; (k) ;(k) = (k)->next) + +#define foreach_key_in_facets_done(k) do { ; } while(0) + +static inline void facet_key_late_init(FACETS *facets, FACET_KEY *k) { + if(k->values.enabled) + 
return; + + if(facets_key_is_facet(facets, k)) { + FACETS_VALUES_INDEX_CREATE(k); + k->values.enabled = true; + if(facets->keys_with_values.used < FACETS_KEYS_WITH_VALUES_MAX) + facets->keys_with_values.array[facets->keys_with_values.used++] = k; + } +} + +static inline void FACETS_KEYS_INDEX_CREATE(FACETS *facets) { + facets->keys.ll = NULL; + facets->keys.count = 0; + facets->keys_with_values.used = 0; + + simple_hashtable_init_KEY(&facets->keys.ht, FACETS_KEYS_HASHTABLE_ENTRIES); +} + +static inline void FACETS_KEYS_INDEX_DESTROY(FACETS *facets) { + FACET_KEY *k = facets->keys.ll; + while(k) { + FACET_KEY *next = k->next; + + FACETS_VALUES_INDEX_DESTROY(k); + buffer_free(k->current_value.b); + freez((void *)k->name); + freez(k); + + k = next; + } + facets->keys.ll = NULL; + facets->keys.count = 0; + facets->keys_with_values.used = 0; + + simple_hashtable_destroy_KEY(&facets->keys.ht); +} + +static inline FACET_KEY *FACETS_KEY_GET_FROM_INDEX(FACETS *facets, FACETS_HASH hash) { + SIMPLE_HASHTABLE_SLOT_KEY *slot = simple_hashtable_get_slot_KEY(&facets->keys.ht, hash, NULL, true); + return SIMPLE_HASHTABLE_SLOT_DATA(slot); +} + +bool facets_key_name_value_length_is_selected(FACETS *facets, const char *key, size_t key_length, const char *value, size_t value_length) { + FACETS_HASH hash = FACETS_HASH_FUNCTION(key, key_length); + FACET_KEY *k = FACETS_KEY_GET_FROM_INDEX(facets, hash); + if(!k || k->default_selected_for_values) + return false; + + hash = FACETS_HASH_FUNCTION(value, value_length); + FACET_VALUE *v = FACET_VALUE_GET_FROM_INDEX(k, hash); + return (v && v->selected) ? true : false; +} + +void facets_add_possible_value_name_to_key(FACETS *facets, const char *key, size_t key_length, const char *value, size_t value_length) { + FACETS_HASH hash = FACETS_HASH_FUNCTION(key, key_length); + FACET_KEY *k = FACETS_KEY_GET_FROM_INDEX(facets, hash); + if(!k) return; + + hash = FACETS_HASH_FUNCTION(value, value_length); + FACET_VALUE *v = FACET_VALUE_GET_FROM_INDEX(k, hash); + if(v && v->name && v->name_len) return; + + FACET_VALUE tv = { + .hash = hash, + .name = value, + .name_len = value_length, + }; + FACET_VALUE_ADD_TO_INDEX(k, &tv); +} + +static void facet_key_set_name(FACET_KEY *k, const char *name, size_t name_length) { + internal_fatal(k->name && name && (strncmp(k->name, name, name_length) != 0 || k->name[name_length] != '\0'), + "key hash conflict: '%s' and '%s' have the same hash", + k->name, name); + + if(likely(k->name || !name || !name_length)) + return; + + // an actual value, not a filter + + char buf[name_length + 1]; + memcpy(buf, name, name_length); + buf[name_length] = '\0'; + + internal_fatal(strchr(buf, '='), "found = in key"); + + k->name = strdupz(buf); + facet_key_late_init(k->facets, k); +} + +static inline FACET_KEY *FACETS_KEY_CREATE(FACETS *facets, FACETS_HASH hash, const char *name, size_t name_length, FACET_KEY_OPTIONS options) { + facets->operations.keys.unique++; + + FACET_KEY *k = callocz(1, sizeof(*k)); + + k->hash = hash; + k->facets = facets; + k->options = options; + k->current_value.b = buffer_create(sizeof(FACET_VALUE_UNSET), NULL); + k->default_selected_for_values = true; + + if(!(k->options & FACET_KEY_OPTION_REORDER)) + k->order = facets->order++; + + if((k->options & FACET_KEY_OPTION_FTS) || (facets->options & FACETS_OPTION_ALL_KEYS_FTS)) + facets->keys_filtered_by_query++; + + facet_key_set_name(k, name, name_length); + + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(facets->keys.ll, k, prev, next); + facets->keys.count++; + + return k; +} + +static inline 
FACET_KEY *FACETS_KEY_ADD_TO_INDEX(FACETS *facets, FACETS_HASH hash, const char *name, size_t name_length, FACET_KEY_OPTIONS options) { + facets->operations.keys.registered++; + + SIMPLE_HASHTABLE_SLOT_KEY *slot = simple_hashtable_get_slot_KEY(&facets->keys.ht, hash, NULL, true); + + if(unlikely(!SIMPLE_HASHTABLE_SLOT_DATA(slot))) { + // we have to add it + FACET_KEY *k = FACETS_KEY_CREATE(facets, hash, name, name_length, options); + + simple_hashtable_set_slot_KEY(&facets->keys.ht, slot, hash, k); + + return k; + } + + // already in the index + + FACET_KEY *k = SIMPLE_HASHTABLE_SLOT_DATA(slot); + + facet_key_set_name(k, name, name_length); + + if(unlikely(k->options & FACET_KEY_OPTION_REORDER)) { + k->order = facets->order++; + k->options &= ~FACET_KEY_OPTION_REORDER; + } + + return k; +} + +bool facets_key_name_is_filter(FACETS *facets, const char *key) { + FACETS_HASH hash = FACETS_HASH_FUNCTION(key, strlen(key)); + FACET_KEY *k = FACETS_KEY_GET_FROM_INDEX(facets, hash); + return (!k || k->default_selected_for_values) ? false : true; +} + +bool facets_key_name_is_facet(FACETS *facets, const char *key) { + size_t key_len = strlen(key); + FACETS_HASH hash = FACETS_HASH_FUNCTION(key, key_len); + FACET_KEY *k = FACETS_KEY_ADD_TO_INDEX(facets, hash, key, key_len, 0); + return (k && (k->options & FACET_KEY_OPTION_FACET)); +} + +// ---------------------------------------------------------------------------- + +size_t facets_histogram_slots(FACETS *facets) { + return facets->histogram.slots; +} + +static usec_t calculate_histogram_bar_width(usec_t after_ut, usec_t before_ut) { + // Array of valid durations in seconds + static time_t valid_durations_s[] = { + 1, 2, 5, 10, 15, 30, // seconds + 1 * 60, 2 * 60, 3 * 60, 5 * 60, 10 * 60, 15 * 60, 30 * 60, // minutes + 1 * 3600, 2 * 3600, 6 * 3600, 8 * 3600, 12 * 3600, // hours + 1 * 86400, 2 * 86400, 3 * 86400, 5 * 86400, 7 * 86400, 14 * 86400, // days + 1 * (30*86400) // months + }; + static int array_size = sizeof(valid_durations_s) / sizeof(valid_durations_s[0]); + + usec_t duration_ut = before_ut - after_ut; + usec_t bar_width_ut = 1 * USEC_PER_SEC; + + for (int i = array_size - 1; i >= 0; --i) { + if (duration_ut / (valid_durations_s[i] * USEC_PER_SEC) >= FACETS_HISTOGRAM_COLUMNS) { + bar_width_ut = valid_durations_s[i] * USEC_PER_SEC; + break; + } + } + + return bar_width_ut; +} + +static inline usec_t facets_histogram_slot_baseline_ut(FACETS *facets, usec_t ut) { + usec_t delta_ut = ut % facets->histogram.slot_width_ut; + return ut - delta_ut; +} + +void facets_set_timeframe_and_histogram_by_id(FACETS *facets, const char *key_id, usec_t after_ut, usec_t before_ut) { + if(after_ut > before_ut) { + usec_t t = after_ut; + after_ut = before_ut; + before_ut = t; + } + + facets->histogram.enabled = true; + + if(key_id && *key_id && strlen(key_id) == FACET_STRING_HASH_SIZE - 1) { + facets->histogram.chart = strdupz(key_id); + facets->histogram.hash = str_to_facets_hash(facets->histogram.chart); + } + else { + freez(facets->histogram.chart); + facets->histogram.chart = NULL; + facets->histogram.hash = FACETS_HASH_ZERO; + } + + facets->timeframe.after_ut = after_ut; + facets->timeframe.before_ut = before_ut; + + facets->histogram.slot_width_ut = calculate_histogram_bar_width(after_ut, before_ut); + facets->histogram.after_ut = facets_histogram_slot_baseline_ut(facets, after_ut); + facets->histogram.before_ut = facets_histogram_slot_baseline_ut(facets, before_ut) + facets->histogram.slot_width_ut; + facets->histogram.slots = (facets->histogram.before_ut 
- facets->histogram.after_ut) / facets->histogram.slot_width_ut + 1; + + internal_fatal(after_ut < facets->histogram.after_ut, "histogram after_ut is not less or equal to wanted after_ut"); + internal_fatal(before_ut > facets->histogram.before_ut, "histogram before_ut is not more or equal to wanted before_ut"); + + if(facets->histogram.slots > 1000) { + facets->histogram.slots = 1000 + 1; + facets->histogram.slot_width_ut = (facets->histogram.before_ut - facets->histogram.after_ut) / 1000; + } +} + +void facets_set_timeframe_and_histogram_by_name(FACETS *facets, const char *key_name, usec_t after_ut, usec_t before_ut) { + char hash_str[FACET_STRING_HASH_SIZE]; + FACETS_HASH hash = FACETS_HASH_FUNCTION(key_name, strlen(key_name)); + facets_hash_to_str(hash, hash_str); + facets_set_timeframe_and_histogram_by_id(facets, hash_str, after_ut, before_ut); +} + +static inline uint32_t facets_histogram_slot_at_time_ut(FACETS *facets, usec_t usec, FACET_VALUE *v) { + if(unlikely(!v->histogram)) + v->histogram = callocz(facets->histogram.slots, sizeof(*v->histogram)); + + usec_t base_ut = facets_histogram_slot_baseline_ut(facets, usec); + + if(unlikely(base_ut < facets->histogram.after_ut)) + base_ut = facets->histogram.after_ut; + + if(unlikely(base_ut > facets->histogram.before_ut)) + base_ut = facets->histogram.before_ut; + + uint32_t slot = (base_ut - facets->histogram.after_ut) / facets->histogram.slot_width_ut; + + if(unlikely(slot >= facets->histogram.slots)) + slot = facets->histogram.slots - 1; + + return slot; +} + +static inline void facets_histogram_update_value_slot(FACETS *facets, usec_t usec, FACET_VALUE *v) { + uint32_t slot = facets_histogram_slot_at_time_ut(facets, usec, v); + v->histogram[slot]++; +} + +static inline void facets_histogram_update_value(FACETS *facets, usec_t usec) { + if(!facets->histogram.enabled || + !facets->histogram.key || + !facets->histogram.key->values.enabled || + !facet_key_value_updated(facets->histogram.key) || + usec < facets->histogram.after_ut || + usec > facets->histogram.before_ut) + return; + + FACET_VALUE *v = facets->histogram.key->current_value.v; + facets_histogram_update_value_slot(facets, usec, v); +} + +static usec_t overlap_duration_ut(usec_t start1, usec_t end1, usec_t start2, usec_t end2) { + usec_t overlap_start = MAX(start1, start2); + usec_t overlap_end = MIN(end1, end2); + + if (overlap_start < overlap_end) + return overlap_end - overlap_start; + else + return 0; // No overlap +} + +void facets_update_estimations(FACETS *facets, usec_t from_ut, usec_t to_ut, size_t entries) { + if(unlikely(!facets->histogram.enabled)) + return; + + if(unlikely(!overlap_duration_ut(facets->histogram.after_ut, facets->histogram.before_ut, from_ut, to_ut))) + return; + + facets->operations.rows.evaluated += entries; + facets->operations.rows.matched += entries; + facets->operations.rows.estimated += entries; + + if (!facets->histogram.enabled || + !facets->histogram.key || + !facets->histogram.key->values.enabled) + return; + + if (from_ut < facets->histogram.after_ut) + from_ut = facets->histogram.after_ut; + + if (to_ut > facets->histogram.before_ut) + to_ut = facets->histogram.before_ut; + + if (!facets->histogram.key->estimated_value.v) + FACET_VALUE_ADD_ESTIMATED_VALUE_TO_INDEX(facets->histogram.key); + + FACET_VALUE *v = facets->histogram.key->estimated_value.v; + + size_t slots = 0; + size_t total_ut = to_ut - from_ut; + ssize_t remaining_entries = (ssize_t)entries; + size_t slot = facets_histogram_slot_at_time_ut(facets, from_ut, v); + for(; slot 
< facets->histogram.slots ;slot++) { + usec_t slot_start_ut = facets->histogram.after_ut + slot * facets->histogram.slot_width_ut; + usec_t slot_end_ut = slot_start_ut + facets->histogram.slot_width_ut; + + if(slot_start_ut > to_ut) + break; + + usec_t overlap_ut = overlap_duration_ut(from_ut, to_ut, slot_start_ut, slot_end_ut); + + size_t slot_entries = (overlap_ut * entries) / total_ut; + v->histogram[slot] += slot_entries; + remaining_entries -= (ssize_t)slot_entries; + slots++; + } + + // Check if all entries are assigned + // This should always be true if the distribution is correct + internal_fatal(remaining_entries < 0 || remaining_entries >= (ssize_t)(slots), + "distribution of estimations is not accurate - there are %zd remaining entries", + remaining_entries); +} + +void facets_row_finished_unsampled(FACETS *facets, usec_t usec) { + facets->operations.rows.evaluated++; + facets->operations.rows.matched++; + facets->operations.rows.unsampled++; + + if(!facets->histogram.enabled || + !facets->histogram.key || + !facets->histogram.key->values.enabled || + usec < facets->histogram.after_ut || + usec > facets->histogram.before_ut) + return; + + if(!facets->histogram.key->unsampled_value.v) + FACET_VALUE_ADD_UNSAMPLED_VALUE_TO_INDEX(facets->histogram.key); + + FACET_VALUE *v = facets->histogram.key->unsampled_value.v; + facets_histogram_update_value_slot(facets, usec, v); + + facets_reset_key(facets->histogram.key); +} + +static const char *facets_key_name_cached(FACET_KEY *k, DICTIONARY *used_hashes_registry) { + if(k->name) { + if(used_hashes_registry && !k->default_selected_for_values) { + char hash_str[FACET_STRING_HASH_SIZE]; + facets_hash_to_str(k->hash, hash_str); + dictionary_set(used_hashes_registry, hash_str, (void *)k->name, strlen(k->name) + 1); + } + + return k->name; + } + + // key has no name + const char *name = "[UNAVAILABLE_FIELD]"; + + if(used_hashes_registry) { + char hash_str[FACET_STRING_HASH_SIZE]; + facets_hash_to_str(k->hash, hash_str); + const char *s = dictionary_get(used_hashes_registry, hash_str); + if(s) name = s; + } + + return name; +} + +static const char *facets_key_value_cached(FACET_KEY *k, FACET_VALUE *v, DICTIONARY *used_hashes_registry) { + if(v->empty || v->estimated || v->unsampled) + return v->name; + + if(v->name && v->name_len) { + if(used_hashes_registry && !k->default_selected_for_values && v->selected) { + char hash_str[FACET_STRING_HASH_SIZE]; + facets_hash_to_str(v->hash, hash_str); + dictionary_set(used_hashes_registry, hash_str, (void *)v->name, v->name_len + 1); + } + + return v->name; + } + + // value has no name + const char *name = "[unavailable field]"; + + if(used_hashes_registry) { + char hash_str[FACET_STRING_HASH_SIZE]; + facets_hash_to_str(v->hash, hash_str); + const char *s = dictionary_get(used_hashes_registry, hash_str); + if(s) name = s; + } + + return name; +} + +static inline void facets_key_value_transformed(FACETS *facets, FACET_KEY *k, FACET_VALUE *v, BUFFER *dst, FACETS_TRANSFORMATION_SCOPE scope) { + buffer_flush(dst); + + if(v->empty || v->unsampled || v->estimated) + buffer_strcat(dst, v->name); + else if(k->transform.cb && k->transform.view_only) { + buffer_contents_replace(dst, v->name, v->name_len); + k->transform.cb(facets, dst, scope, k->transform.data); + } + else + buffer_strcat(dst, facets_key_value_cached(k, v, facets->report.used_hashes_registry)); +} + +static inline void facets_histogram_value_names(BUFFER *wb, FACETS *facets __maybe_unused, FACET_KEY *k, const char *key, const char *first_key) { +
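// helper for the report below: emits a JSON array named `key`, with one entry per histogram-enabled value of `k` (transformed for display), optionally preceded by `first_key` - used to add the leading "time" label +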
CLEAN_BUFFER *tb = buffer_create(0, NULL); + + buffer_json_member_add_array(wb, key); + { + if(first_key) + buffer_json_add_array_item_string(wb, first_key); + + if(k && k->values.enabled) { + FACET_VALUE *v; + foreach_value_in_key(k, v) { + if (unlikely(!v->histogram)) + continue; + + facets_key_value_transformed(facets, k, v, tb, FACETS_TRANSFORM_HISTOGRAM); + buffer_json_add_array_item_string(wb, buffer_tostring(tb)); + } + foreach_value_in_key_done(v); + } + } + buffer_json_array_close(wb); // key +} + +static inline void facets_histogram_value_colors(BUFFER *wb, FACETS *facets __maybe_unused, FACET_KEY *k, const char *key) { + buffer_json_member_add_array(wb, key); + { + if(k && k->values.enabled) { + FACET_VALUE *v; + foreach_value_in_key(k, v) { + if (unlikely(!v->histogram)) + continue; + + buffer_json_add_array_item_string(wb, v->color); + } + foreach_value_in_key_done(v); + } + } + buffer_json_array_close(wb); // key +} + +static inline void facets_histogram_value_units(BUFFER *wb, FACETS *facets __maybe_unused, FACET_KEY *k, const char *key) { + buffer_json_member_add_array(wb, key); + { + if(k && k->values.enabled) { + FACET_VALUE *v; + foreach_value_in_key(k, v) { + if (unlikely(!v->histogram)) + continue; + + buffer_json_add_array_item_string(wb, "events"); + } + foreach_value_in_key_done(v); + } + } + buffer_json_array_close(wb); // key +} + +static inline void facets_histogram_value_min(BUFFER *wb, FACETS *facets __maybe_unused, FACET_KEY *k, const char *key) { + buffer_json_member_add_array(wb, key); + { + if(k && k->values.enabled) { + FACET_VALUE *v; + foreach_value_in_key(k, v) { + if (unlikely(!v->histogram)) + continue; + + buffer_json_add_array_item_uint64(wb, v->min); + } + foreach_value_in_key_done(v); + } + } + buffer_json_array_close(wb); // key +} + +static inline void facets_histogram_value_max(BUFFER *wb, FACETS *facets __maybe_unused, FACET_KEY *k, const char *key) { + buffer_json_member_add_array(wb, key); + { + if(k && k->values.enabled) { + FACET_VALUE *v; + foreach_value_in_key(k, v) { + if (unlikely(!v->histogram)) + continue; + + buffer_json_add_array_item_uint64(wb, v->max); + } + foreach_value_in_key_done(v); + } + } + buffer_json_array_close(wb); // key +} + +static inline void facets_histogram_value_avg(BUFFER *wb, FACETS *facets __maybe_unused, FACET_KEY *k, const char *key) { + buffer_json_member_add_array(wb, key); + { + if(k && k->values.enabled) { + FACET_VALUE *v; + foreach_value_in_key(k, v) { + if (unlikely(!v->histogram)) + continue; + + buffer_json_add_array_item_double(wb, (double) v->sum / (double) facets->histogram.slots); + } + foreach_value_in_key_done(v); + } + } + buffer_json_array_close(wb); // key +} + +static inline void facets_histogram_value_arp(BUFFER *wb, FACETS *facets __maybe_unused, FACET_KEY *k, const char *key) { + buffer_json_member_add_array(wb, key); + { + if(k && k->values.enabled) { + FACET_VALUE *v; + foreach_value_in_key(k, v) { + if (unlikely(!v->histogram)) + continue; + + buffer_json_add_array_item_uint64(wb, 0); + } + foreach_value_in_key_done(v); + } + } + buffer_json_array_close(wb); // key +} + +static inline void facets_histogram_value_con(BUFFER *wb, FACETS *facets __maybe_unused, FACET_KEY *k, const char *key, uint32_t sum) { + buffer_json_member_add_array(wb, key); + { + if(k && k->values.enabled) { + FACET_VALUE *v; + foreach_value_in_key(k, v) { + if (unlikely(!v->histogram)) + continue; + + buffer_json_add_array_item_double(wb, (double) v->sum * 100.0 / (double) sum); + } + 
foreach_value_in_key_done(v); + } + } + buffer_json_array_close(wb); // key +} + +static void facets_histogram_generate(FACETS *facets, FACET_KEY *k, BUFFER *wb) { + CLEAN_BUFFER *tmp = buffer_create(0, NULL); + + size_t dimensions = 0; + uint32_t min = UINT32_MAX, max = 0, sum = 0, count = 0; + + if(k && k->values.enabled) { + FACET_VALUE *v; + foreach_value_in_key(k, v) { + if (unlikely(!v->histogram)) + continue; + + dimensions++; + + v->min = UINT32_MAX; + v->max = 0; + v->sum = 0; + + for(uint32_t i = 0; i < facets->histogram.slots ;i++) { + uint32_t n = v->histogram[i]; + + if(n < min) + min = n; + + if(n > max) + max = n; + + sum += n; + count++; + + if(n < v->min) + v->min = n; + + if(n > v->max) + v->max = n; + + v->sum += n; + } + } + foreach_value_in_key_done(v); + } + + buffer_json_member_add_object(wb, "summary"); + { + // summary.nodes + buffer_json_member_add_array(wb, "nodes"); + { + buffer_json_add_array_item_object(wb); // node + { + buffer_json_member_add_string(wb, "mg", "default"); + buffer_json_member_add_string(wb, "nm", "facets.histogram"); + buffer_json_member_add_uint64(wb, "ni", 0); + buffer_json_member_add_object(wb, "st"); + { + buffer_json_member_add_uint64(wb, "ai", 0); + buffer_json_member_add_uint64(wb, "code", 200); + buffer_json_member_add_string(wb, "msg", ""); + } + buffer_json_object_close(wb); // st + + if(dimensions) { + buffer_json_member_add_object(wb, "is"); + { + buffer_json_member_add_uint64(wb, "sl", 1); + buffer_json_member_add_uint64(wb, "qr", 1); + } + buffer_json_object_close(wb); // is + + buffer_json_member_add_object(wb, "ds"); + { + buffer_json_member_add_uint64(wb, "sl", dimensions); + buffer_json_member_add_uint64(wb, "qr", dimensions); + } + buffer_json_object_close(wb); // ds + } + + if(count) { + buffer_json_member_add_object(wb, "sts"); + { + buffer_json_member_add_uint64(wb, "min", min); + buffer_json_member_add_uint64(wb, "max", max); + buffer_json_member_add_double(wb, "avg", (double) sum / (double) count); + buffer_json_member_add_double(wb, "con", 100.0); + } + buffer_json_object_close(wb); // sts + } + } + buffer_json_object_close(wb); // node + } + buffer_json_array_close(wb); // nodes + + // summary.contexts + buffer_json_member_add_array(wb, "contexts"); + { + buffer_json_add_array_item_object(wb); // context + { + buffer_json_member_add_string(wb, "id", "facets.histogram"); + + if(dimensions) { + buffer_json_member_add_object(wb, "is"); + { + buffer_json_member_add_uint64(wb, "sl", 1); + buffer_json_member_add_uint64(wb, "qr", 1); + } + buffer_json_object_close(wb); // is + + buffer_json_member_add_object(wb, "ds"); + { + buffer_json_member_add_uint64(wb, "sl", dimensions); + buffer_json_member_add_uint64(wb, "qr", dimensions); + } + buffer_json_object_close(wb); // ds + } + + if(count) { + buffer_json_member_add_object(wb, "sts"); + { + buffer_json_member_add_uint64(wb, "min", min); + buffer_json_member_add_uint64(wb, "max", max); + buffer_json_member_add_double(wb, "avg", (double) sum / (double) count); + buffer_json_member_add_double(wb, "con", 100.0); + } + buffer_json_object_close(wb); // sts + } + } + buffer_json_object_close(wb); // context + } + buffer_json_array_close(wb); // contexts + + // summary.instances + buffer_json_member_add_array(wb, "instances"); + { + buffer_json_add_array_item_object(wb); // instance + { + buffer_json_member_add_string(wb, "id", "facets.histogram"); + buffer_json_member_add_uint64(wb, "ni", 0); + + if(dimensions) { + buffer_json_member_add_object(wb, "ds"); + { + 
buffer_json_member_add_uint64(wb, "sl", dimensions); + buffer_json_member_add_uint64(wb, "qr", dimensions); + } + buffer_json_object_close(wb); // ds + } + + if(count) { + buffer_json_member_add_object(wb, "sts"); + { + buffer_json_member_add_uint64(wb, "min", min); + buffer_json_member_add_uint64(wb, "max", max); + buffer_json_member_add_double(wb, "avg", (double) sum / (double) count); + buffer_json_member_add_double(wb, "con", 100.0); + } + buffer_json_object_close(wb); // sts + } + } + buffer_json_object_close(wb); // instance + } + buffer_json_array_close(wb); // instances + + // summary.dimensions + buffer_json_member_add_array(wb, "dimensions"); + if(dimensions && k && k->values.enabled) { + size_t pri = 0; + FACET_VALUE *v; + + foreach_value_in_key(k, v) { + if(unlikely(!v->histogram)) + continue; + + buffer_json_add_array_item_object(wb); // dimension + { + facets_key_value_transformed(facets, k, v, tmp, FACETS_TRANSFORM_HISTOGRAM); + buffer_json_member_add_string(wb, "id", buffer_tostring(tmp)); + buffer_json_member_add_object(wb, "ds"); + { + buffer_json_member_add_uint64(wb, "sl", 1); + buffer_json_member_add_uint64(wb, "qr", 1); + } + buffer_json_object_close(wb); // ds + buffer_json_member_add_object(wb, "sts"); + { + buffer_json_member_add_uint64(wb, "min", v->min); + buffer_json_member_add_uint64(wb, "max", v->max); + buffer_json_member_add_double(wb, "avg", (double)v->sum / (double)facets->histogram.slots); + buffer_json_member_add_double(wb, "con", (double)v->sum * 100.0 / (double)sum); + } + buffer_json_object_close(wb); // sts + buffer_json_member_add_uint64(wb, "pri", pri++); + } + buffer_json_object_close(wb); // dimension + } + foreach_value_in_key_done(v); + } + buffer_json_array_close(wb); // dimensions + + buffer_json_member_add_array(wb, "labels"); + buffer_json_array_close(wb); // labels + + buffer_json_member_add_array(wb, "alerts"); + buffer_json_array_close(wb); // alerts + } + buffer_json_object_close(wb); // summary + + buffer_json_member_add_object(wb, "totals"); + { + buffer_json_member_add_object(wb, "nodes"); + { + buffer_json_member_add_uint64(wb, "sl", 1); + buffer_json_member_add_uint64(wb, "qr", 1); + } + buffer_json_object_close(wb); // nodes + + if(dimensions) { + buffer_json_member_add_object(wb, "contexts"); + { + buffer_json_member_add_uint64(wb, "sl", 1); + buffer_json_member_add_uint64(wb, "qr", 1); + } + buffer_json_object_close(wb); // contexts + buffer_json_member_add_object(wb, "instances"); + { + buffer_json_member_add_uint64(wb, "sl", 1); + buffer_json_member_add_uint64(wb, "qr", 1); + } + buffer_json_object_close(wb); // instances + + buffer_json_member_add_object(wb, "dimensions"); + { + buffer_json_member_add_uint64(wb, "sl", dimensions); + buffer_json_member_add_uint64(wb, "qr", dimensions); + } + buffer_json_object_close(wb); // dimension + } + } + buffer_json_object_close(wb); // totals + + buffer_json_member_add_object(wb, "result"); + { + facets_histogram_value_names(wb, facets, k, "labels", "time"); + + buffer_json_member_add_object(wb, "point"); + { + buffer_json_member_add_uint64(wb, "value", 0); + buffer_json_member_add_uint64(wb, "arp", 1); + buffer_json_member_add_uint64(wb, "pa", 2); + } + buffer_json_object_close(wb); // point + + buffer_json_member_add_array(wb, "data"); + if(k && k->values.enabled) { + usec_t t = facets->histogram.after_ut; + for(uint32_t i = 0; i < facets->histogram.slots ;i++) { + buffer_json_add_array_item_array(wb); // row + { + buffer_json_add_array_item_time_ms(wb, t / USEC_PER_SEC); + + 
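// each data row is [timestamp, one [value, arp, pa] array per dimension], matching the "point" definition above +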
FACET_VALUE *v; + foreach_value_in_key(k, v) { + if (unlikely(!v->histogram)) + continue; + + buffer_json_add_array_item_array(wb); // point + + buffer_json_add_array_item_uint64(wb, v->histogram[i]); + buffer_json_add_array_item_uint64(wb, 0); // arp - anomaly rate + buffer_json_add_array_item_uint64(wb, 0); // pa - point annotation + + buffer_json_array_close(wb); // point + } + foreach_value_in_key_done(v); + } + buffer_json_array_close(wb); // row + + t += facets->histogram.slot_width_ut; + } + } + buffer_json_array_close(wb); //data + } + buffer_json_object_close(wb); // result + + buffer_json_member_add_object(wb, "db"); + { + buffer_json_member_add_uint64(wb, "tiers", 1); + buffer_json_member_add_uint64(wb, "update_every", facets->histogram.slot_width_ut / USEC_PER_SEC); +// we should add these only when we know the retention of the db +// buffer_json_member_add_time_t(wb, "first_entry", facets->histogram.after_ut / USEC_PER_SEC); +// buffer_json_member_add_time_t(wb, "last_entry", facets->histogram.before_ut / USEC_PER_SEC); + buffer_json_member_add_string(wb, "units", "events"); + buffer_json_member_add_object(wb, "dimensions"); + { + facets_histogram_value_names(wb, facets, k, "ids", NULL); + facets_histogram_value_units(wb, facets, k, "units"); + + buffer_json_member_add_object(wb, "sts"); + { + facets_histogram_value_min(wb, facets, k, "min"); + facets_histogram_value_max(wb, facets, k, "max"); + facets_histogram_value_avg(wb, facets, k, "avg"); + facets_histogram_value_arp(wb, facets, k, "arp"); + facets_histogram_value_con(wb, facets, k, "con", sum); + } + buffer_json_object_close(wb); // sts + } + buffer_json_object_close(wb); // dimensions + + buffer_json_member_add_array(wb, "per_tier"); + { + buffer_json_add_array_item_object(wb); // tier0 + { + buffer_json_member_add_uint64(wb, "tier", 0); + buffer_json_member_add_uint64(wb, "queries", 1); + buffer_json_member_add_uint64(wb, "points", count); + buffer_json_member_add_time_t(wb, "update_every", facets->histogram.slot_width_ut / USEC_PER_SEC); +// we should add these only when we know the retention of the db +// buffer_json_member_add_time_t(wb, "first_entry", facets->histogram.after_ut / USEC_PER_SEC); +// buffer_json_member_add_time_t(wb, "last_entry", facets->histogram.before_ut / USEC_PER_SEC); + } + buffer_json_object_close(wb); // tier0 + } + buffer_json_array_close(wb); // per_tier + } + buffer_json_object_close(wb); // db + + buffer_json_member_add_object(wb, "view"); + { + char title[1024 + 1] = "Events Distribution"; + FACET_KEY *kt = FACETS_KEY_GET_FROM_INDEX(facets, facets->histogram.hash); + if(kt && kt->name) + snprintfz(title, sizeof(title) - 1, "Events Distribution by %s", kt->name); + + buffer_json_member_add_string(wb, "title", title); + buffer_json_member_add_time_t(wb, "update_every", facets->histogram.slot_width_ut / USEC_PER_SEC); + buffer_json_member_add_time_t(wb, "after", facets->histogram.after_ut / USEC_PER_SEC); + buffer_json_member_add_time_t(wb, "before", facets->histogram.before_ut / USEC_PER_SEC); + buffer_json_member_add_string(wb, "units", "events"); + buffer_json_member_add_string(wb, "chart_type", "stackedBar"); + buffer_json_member_add_object(wb, "dimensions"); + { + buffer_json_member_add_array(wb, "grouped_by"); + { + buffer_json_add_array_item_string(wb, "dimension"); + } + buffer_json_array_close(wb); // grouped_by + + facets_histogram_value_names(wb, facets, k, "ids", NULL); + facets_histogram_value_names(wb, facets, k, "names", NULL); + facets_histogram_value_colors(wb, facets, k, 
"colors"); + facets_histogram_value_units(wb, facets, k, "units"); + + buffer_json_member_add_object(wb, "sts"); + { + facets_histogram_value_min(wb, facets, k, "min"); + facets_histogram_value_max(wb, facets, k, "max"); + facets_histogram_value_avg(wb, facets, k, "avg"); + facets_histogram_value_arp(wb, facets, k, "arp"); + facets_histogram_value_con(wb, facets, k, "con", sum); + } + buffer_json_object_close(wb); // sts + } + buffer_json_object_close(wb); // dimensions + + buffer_json_member_add_uint64(wb, "min", min); + buffer_json_member_add_uint64(wb, "max", max); + } + buffer_json_object_close(wb); // view + + buffer_json_member_add_array(wb, "agents"); + { + buffer_json_add_array_item_object(wb); // agent + { + buffer_json_member_add_string(wb, "mg", "default"); + buffer_json_member_add_string(wb, "nm", "facets.histogram"); + buffer_json_member_add_time_t(wb, "now", now_realtime_sec()); + buffer_json_member_add_uint64(wb, "ai", 0); + } + buffer_json_object_close(wb); // agent + } + buffer_json_array_close(wb); // agents +} + +// ---------------------------------------------------------------------------- + +static inline void facet_value_is_used(FACET_KEY *k, FACET_VALUE *v) { + if(!k->key_found_in_row) + v->rows_matching_facet_value++; + + k->key_found_in_row++; + + if(v->selected) + k->key_values_selected_in_row++; +} + +static inline bool facets_key_is_facet(FACETS *facets, FACET_KEY *k) { + bool included = true, excluded = false, never = false; + + if(k->options & (FACET_KEY_OPTION_FACET | FACET_KEY_OPTION_NO_FACET | FACET_KEY_OPTION_NEVER_FACET)) { + if(k->options & FACET_KEY_OPTION_FACET) { + included = true; + excluded = false; + never = false; + } + else if(k->options & (FACET_KEY_OPTION_NO_FACET | FACET_KEY_OPTION_NEVER_FACET)) { + included = false; + excluded = true; + never = true; + } + } + else { + if (facets->included_keys) { + if (!simple_pattern_matches(facets->included_keys, k->name)) + included = false; + } + + if (facets->excluded_keys) { + if (simple_pattern_matches(facets->excluded_keys, k->name)) { + excluded = true; + never = true; + } + } + } + + if(included && !excluded) { + k->options |= FACET_KEY_OPTION_FACET; + k->options &= ~FACET_KEY_OPTION_NO_FACET; + return true; + } + + k->options |= FACET_KEY_OPTION_NO_FACET; + k->options &= ~FACET_KEY_OPTION_FACET; + + if(never) + k->options |= FACET_KEY_OPTION_NEVER_FACET; + + return false; +} + +// ---------------------------------------------------------------------------- + +FACETS *facets_create(uint32_t items_to_return, FACETS_OPTIONS options, const char *visible_keys, const char *facet_keys, const char *non_facet_keys) { + FACETS *facets = callocz(1, sizeof(FACETS)); + facets->options = options; + FACETS_KEYS_INDEX_CREATE(facets); + + if(facet_keys && *facet_keys) + facets->included_keys = simple_pattern_create(facet_keys, "|", SIMPLE_PATTERN_EXACT, true); + + if(non_facet_keys && *non_facet_keys) + facets->excluded_keys = simple_pattern_create(non_facet_keys, "|", SIMPLE_PATTERN_EXACT, true); + + if(visible_keys && *visible_keys) + facets->visible_keys = simple_pattern_create(visible_keys, "|", SIMPLE_PATTERN_EXACT, true); + + facets->max_items_to_return = items_to_return > 1 ? 
items_to_return : 2; + facets->anchor.start_ut = 0; + facets->anchor.stop_ut = 0; + facets->anchor.direction = FACETS_ANCHOR_DIRECTION_BACKWARD; + facets->order = 1; + + return facets; +} + +void facets_destroy(FACETS *facets) { + dictionary_destroy(facets->accepted_params); + FACETS_KEYS_INDEX_DESTROY(facets); + simple_pattern_free(facets->visible_keys); + simple_pattern_free(facets->included_keys); + simple_pattern_free(facets->excluded_keys); + + while(facets->base) { + FACET_ROW *r = facets->base; + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(facets->base, r, prev, next); + + facets_row_free(facets, r); + } + + freez(facets->histogram.chart); + freez(facets); +} + +void facets_accepted_param(FACETS *facets, const char *param) { + if(!facets->accepted_params) + facets->accepted_params = dictionary_create(DICT_OPTION_SINGLE_THREADED|DICT_OPTION_DONT_OVERWRITE_VALUE); + + dictionary_set(facets->accepted_params, param, NULL, 0); +} + +static inline FACET_KEY *facets_register_key_name_length(FACETS *facets, const char *key, size_t key_length, FACET_KEY_OPTIONS options) { + return FACETS_KEY_ADD_TO_INDEX(facets, FACETS_HASH_FUNCTION(key, key_length), key, key_length, options); +} + +inline FACET_KEY *facets_register_key_name(FACETS *facets, const char *key, FACET_KEY_OPTIONS options) { + return facets_register_key_name_length(facets, key, strlen(key), options); +} + +inline FACET_KEY *facets_register_key_name_transformation(FACETS *facets, const char *key, FACET_KEY_OPTIONS options, facets_key_transformer_t cb, void *data) { + FACET_KEY *k = facets_register_key_name(facets, key, options); + k->transform.cb = cb; + k->transform.data = data; + k->transform.view_only = (options & FACET_KEY_OPTION_TRANSFORM_VIEW) ? true : false; + return k; +} + +inline FACET_KEY *facets_register_dynamic_key_name(FACETS *facets, const char *key, FACET_KEY_OPTIONS options, facet_dynamic_row_t cb, void *data) { + FACET_KEY *k = facets_register_key_name(facets, key, options); + k->dynamic.cb = cb; + k->dynamic.data = data; + return k; +} + +void facets_set_query(FACETS *facets, const char *query) { + if(!query) + return; + + facets->query = simple_pattern_create(query, "|", SIMPLE_PATTERN_SUBSTRING, false); +} + +void facets_set_items(FACETS *facets, uint32_t items) { + facets->max_items_to_return = items > 1 ? 
items : 2; +} + +void facets_set_anchor(FACETS *facets, usec_t start_ut, usec_t stop_ut, FACETS_ANCHOR_DIRECTION direction) { + facets->anchor.start_ut = start_ut; + facets->anchor.stop_ut = stop_ut; + facets->anchor.direction = direction; + + if((facets->anchor.direction == FACETS_ANCHOR_DIRECTION_BACKWARD && facets->anchor.start_ut && facets->anchor.start_ut < facets->anchor.stop_ut) || + (facets->anchor.direction == FACETS_ANCHOR_DIRECTION_FORWARD && facets->anchor.stop_ut && facets->anchor.stop_ut < facets->anchor.start_ut)) { + internal_error(true, "start and stop anchors are flipped"); + facets->anchor.start_ut = stop_ut; + facets->anchor.stop_ut = start_ut; + } +} + +void facets_enable_slice_mode(FACETS *facets) { + facets->options |= FACETS_OPTION_DONT_SEND_EMPTY_VALUE_FACETS | FACETS_OPTION_SORT_FACETS_ALPHABETICALLY; +} + +inline FACET_KEY *facets_register_facet_id(FACETS *facets, const char *key_id, FACET_KEY_OPTIONS options) { + if(!is_valid_string_hash(key_id)) + return NULL; + + FACETS_HASH hash = str_to_facets_hash(key_id); + + internal_error(strcmp(hash_to_static_string(hash), key_id) != 0, + "Regenerating the user supplied key, does not produce the same hash string"); + + FACET_KEY *k = FACETS_KEY_ADD_TO_INDEX(facets, hash, NULL, 0, options); + k->options |= FACET_KEY_OPTION_FACET; + k->options &= ~FACET_KEY_OPTION_NO_FACET; + facet_key_late_init(facets, k); + + return k; +} + +void facets_register_facet_id_filter(FACETS *facets, const char *key_id, char *value_id, FACET_KEY_OPTIONS options) { + FACET_KEY *k = facets_register_facet_id(facets, key_id, options); + if(k) { + if(is_valid_string_hash(value_id)) { + k->default_selected_for_values = false; + FACET_VALUE_ADD_OR_UPDATE_SELECTED(k, str_to_facets_hash(value_id)); + } + } +} + +void facets_set_current_row_severity(FACETS *facets, FACET_ROW_SEVERITY severity) { + facets->current_row.severity = severity; +} + +void facets_register_row_severity(FACETS *facets, facet_row_severity_t cb, void *data) { + facets->severity.cb = cb; + facets->severity.data = data; +} + +void facets_set_additional_options(FACETS *facets, FACETS_OPTIONS options) { + facets->options |= options; +} + +// ---------------------------------------------------------------------------- + +static inline void facets_key_set_unsampled_value(FACETS *facets, FACET_KEY *k) { + if(likely(!facet_key_value_updated(k) && facets->keys_in_row.used < FACETS_KEYS_IN_ROW_MAX)) + facets->keys_in_row.array[facets->keys_in_row.used++] = k; + + k->current_value.flags |= FACET_KEY_VALUE_UPDATED | FACET_KEY_VALUE_UNSAMPLED; + + facets->operations.values.registered++; + facets->operations.values.unsampled++; + + // no need to copy the UNSET value + // empty values are exported as empty + k->current_value.raw = NULL; + k->current_value.raw_len = 0; + k->current_value.b->len = 0; + k->current_value.flags &= ~FACET_KEY_VALUE_COPIED; + + if(unlikely(k->values.enabled)) + FACET_VALUE_ADD_UNSAMPLED_VALUE_TO_INDEX(k); + else { + k->key_found_in_row++; + k->key_values_selected_in_row++; + } +} + +static inline void facets_key_set_empty_value(FACETS *facets, FACET_KEY *k) { + if(likely(!facet_key_value_updated(k) && facets->keys_in_row.used < FACETS_KEYS_IN_ROW_MAX)) + facets->keys_in_row.array[facets->keys_in_row.used++] = k; + + k->current_value.flags |= FACET_KEY_VALUE_UPDATED | FACET_KEY_VALUE_EMPTY; + + facets->operations.values.registered++; + facets->operations.values.empty++; + + // no need to copy the UNSET value + // empty values are exported as empty + k->current_value.raw 
= NULL; + k->current_value.raw_len = 0; + k->current_value.b->len = 0; + k->current_value.flags &= ~FACET_KEY_VALUE_COPIED; + + if(unlikely(k->values.enabled)) + FACET_VALUE_ADD_EMPTY_VALUE_TO_INDEX(k); + else { + k->key_found_in_row++; + k->key_values_selected_in_row++; + } +} + +static inline void facets_key_check_value(FACETS *facets, FACET_KEY *k) { + if(likely(!facet_key_value_updated(k) && facets->keys_in_row.used < FACETS_KEYS_IN_ROW_MAX)) + facets->keys_in_row.array[facets->keys_in_row.used++] = k; + + k->current_value.flags |= FACET_KEY_VALUE_UPDATED; + k->current_value.flags &= ~(FACET_KEY_VALUE_EMPTY|FACET_KEY_VALUE_UNSAMPLED|FACET_KEY_VALUE_ESTIMATED); + + facets->operations.values.registered++; + + if(k->transform.cb && !k->transform.view_only) { + facets->operations.values.transformed++; + facets_key_value_copy_to_buffer(k); + k->transform.cb(facets, k->current_value.b, FACETS_TRANSFORM_VALUE, k->transform.data); + } + +// bool found = false; +// if(strstr(buffer_tostring(k->current_value), "fprintd") != NULL) +// found = true; + + if(facets->query && !facet_key_value_empty_or_unsampled_or_estimated(k) && ((k->options & FACET_KEY_OPTION_FTS) || facets->options & FACETS_OPTION_ALL_KEYS_FTS)) { + facets->operations.fts.searches++; + facets_key_value_copy_to_buffer(k); + switch(simple_pattern_matches_extract(facets->query, buffer_tostring(k->current_value.b), NULL, 0)) { + case SP_MATCHED_POSITIVE: + facets->current_row.keys_matched_by_query_positive++; + break; + + case SP_MATCHED_NEGATIVE: + facets->current_row.keys_matched_by_query_negative++; + break; + + case SP_NOT_MATCHED: + break; + } + } + + if(k->values.enabled) + FACET_VALUE_ADD_CURRENT_VALUE_TO_INDEX(k); + else { + k->key_found_in_row++; + k->key_values_selected_in_row++; + } +} + +void facets_add_key_value(FACETS *facets, const char *key, const char *value) { + FACET_KEY *k = facets_register_key_name(facets, key, 0); + k->current_value.raw = value; + k->current_value.raw_len = strlen(value); + + facets_key_check_value(facets, k); +} + +void facets_add_key_value_length(FACETS *facets, const char *key, size_t key_len, const char *value, size_t value_len) { + FACET_KEY *k = facets_register_key_name_length(facets, key, key_len, 0); + k->current_value.raw = value; + k->current_value.raw_len = value_len; + + facets_key_check_value(facets, k); +} + +// ---------------------------------------------------------------------------- +// FACET_ROW dictionary hooks + +static void facet_row_key_value_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *data) { + FACET_ROW_KEY_VALUE *rkv = value; + FACET_ROW *row = data; (void)row; + + rkv->wb = buffer_create(0, NULL); + if(!rkv->empty) + buffer_contents_replace(rkv->wb, rkv->tmp, rkv->tmp_len); +} + +static bool facet_row_key_value_conflict_callback(const DICTIONARY_ITEM *item __maybe_unused, void *old_value, void *new_value, void *data) { + FACET_ROW_KEY_VALUE *rkv = old_value; + FACET_ROW_KEY_VALUE *n_rkv = new_value; + FACET_ROW *row = data; (void)row; + + rkv->empty = n_rkv->empty; + + if(!rkv->empty) + buffer_contents_replace(rkv->wb, n_rkv->tmp, n_rkv->tmp_len); + else + buffer_flush(rkv->wb); + + return false; +} + +static void facet_row_key_value_delete_callback(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *data) { + FACET_ROW_KEY_VALUE *rkv = value; + FACET_ROW *row = data; (void)row; + + buffer_free(rkv->wb); +} + +// ---------------------------------------------------------------------------- +// FACET_ROW management + +static 
void facets_row_free(FACETS *facets __maybe_unused, FACET_ROW *row) { + dictionary_destroy(row->dict); + freez(row); +} + +static FACET_ROW *facets_row_create(FACETS *facets, usec_t usec, FACET_ROW *into) { + FACET_ROW *row; + + if(into) { + row = into; + facets->operations.rows.reused++; + } + else { + row = callocz(1, sizeof(FACET_ROW)); + row->dict = dictionary_create_advanced(DICT_OPTION_SINGLE_THREADED|DICT_OPTION_DONT_OVERWRITE_VALUE|DICT_OPTION_FIXED_SIZE, NULL, sizeof(FACET_ROW_KEY_VALUE)); + dictionary_register_insert_callback(row->dict, facet_row_key_value_insert_callback, row); + dictionary_register_conflict_callback(row->dict, facet_row_key_value_conflict_callback, row); + dictionary_register_delete_callback(row->dict, facet_row_key_value_delete_callback, row); + facets->operations.rows.created++; + } + + row->severity = facets->current_row.severity; + row->usec = usec; + + FACET_KEY *k; + foreach_key_in_facets(facets, k) { + FACET_ROW_KEY_VALUE t = { + .tmp = NULL, + .tmp_len = 0, + .wb = NULL, + .empty = true, + }; + + if(facet_key_value_updated(k) && !facet_key_value_empty_or_unsampled_or_estimated(k)) { + t.tmp = facets_key_get_value(k); + t.tmp_len = facets_key_get_value_length(k); + t.empty = false; + } + + dictionary_set(row->dict, k->name, &t, sizeof(t)); + } + foreach_key_in_facets_done(k); + + return row; +} + +// ---------------------------------------------------------------------------- + +static inline FACET_ROW *facets_row_keep_seek_to_position(FACETS *facets, usec_t usec) { + if(usec < facets->base->prev->usec) + return facets->base->prev; + + if(usec > facets->base->usec) + return facets->base; + + FACET_ROW *last = facets->operations.last_added; + while(last->prev != facets->base->prev && usec > last->prev->usec) { + last = last->prev; + facets->operations.backwards++; + } + + while(last->next && usec < last->next->usec) { + last = last->next; + facets->operations.forwards++; + } + + return last; +} + +static void facets_row_keep_first_entry(FACETS *facets, usec_t usec) { + facets->operations.last_added = facets_row_create(facets, usec, NULL); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(facets->base, facets->operations.last_added, prev, next); + facets->items_to_return++; + facets->operations.first++; +} + +static inline bool facets_is_entry_within_anchor(FACETS *facets, usec_t usec) { + if(facets->anchor.start_ut || facets->anchor.stop_ut) { + // we have an anchor key + // we don't want to keep rows on the other side of the direction + + switch (facets->anchor.direction) { + default: + case FACETS_ANCHOR_DIRECTION_BACKWARD: + // we need to keep only the smaller timestamps + if (facets->anchor.start_ut && usec >= facets->anchor.start_ut) { + facets->operations.skips_before++; + return false; + } + if (facets->anchor.stop_ut && usec <= facets->anchor.stop_ut) { + facets->operations.skips_after++; + return false; + } + break; + + case FACETS_ANCHOR_DIRECTION_FORWARD: + // we need to keep only the bigger timestamps + if (facets->anchor.start_ut && usec <= facets->anchor.start_ut) { + facets->operations.skips_after++; + return false; + } + if (facets->anchor.stop_ut && usec >= facets->anchor.stop_ut) { + facets->operations.skips_before++; + return false; + } + break; + } + } + + return true; +} + +bool facets_row_candidate_to_keep(FACETS *facets, usec_t usec) { + return !facets->base || + (usec >= facets->base->prev->usec && usec <= facets->base->usec && facets_is_entry_within_anchor(facets, usec)) || + facets->items_to_return < facets->max_items_to_return; +} + 
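For orientation, here is a minimal sketch of how a caller could drive the ingestion API added by this commit, using only functions defined in this file. The `example_query()` wrapper, the key names and the values are invented for illustration (netdata's `systemd-journal.plugin` is the real consumer of this API); the buffer JSON calls follow libnetdata's buffer API as used elsewhere in this diff.

```c
#include "libnetdata/libnetdata.h" // FACETS, BUFFER, usec_t, etc.

// Editor's sketch - not part of this commit.
void example_query(usec_t after_ut, usec_t before_ut) {
    FACETS *facets = facets_create(
        50,                           // max rows to keep/return
        FACETS_OPTION_ALL_KEYS_FTS,   // full-text-search all keys
        NULL,                         // visible_keys pattern
        "PRIORITY|SYSLOG_IDENTIFIER", // facet_keys pattern (hypothetical keys)
        NULL);                        // non_facet_keys pattern

    // page backwards from the newest entries; zero anchors = first page
    facets_set_anchor(facets, 0, 0, FACETS_ANCHOR_DIRECTION_BACKWARD);
    facets_set_timeframe_and_histogram_by_name(facets, "PRIORITY", after_ut, before_ut);

    facets_rows_begin(facets);

    // for each log entry: one call per name-value pair, then close the row
    facets_add_key_value(facets, "PRIORITY", "6");
    facets_add_key_value(facets, "SYSLOG_IDENTIFIER", "systemd");
    facets_add_key_value(facets, "MESSAGE", "an example log line");
    facets_row_finished(facets, before_ut - USEC_PER_SEC);

    // facets_report() emits JSON members, so the buffer must be JSON-initialized
    BUFFER *wb = buffer_create(0, NULL);
    buffer_json_initialize(wb, "\"", "\"", 0, true, BUFFER_JSON_OPTIONS_DEFAULT);
    facets_report(facets, wb, NULL);
    buffer_json_finalize(wb);

    // ... send buffer_tostring(wb) to the requester ...

    buffer_free(wb);
    facets_destroy(facets);
}
```

For subsequent pages, the caller would pass the timestamp of the last returned row as the anchor in `facets_set_anchor()`; `facets_is_entry_within_anchor()` above then skips rows on the wrong side of the paging direction.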
+static void facets_row_keep(FACETS *facets, usec_t usec) { + facets->operations.rows.matched++; + + if(unlikely(!facets->base)) { + // the first row to keep + facets_row_keep_first_entry(facets, usec); + return; + } + + FACET_ROW *closest = facets_row_keep_seek_to_position(facets, usec); + FACET_ROW *to_replace = NULL; + + if(likely(facets->items_to_return >= facets->max_items_to_return)) { + // we have enough items to return already + + switch(facets->anchor.direction) { + default: + case FACETS_ANCHOR_DIRECTION_BACKWARD: + if(closest == facets->base->prev && usec < closest->usec) { + // this is to the end of the list, belonging to the next page + facets->operations.skips_after++; + return; + } + + // it seems we need to remove an item - the last one + to_replace = facets->base->prev; + if(closest == to_replace) + closest = to_replace->prev; + + break; + + case FACETS_ANCHOR_DIRECTION_FORWARD: + if(closest == facets->base && usec > closest->usec) { + // this is to the beginning of the list, belonging to the next page + facets->operations.skips_before++; + return; + } + + // it seems we need to remove an item - the first one + to_replace = facets->base; + if(closest == to_replace) + closest = to_replace->next; + + break; + } + + facets->operations.shifts++; + facets->items_to_return--; + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(facets->base, to_replace, prev, next); + } + + internal_fatal(!closest, "FACETS: closest cannot be NULL"); + internal_fatal(closest == to_replace, "FACETS: closest cannot be the same as to_replace"); + + facets->operations.last_added = facets_row_create(facets, usec, to_replace); + + if(usec < closest->usec) { + DOUBLE_LINKED_LIST_INSERT_ITEM_AFTER_UNSAFE(facets->base, closest, facets->operations.last_added, prev, next); + facets->operations.appends++; + } + else { + DOUBLE_LINKED_LIST_INSERT_ITEM_BEFORE_UNSAFE(facets->base, closest, facets->operations.last_added, prev, next); + facets->operations.prepends++; + } + + facets->items_to_return++; +} + +static inline void facets_reset_key(FACET_KEY *k) { + k->key_found_in_row = 0; + k->key_values_selected_in_row = 0; + k->current_value.flags = FACET_KEY_VALUE_NONE; + k->current_value.hash = FACETS_HASH_ZERO; +} + +static void facets_reset_keys_with_value_and_row(FACETS *facets) { + size_t entries = facets->keys_in_row.used; + + for(size_t p = 0; p < entries ;p++) { + FACET_KEY *k = facets->keys_in_row.array[p]; + facets_reset_key(k); + } + + facets->current_row.severity = FACET_ROW_SEVERITY_NORMAL; + facets->current_row.keys_matched_by_query_positive = 0; + facets->current_row.keys_matched_by_query_negative = 0; + facets->keys_in_row.used = 0; +} + +void facets_rows_begin(FACETS *facets) { + FACET_KEY *k; + foreach_key_in_facets(facets, k) { + facets_reset_key(k); + } + foreach_key_in_facets_done(k); + + facets->keys_in_row.used = 0; + facets_reset_keys_with_value_and_row(facets); +} + +bool facets_row_finished(FACETS *facets, usec_t usec) { + facets->operations.rows.evaluated++; + + if(unlikely((facets->query && facets->keys_filtered_by_query && + (!facets->current_row.keys_matched_by_query_positive || facets->current_row.keys_matched_by_query_negative)) || + (facets->timeframe.before_ut && usec > facets->timeframe.before_ut) || + (facets->timeframe.after_ut && usec < facets->timeframe.after_ut))) { + // this row is not useful + // 1. not matched by full text search, or + // 2. 
not in our timeframe + facets_reset_keys_with_value_and_row(facets); + return false; + } + + bool within_anchor = facets_is_entry_within_anchor(facets, usec); + if(unlikely(!within_anchor && (facets->options & FACETS_OPTION_DATA_ONLY))) { + facets_reset_keys_with_value_and_row(facets); + return false; + } + + size_t entries = facets->keys_with_values.used; + size_t total_keys = 0; + size_t selected_keys = 0; + + for(size_t p = 0; p < entries ;p++) { + FACET_KEY *k = facets->keys_with_values.array[p]; + + if(!facet_key_value_updated(k)) { + // put the FACET_VALUE_UNSET value into it + facets_key_set_empty_value(facets, k); + } + + total_keys++; + + if(k->key_values_selected_in_row) + selected_keys++; + + if(unlikely(!facets->histogram.key && facets->histogram.hash == k->hash)) + facets->histogram.key = k; + } + + if(selected_keys >= total_keys - 1) { + size_t found = 0; + (void) found; + + for(size_t p = 0; p < entries; p++) { + FACET_KEY *k = facets->keys_with_values.array[p]; + + size_t counted_by = selected_keys; + + if(counted_by != total_keys && !k->key_values_selected_in_row) + counted_by++; + + if(counted_by == total_keys) { + k->current_value.v->final_facet_value_counter++; + found++; + } + } + + internal_fatal(!found, "We should find at least one facet to count this row"); + } + + if(selected_keys == total_keys) { + // we need to keep this row + facets_histogram_update_value(facets, usec); + + if(within_anchor) + facets_row_keep(facets, usec); + } + + facets_reset_keys_with_value_and_row(facets); + + return selected_keys == total_keys; +} + +// ---------------------------------------------------------------------------- +// output + +const char *facets_severity_to_string(FACET_ROW_SEVERITY severity) { + switch(severity) { + default: + case FACET_ROW_SEVERITY_NORMAL: + return "normal"; + + case FACET_ROW_SEVERITY_DEBUG: + return "debug"; + + case FACET_ROW_SEVERITY_NOTICE: + return "notice"; + + case FACET_ROW_SEVERITY_WARNING: + return "warning"; + + case FACET_ROW_SEVERITY_CRITICAL: + return "critical"; + } +} + +void facets_accepted_parameters_to_json_array(FACETS *facets, BUFFER *wb, bool with_keys) { + buffer_json_member_add_array(wb, "accepted_params"); + { + if(facets->accepted_params) { + void *t; + dfe_start_read(facets->accepted_params, t) { + buffer_json_add_array_item_string(wb, t_dfe.name); + } + dfe_done(t); + } + + if(with_keys) { + FACET_KEY *k; + foreach_key_in_facets(facets, k){ + if (!k->values.enabled) + continue; + + buffer_json_add_array_item_string(wb, hash_to_static_string(k->hash)); + } + foreach_key_in_facets_done(k); + } + } + buffer_json_array_close(wb); // accepted_params +} + +static int facets_keys_reorder_compar(const void *a, const void *b) { + const FACET_KEY *ak = *((const FACET_KEY **)a); + const FACET_KEY *bk = *((const FACET_KEY **)b); + + const char *an = ak->name; + const char *bn = bk->name; + + if(!an) an = "0"; + if(!bn) bn = "0"; + + while(*an && ispunct((uint8_t)*an)) an++; + while(*bn && ispunct((uint8_t)*bn)) bn++; + + return strcasecmp(an, bn); +} + +void facets_sort_and_reorder_keys(FACETS *facets) { + size_t entries = facets->keys_with_values.used; + if(!entries) + return; + + FACET_KEY *keys[entries]; + memcpy(keys, facets->keys_with_values.array, sizeof(FACET_KEY *) * entries); + + qsort(keys, entries, sizeof(FACET_KEY *), facets_keys_reorder_compar); + + for(size_t i = 0; i < entries ;i++) + keys[i]->order = i + 1; +} + +static int facets_key_values_reorder_by_name_compar(const void *a, const void *b) { + const FACET_VALUE *av = 
*((const FACET_VALUE **)a); + const FACET_VALUE *bv = *((const FACET_VALUE **)b); + + const char *an = (av->name && av->name_len) ? av->name : "0"; + const char *bn = (bv->name && bv->name_len) ? bv->name : "0"; + + while(*an && ispunct((uint8_t)*an)) an++; + while(*bn && ispunct((uint8_t)*bn)) bn++; + + int ret = strcasecmp(an, bn); + return ret; +} + +static int facets_key_values_reorder_by_count_compar(const void *a, const void *b) { + const FACET_VALUE *av = *((const FACET_VALUE **)a); + const FACET_VALUE *bv = *((const FACET_VALUE **)b); + + if(av->final_facet_value_counter < bv->final_facet_value_counter) + return 1; + + if(av->final_facet_value_counter > bv->final_facet_value_counter) + return -1; + + return facets_key_values_reorder_by_name_compar(a, b); +} + +static int facets_key_values_reorder_by_name_numeric_compar(const void *a, const void *b) { + const FACET_VALUE *av = *((const FACET_VALUE **)a); + const FACET_VALUE *bv = *((const FACET_VALUE **)b); + + const char *an = (av->name && av->name_len) ? av->name : "0"; + const char *bn = (bv->name && bv->name_len) ? bv->name : "0"; + + if(strcmp(an, FACET_VALUE_UNSET) == 0) an = "0"; + if(strcmp(bn, FACET_VALUE_UNSET) == 0) bn = "0"; + + int64_t ad = str2ll(an, NULL); + int64_t bd = str2ll(bn, NULL); + + if(ad < bd) + return -1; + + if(ad > bd) + return 1; + + return facets_key_values_reorder_by_name_compar(a, b); +} + +static uint32_t facets_sort_and_reorder_values_internal(FACET_KEY *k) { + bool all_values_numeric = true; + size_t entries = k->values.used; + FACET_VALUE *values[entries], *v; + uint32_t used = 0; + foreach_value_in_key(k, v) { + if((k->facets->options & FACETS_OPTION_DONT_SEND_EMPTY_VALUE_FACETS) && v->empty) + continue; + + if(all_values_numeric && !v->empty && v->name && v->name_len) { + const char *s = v->name; + while(isdigit((uint8_t)*s)) s++; + if(*s != '\0') + all_values_numeric = false; + } + + values[used++] = v; + + if(used >= entries) + break; + } + foreach_value_in_key_done(v); + + if(!used) + return 0; + + if(k->facets->options & FACETS_OPTION_SORT_FACETS_ALPHABETICALLY) { + if(all_values_numeric) + qsort(values, used, sizeof(FACET_VALUE *), facets_key_values_reorder_by_name_numeric_compar); + else + qsort(values, used, sizeof(FACET_VALUE *), facets_key_values_reorder_by_name_compar); + } + else + qsort(values, used, sizeof(FACET_VALUE *), facets_key_values_reorder_by_count_compar); + + for(size_t i = 0; i < used; i++) + values[i]->order = i + 1; + + return used; +} + +static uint32_t facets_sort_and_reorder_values(FACET_KEY *k) { + if(!k->values.enabled || !k->values.ll || !k->values.used) + return 0; + + if(!k->transform.cb || !k->transform.view_only || !(k->facets->options & FACETS_OPTION_SORT_FACETS_ALPHABETICALLY)) + return facets_sort_and_reorder_values_internal(k); + + // we have a transformation and has to be sorted alphabetically + + BUFFER *tb = buffer_create(0, NULL); + uint32_t ret = 0; + + size_t entries = k->values.used; + struct { + const char *name; + uint32_t name_len; + } values[entries]; + FACET_VALUE *v; + uint32_t used = 0; + + foreach_value_in_key(k, v) { + if(used >= entries) + break; + + values[used].name = v->name; + values[used].name_len = v->name_len; + used++; + + facets_key_value_transformed(k->facets, k, v, tb, FACETS_TRANSFORM_FACET_SORT); + v->name = strdupz(buffer_tostring(tb)); + v->name_len = buffer_strlen(tb); + } + foreach_value_in_key_done(v); + + ret = facets_sort_and_reorder_values_internal(k); + + used = 0; + foreach_value_in_key(k, v) { + if(used >= entries) 
+ break; + + freez((void *)v->name); + v->name = values[used].name; + v->name_len = values[used].name_len; + used++; + } + foreach_value_in_key_done(v); + + buffer_free(tb); + return ret; +} + +void facets_table_config(BUFFER *wb) { + buffer_json_member_add_boolean(wb, "show_ids", false); // do not show the column ids to the user + buffer_json_member_add_boolean(wb, "has_history", true); // enable date-time picker with after-before + + buffer_json_member_add_object(wb, "pagination"); + { + buffer_json_member_add_boolean(wb, "enabled", true); + buffer_json_member_add_string(wb, "key", "anchor"); + buffer_json_member_add_string(wb, "column", "timestamp"); + buffer_json_member_add_string(wb, "units", "timestamp_usec"); + } + buffer_json_object_close(wb); // pagination +} + +void facets_report(FACETS *facets, BUFFER *wb, DICTIONARY *used_hashes_registry) { + facets->report.used_hashes_registry = used_hashes_registry; + + if(!(facets->options & FACETS_OPTION_DATA_ONLY)) { + facets_table_config(wb); + facets_accepted_parameters_to_json_array(facets, wb, true); + } + + // ------------------------------------------------------------------------ + // facets + + if(!(facets->options & FACETS_OPTION_DONT_SEND_FACETS)) { + bool show_facets = false; + + if(facets->options & FACETS_OPTION_DATA_ONLY) { + if(facets->options & FACETS_OPTION_SHOW_DELTAS) { + buffer_json_member_add_array(wb, "facets_delta"); + show_facets = true; + } + } + else { + buffer_json_member_add_array(wb, "facets"); + show_facets = true; + } + + if(show_facets) { + CLEAN_BUFFER *tb = buffer_create(0, NULL); + FACET_KEY *k; + foreach_key_in_facets(facets, k) { + if(!k->values.enabled) + continue; + + if(!facets_sort_and_reorder_values(k)) + // no values for this key + continue; + + buffer_json_add_array_item_object(wb); // key + { + buffer_json_member_add_string(wb, "id", hash_to_static_string(k->hash)); + buffer_json_member_add_string(wb, "name", facets_key_name_cached(k + , facets->report.used_hashes_registry + )); + + if(!k->order) k->order = facets->order++; + buffer_json_member_add_uint64(wb, "order", k->order); + + buffer_json_member_add_array(wb, "options"); + { + FACET_VALUE *v; + foreach_value_in_key(k, v) { + if((facets->options & FACETS_OPTION_DONT_SEND_EMPTY_VALUE_FACETS) && v->empty) + continue; + + if(v->unsampled || v->estimated) + continue; + + buffer_json_add_array_item_object(wb); + { + buffer_json_member_add_string(wb, "id", hash_to_static_string(v->hash)); + + facets_key_value_transformed(facets, k, v, tb, FACETS_TRANSFORM_FACET); + buffer_json_member_add_string(wb, "name", buffer_tostring(tb)); + buffer_json_member_add_uint64(wb, "count", v->final_facet_value_counter); + buffer_json_member_add_uint64(wb, "order", v->order); + } + buffer_json_object_close(wb); + } + foreach_value_in_key_done(v); + } + buffer_json_array_close(wb); // options + } + buffer_json_object_close(wb); // key + } + foreach_key_in_facets_done(k); + buffer_json_array_close(wb); // facets + } + } + + // ------------------------------------------------------------------------ + // columns + + buffer_json_member_add_object(wb, "columns"); + { + size_t field_id = 0; + buffer_rrdf_table_add_field( + wb, field_id++, + "timestamp", "Timestamp", + RRDF_FIELD_TYPE_TIMESTAMP, + RRDF_FIELD_VISUAL_VALUE, + RRDF_FIELD_TRANSFORM_DATETIME_USEC, 0, NULL, NAN, + RRDF_FIELD_SORT_DESCENDING|RRDF_FIELD_SORT_FIXED, + NULL, + RRDF_FIELD_SUMMARY_COUNT, + RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_WRAP | RRDF_FIELD_OPTS_VISIBLE | RRDF_FIELD_OPTS_UNIQUE_KEY, + NULL); 
+ + buffer_rrdf_table_add_field( + wb, field_id++, + "rowOptions", "rowOptions", + RRDF_FIELD_TYPE_NONE, + RRDR_FIELD_VISUAL_ROW_OPTIONS, + RRDF_FIELD_TRANSFORM_NONE, 0, NULL, NAN, + RRDF_FIELD_SORT_FIXED, + NULL, + RRDF_FIELD_SUMMARY_COUNT, + RRDF_FIELD_FILTER_NONE, + RRDF_FIELD_OPTS_DUMMY, + NULL); + + FACET_KEY *k; + foreach_key_in_facets(facets, k) { + RRDF_FIELD_OPTIONS options = RRDF_FIELD_OPTS_WRAP; + bool visible = k->options & (FACET_KEY_OPTION_VISIBLE | FACET_KEY_OPTION_STICKY); + + if ((facets->options & FACETS_OPTION_ALL_FACETS_VISIBLE && k->values.enabled)) + visible = true; + + if (!visible) + visible = simple_pattern_matches(facets->visible_keys, k->name); + + if (visible) + options |= RRDF_FIELD_OPTS_VISIBLE; + + if (k->options & FACET_KEY_OPTION_MAIN_TEXT) + options |= RRDF_FIELD_OPTS_FULL_WIDTH | RRDF_FIELD_OPTS_WRAP; + + if (k->options & FACET_KEY_OPTION_EXPANDED_FILTER) + options |= RRDF_FIELD_OPTS_EXPANDED_FILTER; + + const char *hash_str = hash_to_static_string(k->hash); + + buffer_rrdf_table_add_field( + wb, field_id++, + hash_str, k->name ? k->name : hash_str, + RRDF_FIELD_TYPE_STRING, + (k->options & FACET_KEY_OPTION_RICH_TEXT) ? RRDF_FIELD_VISUAL_RICH : RRDF_FIELD_VISUAL_VALUE, + RRDF_FIELD_TRANSFORM_NONE, 0, NULL, NAN, + RRDF_FIELD_SORT_FIXED, + NULL, + RRDF_FIELD_SUMMARY_COUNT, + (k->options & FACET_KEY_OPTION_NEVER_FACET) ? RRDF_FIELD_FILTER_NONE : RRDF_FIELD_FILTER_FACET, + options, FACET_VALUE_UNSET); + } + foreach_key_in_facets_done(k); + } + buffer_json_object_close(wb); // columns + + // ------------------------------------------------------------------------ + // rows data + + buffer_json_member_add_array(wb, "data"); + { + usec_t last_usec = 0; (void)last_usec; + + for(FACET_ROW *row = facets->base ; row ;row = row->next) { + + internal_fatal( + facets->anchor.start_ut && ( + (facets->anchor.direction == FACETS_ANCHOR_DIRECTION_BACKWARD && row->usec >= facets->anchor.start_ut) || + (facets->anchor.direction == FACETS_ANCHOR_DIRECTION_FORWARD && row->usec <= facets->anchor.start_ut) + ), "Wrong data returned related to %s start anchor!", facets->anchor.direction == FACETS_ANCHOR_DIRECTION_FORWARD ? 
"forward" : "backward"); + + internal_fatal(last_usec && row->usec > last_usec, "Wrong order of data returned!"); + + last_usec = row->usec; + + buffer_json_add_array_item_array(wb); // each row + buffer_json_add_array_item_uint64(wb, row->usec); + buffer_json_add_array_item_object(wb); + { + if(facets->severity.cb) + row->severity = facets->severity.cb(facets, row, facets->severity.data); + + buffer_json_member_add_string(wb, "severity", facets_severity_to_string(row->severity)); + } + buffer_json_object_close(wb); + + FACET_KEY *k; + foreach_key_in_facets(facets, k) { + FACET_ROW_KEY_VALUE *rkv = dictionary_get(row->dict, k->name); + + if(unlikely(k->dynamic.cb)) { + if(unlikely(!rkv)) + rkv = dictionary_set(row->dict, k->name, NULL, sizeof(*rkv)); + + k->dynamic.cb(facets, wb, rkv, row, k->dynamic.data); + facets->operations.values.dynamic++; + } + else { + if(!rkv || rkv->empty) { + buffer_json_add_array_item_string(wb, NULL); + } + else if(unlikely(k->transform.cb && k->transform.view_only)) { + k->transform.cb(facets, rkv->wb, FACETS_TRANSFORM_DATA, k->transform.data); + buffer_json_add_array_item_string(wb, buffer_tostring(rkv->wb)); + } + else + buffer_json_add_array_item_string(wb, buffer_tostring(rkv->wb)); + } + } + foreach_key_in_facets_done(k); + buffer_json_array_close(wb); // each row + } + } + buffer_json_array_close(wb); // data + + if(!(facets->options & FACETS_OPTION_DATA_ONLY)) { + buffer_json_member_add_string(wb, "default_sort_column", "timestamp"); + buffer_json_member_add_array(wb, "default_charts"); + buffer_json_array_close(wb); + } + + // ------------------------------------------------------------------------ + // histogram + + if(facets->histogram.enabled && !(facets->options & FACETS_OPTION_DONT_SEND_HISTOGRAM)) { + FACETS_HASH first_histogram_hash = 0; + buffer_json_member_add_array(wb, "available_histograms"); + { + FACET_KEY *k; + foreach_key_in_facets(facets, k) { + if (!k->values.enabled) + continue; + + if(unlikely(!first_histogram_hash)) + first_histogram_hash = k->hash; + + buffer_json_add_array_item_object(wb); + buffer_json_member_add_string(wb, "id", hash_to_static_string(k->hash)); + buffer_json_member_add_string(wb, "name", k->name); + buffer_json_member_add_uint64(wb, "order", k->order); + buffer_json_object_close(wb); + } + foreach_key_in_facets_done(k); + } + buffer_json_array_close(wb); + + { + FACET_KEY *k = FACETS_KEY_GET_FROM_INDEX(facets, facets->histogram.hash); + if(!k || !k->values.enabled) + k = FACETS_KEY_GET_FROM_INDEX(facets, first_histogram_hash); + + bool show_histogram = false; + + if(facets->options & FACETS_OPTION_DATA_ONLY) { + if(facets->options & FACETS_OPTION_SHOW_DELTAS) { + buffer_json_member_add_object(wb, "histogram_delta"); + show_histogram = true; + } + } + else { + buffer_json_member_add_object(wb, "histogram"); + show_histogram = true; + } + + if(show_histogram) { + buffer_json_member_add_string(wb, "id", k ? hash_to_static_string(k->hash) : ""); + buffer_json_member_add_string(wb, "name", k ? 
k->name : ""); + buffer_json_member_add_object(wb, "chart"); + { + facets_histogram_generate(facets, k, wb); + } + buffer_json_object_close(wb); // chart + buffer_json_object_close(wb); // histogram + } + } + } + + // ------------------------------------------------------------------------ + // items + + bool show_items = false; + if(facets->options & FACETS_OPTION_DATA_ONLY) { + if(facets->options & FACETS_OPTION_SHOW_DELTAS) { + buffer_json_member_add_object(wb, "items_delta"); + show_items = true; + } + } + else { + buffer_json_member_add_object(wb, "items"); + show_items = true; + } + + if(show_items) { + buffer_json_member_add_uint64(wb, "evaluated", facets->operations.rows.evaluated); + buffer_json_member_add_uint64(wb, "matched", facets->operations.rows.matched); + buffer_json_member_add_uint64(wb, "unsampled", facets->operations.rows.unsampled); + buffer_json_member_add_uint64(wb, "estimated", facets->operations.rows.estimated); + buffer_json_member_add_uint64(wb, "returned", facets->items_to_return); + buffer_json_member_add_uint64(wb, "max_to_return", facets->max_items_to_return); + buffer_json_member_add_uint64(wb, "before", facets->operations.skips_before); + buffer_json_member_add_uint64(wb, "after", facets->operations.skips_after + facets->operations.shifts); + buffer_json_object_close(wb); // items + } + + // ------------------------------------------------------------------------ + // stats + + buffer_json_member_add_object(wb, "_stats"); + { + buffer_json_member_add_uint64(wb, "first", facets->operations.first); + buffer_json_member_add_uint64(wb, "forwards", facets->operations.forwards); + buffer_json_member_add_uint64(wb, "backwards", facets->operations.backwards); + buffer_json_member_add_uint64(wb, "skips_before", facets->operations.skips_before); + buffer_json_member_add_uint64(wb, "skips_after", facets->operations.skips_after); + buffer_json_member_add_uint64(wb, "prepends", facets->operations.prepends); + buffer_json_member_add_uint64(wb, "appends", facets->operations.appends); + buffer_json_member_add_uint64(wb, "shifts", facets->operations.shifts); + buffer_json_member_add_object(wb, "rows"); + { + buffer_json_member_add_uint64(wb, "created", facets->operations.rows.created); + buffer_json_member_add_uint64(wb, "reused", facets->operations.rows.reused); + buffer_json_member_add_uint64(wb, "evaluated", facets->operations.rows.evaluated); + buffer_json_member_add_uint64(wb, "matched", facets->operations.rows.matched); + } + buffer_json_object_close(wb); // rows + buffer_json_member_add_object(wb, "keys"); + { + size_t resizes = 0, searches = 0, collisions = 0, used = 0, size = 0, count = 0; + count++; + used += facets->keys.ht.used; + size += facets->keys.ht.size; + resizes += facets->keys.ht.resizes; + searches += facets->keys.ht.searches; + collisions += facets->keys.ht.collisions; + + buffer_json_member_add_uint64(wb, "registered", facets->operations.keys.registered); + buffer_json_member_add_uint64(wb, "unique", facets->operations.keys.unique); + buffer_json_member_add_uint64(wb, "hashtables", count); + buffer_json_member_add_uint64(wb, "hashtable_used", used); + buffer_json_member_add_uint64(wb, "hashtable_size", size); + buffer_json_member_add_uint64(wb, "hashtable_searches", searches); + buffer_json_member_add_uint64(wb, "hashtable_collisions", collisions); + buffer_json_member_add_uint64(wb, "hashtable_resizes", resizes); + } + buffer_json_object_close(wb); // keys + buffer_json_member_add_object(wb, "values"); + { + size_t resizes = 0, searches = 0, 
collisions = 0, used = 0, size = 0, count = 0; + for(FACET_KEY *k = facets->keys.ll; k ; k = k->next) { + count++; + used += k->values.ht.used; + size += k->values.ht.size; + resizes += k->values.ht.resizes; + searches += k->values.ht.searches; + collisions += k->values.ht.collisions; + } + + buffer_json_member_add_uint64(wb, "registered", facets->operations.values.registered); + buffer_json_member_add_uint64(wb, "transformed", facets->operations.values.transformed); + buffer_json_member_add_uint64(wb, "dynamic", facets->operations.values.dynamic); + buffer_json_member_add_uint64(wb, "empty", facets->operations.values.empty); + buffer_json_member_add_uint64(wb, "unsampled", facets->operations.values.unsampled); + buffer_json_member_add_uint64(wb, "estimated", facets->operations.values.estimated); + buffer_json_member_add_uint64(wb, "indexed", facets->operations.values.indexed); + buffer_json_member_add_uint64(wb, "inserts", facets->operations.values.inserts); + buffer_json_member_add_uint64(wb, "conflicts", facets->operations.values.conflicts); + buffer_json_member_add_uint64(wb, "hashtables", count); + buffer_json_member_add_uint64(wb, "hashtable_used", used); + buffer_json_member_add_uint64(wb, "hashtable_size", size); + buffer_json_member_add_uint64(wb, "hashtable_searches", searches); + buffer_json_member_add_uint64(wb, "hashtable_collisions", collisions); + buffer_json_member_add_uint64(wb, "hashtable_resizes", resizes); + } + buffer_json_object_close(wb); // values + buffer_json_member_add_object(wb, "fts"); + { + buffer_json_member_add_uint64(wb, "searches", facets->operations.fts.searches); + } + buffer_json_object_close(wb); // fts + } + buffer_json_object_close(wb); // items +} diff --git a/src/libnetdata/facets/facets.h b/src/libnetdata/facets/facets.h new file mode 100644 index 00000000..8364d861 --- /dev/null +++ b/src/libnetdata/facets/facets.h @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef FACETS_H +#define FACETS_H 1 + +#include "../libnetdata.h" + +#define FACET_VALUE_UNSET "-" +#define FACET_VALUE_UNSAMPLED "[unsampled]" +#define FACET_VALUE_ESTIMATED "[estimated]" + +typedef enum __attribute__((packed)) { + FACETS_ANCHOR_DIRECTION_FORWARD, + FACETS_ANCHOR_DIRECTION_BACKWARD, +} FACETS_ANCHOR_DIRECTION; + +typedef enum __attribute__((packed)) { + FACETS_TRANSFORM_VALUE, + FACETS_TRANSFORM_HISTOGRAM, + FACETS_TRANSFORM_FACET, + FACETS_TRANSFORM_DATA, + FACETS_TRANSFORM_FACET_SORT, +} FACETS_TRANSFORMATION_SCOPE; + +typedef enum __attribute__((packed)) { + FACET_KEY_OPTION_FACET = (1 << 0), // filterable values + FACET_KEY_OPTION_NO_FACET = (1 << 1), // non-filterable value + FACET_KEY_OPTION_NEVER_FACET = (1 << 2), // never enable this field as facet + FACET_KEY_OPTION_STICKY = (1 << 3), // should be sticky in the table + FACET_KEY_OPTION_VISIBLE = (1 << 4), // should be in the default table + FACET_KEY_OPTION_FTS = (1 << 5), // the key is filterable by full text search (FTS) + FACET_KEY_OPTION_MAIN_TEXT = (1 << 6), // full width and wrap + FACET_KEY_OPTION_RICH_TEXT = (1 << 7), + FACET_KEY_OPTION_REORDER = (1 << 8), // give the key a new order id on first encounter + FACET_KEY_OPTION_TRANSFORM_VIEW = (1 << 9), // when registering the transformation, do it only at the view, not on all data + FACET_KEY_OPTION_EXPANDED_FILTER = (1 << 10), // the presentation should have this filter expanded by default +} FACET_KEY_OPTIONS; + +typedef enum __attribute__((packed)) { + FACET_ROW_SEVERITY_DEBUG, // lowest - not important + FACET_ROW_SEVERITY_NORMAL, 
// the default + FACET_ROW_SEVERITY_NOTICE, // bold + FACET_ROW_SEVERITY_WARNING, // yellow + bold + FACET_ROW_SEVERITY_CRITICAL, // red + bold +} FACET_ROW_SEVERITY; + +typedef struct facet_row_key_value { + const char *tmp; + uint32_t tmp_len; + bool empty; + BUFFER *wb; +} FACET_ROW_KEY_VALUE; + +typedef struct facet_row { + usec_t usec; + DICTIONARY *dict; + FACET_ROW_SEVERITY severity; + struct facet_row *prev, *next; +} FACET_ROW; + +typedef struct facets FACETS; +typedef struct facet_key FACET_KEY; + +typedef void (*facets_key_transformer_t)(FACETS *facets __maybe_unused, BUFFER *wb, FACETS_TRANSFORMATION_SCOPE scope, void *data); +typedef void (*facet_dynamic_row_t)(FACETS *facets, BUFFER *json_array, FACET_ROW_KEY_VALUE *rkv, FACET_ROW *row, void *data); +typedef FACET_ROW_SEVERITY (*facet_row_severity_t)(FACETS *facets, FACET_ROW *row, void *data); +FACET_KEY *facets_register_dynamic_key_name(FACETS *facets, const char *key, FACET_KEY_OPTIONS options, facet_dynamic_row_t cb, void *data); +FACET_KEY *facets_register_key_name_transformation(FACETS *facets, const char *key, FACET_KEY_OPTIONS options, facets_key_transformer_t cb, void *data); +void facets_register_row_severity(FACETS *facets, facet_row_severity_t cb, void *data); + +typedef enum __attribute__((packed)) { + FACETS_OPTION_ALL_FACETS_VISIBLE = (1 << 0), // all facets should be visible by default in the table + FACETS_OPTION_ALL_KEYS_FTS = (1 << 1), // all keys are searchable by full text search + FACETS_OPTION_DONT_SEND_FACETS = (1 << 2), // "facets" object will not be included in the report + FACETS_OPTION_DONT_SEND_HISTOGRAM = (1 << 3), // "histogram" object will not be included in the report + FACETS_OPTION_DATA_ONLY = (1 << 4), + FACETS_OPTION_DONT_SEND_EMPTY_VALUE_FACETS = (1 << 5), // empty facet values will not be included in the report + FACETS_OPTION_SORT_FACETS_ALPHABETICALLY = (1 << 6), + FACETS_OPTION_SHOW_DELTAS = (1 << 7), +} FACETS_OPTIONS; + +FACETS *facets_create(uint32_t items_to_return, FACETS_OPTIONS options, const char *visible_keys, const char *facet_keys, const char *non_facet_keys); +void facets_destroy(FACETS *facets); + +void facets_accepted_param(FACETS *facets, const char *param); + +void facets_rows_begin(FACETS *facets); +bool facets_row_finished(FACETS *facets, usec_t usec); + +void facets_row_finished_unsampled(FACETS *facets, usec_t usec); +void facets_update_estimations(FACETS *facets, usec_t from_ut, usec_t to_ut, size_t entries); +size_t facets_histogram_slots(FACETS *facets); + +FACET_KEY *facets_register_key_name(FACETS *facets, const char *key, FACET_KEY_OPTIONS options); +void facets_set_query(FACETS *facets, const char *query); +void facets_set_items(FACETS *facets, uint32_t items); +void facets_set_anchor(FACETS *facets, usec_t start_ut, usec_t stop_ut, FACETS_ANCHOR_DIRECTION direction); +void facets_enable_slice_mode(FACETS *facets); +bool facets_row_candidate_to_keep(FACETS *facets, usec_t usec); + +FACET_KEY *facets_register_facet_id(FACETS *facets, const char *key_id, FACET_KEY_OPTIONS options); +void facets_register_facet_id_filter(FACETS *facets, const char *key_id, char *value_id, FACET_KEY_OPTIONS options); +void facets_set_timeframe_and_histogram_by_id(FACETS *facets, const char *key_id, usec_t after_ut, usec_t before_ut); +void facets_set_timeframe_and_histogram_by_name(FACETS *facets, const char *key_name, usec_t after_ut, usec_t before_ut); + +void facets_add_key_value(FACETS *facets, const char *key, const char *value); +void facets_add_key_value_length(FACETS 
*facets, const char *key, size_t key_len, const char *value, size_t value_len); + +void facets_report(FACETS *facets, BUFFER *wb, DICTIONARY *used_hashes_registry); +void facets_accepted_parameters_to_json_array(FACETS *facets, BUFFER *wb, bool with_keys); +void facets_set_current_row_severity(FACETS *facets, FACET_ROW_SEVERITY severity); +void facets_set_additional_options(FACETS *facets, FACETS_OPTIONS options); + +bool facets_key_name_is_filter(FACETS *facets, const char *key); +bool facets_key_name_is_facet(FACETS *facets, const char *key); +bool facets_key_name_value_length_is_selected(FACETS *facets, const char *key, size_t key_length, const char *value, size_t value_length); +void facets_add_possible_value_name_to_key(FACETS *facets, const char *key, size_t key_length, const char *value, size_t value_length); + +void facets_sort_and_reorder_keys(FACETS *facets); +usec_t facets_row_oldest_ut(FACETS *facets); +usec_t facets_row_newest_ut(FACETS *facets); +uint32_t facets_rows(FACETS *facets); + +void facets_table_config(BUFFER *wb); + +const char *facets_severity_to_string(FACET_ROW_SEVERITY severity); + +#endif diff --git a/src/libnetdata/functions_evloop/README.md b/src/libnetdata/functions_evloop/README.md new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/src/libnetdata/functions_evloop/README.md diff --git a/src/libnetdata/functions_evloop/functions_evloop.c b/src/libnetdata/functions_evloop/functions_evloop.c new file mode 100644 index 00000000..5000d038 --- /dev/null +++ b/src/libnetdata/functions_evloop/functions_evloop.c @@ -0,0 +1,466 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "functions_evloop.h" + +static void functions_evloop_config_cb(const char *transaction, char *function, usec_t *stop_monotonic_ut, + bool *cancelled, BUFFER *payload, HTTP_ACCESS access, + const char *source, void *data); + +struct functions_evloop_worker_job { + bool used; + bool running; + bool cancelled; + usec_t stop_monotonic_ut; + char *cmd; + const char *transaction; + time_t timeout; + + BUFFER *payload; + HTTP_ACCESS access; + const char *source; + + functions_evloop_worker_execute_t cb; + void *cb_data; +}; + +static void worker_job_cleanup(struct functions_evloop_worker_job *j) { + freez((void *)j->cmd); + freez((void *)j->transaction); + freez((void *)j->source); + buffer_free(j->payload); +} + +struct rrd_functions_expectation { + const char *function; + size_t function_length; + functions_evloop_worker_execute_t cb; + void *cb_data; + time_t default_timeout; + struct rrd_functions_expectation *prev, *next; +}; + +struct functions_evloop_globals { + const char *tag; + + DICTIONARY *worker_queue; + pthread_mutex_t worker_mutex; + pthread_cond_t worker_cond_var; + size_t workers; + + netdata_mutex_t *stdout_mutex; + bool *plugin_should_exit; + bool workers_exit; // all workers are waiting on the same condition - this makes them all exit, when any is cancelled + + ND_THREAD *reader_thread; + ND_THREAD **worker_threads; + + struct { + DICTIONARY *nodes; + } dyncfg; + + struct rrd_functions_expectation *expectations; +}; + +static void rrd_functions_worker_canceller(void *data) { + struct functions_evloop_globals *wg = data; + pthread_mutex_lock(&wg->worker_mutex); + wg->workers_exit = true; + pthread_cond_signal(&wg->worker_cond_var); + pthread_mutex_unlock(&wg->worker_mutex); +} + +static void *rrd_functions_worker_globals_worker_main(void *arg) { + struct functions_evloop_globals *wg = arg; + + nd_thread_register_canceller(rrd_functions_worker_canceller, wg); 
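+
+    // How the loop below works: each worker sleeps on worker_cond_var until
+    // worker_add_job() queues a job and signals it. The worker then scans the
+    // shared worker_queue for the first job that is neither running nor
+    // cancelled, marks it running, and executes its callback outside the
+    // mutex. last_acquired makes a worker that found nothing runnable go back
+    // to waiting, instead of spinning on a non-empty queue whose jobs are all
+    // taken by other workers.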
+ + bool last_acquired = true; + while (true) { + pthread_mutex_lock(&wg->worker_mutex); + + if(wg->workers_exit || nd_thread_signaled_to_cancel()) { + pthread_mutex_unlock(&wg->worker_mutex); + break; + } + + if(dictionary_entries(wg->worker_queue) == 0 || !last_acquired) + pthread_cond_wait(&wg->worker_cond_var, &wg->worker_mutex); + + const DICTIONARY_ITEM *acquired = NULL; + struct functions_evloop_worker_job *j; + dfe_start_write(wg->worker_queue, j) { + if(j->running || j->cancelled) + continue; + + acquired = dictionary_acquired_item_dup(wg->worker_queue, j_dfe.item); + j->running = true; + break; + } + dfe_done(j); + + pthread_mutex_unlock(&wg->worker_mutex); + + if(wg->workers_exit || nd_thread_signaled_to_cancel()) { + if(acquired) + dictionary_acquired_item_release(wg->worker_queue, acquired); + + break; + } + + if(acquired) { + ND_LOG_STACK lgs[] = { + ND_LOG_FIELD_TXT(NDF_REQUEST, j->cmd), + ND_LOG_FIELD_END(), + }; + ND_LOG_STACK_PUSH(lgs); + + last_acquired = true; + j = dictionary_acquired_item_value(acquired); + j->cb(j->transaction, j->cmd, &j->stop_monotonic_ut, &j->cancelled, j->payload, j->access, j->source, j->cb_data); + dictionary_del(wg->worker_queue, j->transaction); + dictionary_acquired_item_release(wg->worker_queue, acquired); + dictionary_garbage_collect(wg->worker_queue); + } + else + last_acquired = false; + } + + return NULL; +} + +static void worker_add_job(struct functions_evloop_globals *wg, const char *keyword, char *transaction, char *function, char *timeout_s, BUFFER *payload, const char *access, const char *source) { + if(!transaction || !*transaction || !timeout_s || !*timeout_s || !function || !*function) { + nd_log(NDLS_COLLECTORS, NDLP_ERR, "Received incomplete %s (transaction = '%s', timeout = '%s', function = '%s'). Ignoring it.", + keyword, + transaction?transaction:"(unset)", + timeout_s?timeout_s:"(unset)", + function?function:"(unset)"); + } + else { + int timeout = str2i(timeout_s); + + const char *msg = "No function with this name found"; + bool found = false; + struct rrd_functions_expectation *we; + for(we = wg->expectations; we ;we = we->next) { + if(strncmp(function, we->function, we->function_length) == 0) { + if(timeout <= 0) + timeout = (int)we->default_timeout; + + struct functions_evloop_worker_job t = { + .cmd = strdupz(function), + .transaction = strdupz(transaction), + .running = false, + .cancelled = false, + .timeout = timeout, + .stop_monotonic_ut = now_monotonic_usec() + (timeout * USEC_PER_SEC), + .used = false, + .payload = buffer_dup(payload), + .access = http_access_from_hex(access), + .source = source ? strdupz(source) : NULL, + .cb = we->cb, + .cb_data = we->cb_data, + }; + struct functions_evloop_worker_job *j = dictionary_set(wg->worker_queue, transaction, &t, sizeof(t)); + if(j->used) { + nd_log(NDLS_COLLECTORS, NDLP_WARNING, "Received duplicate function transaction '%s'. Ignoring it.", transaction); + worker_job_cleanup(&t); + msg = "Duplicate function transaction. 
Ignoring it."; + } + else { + found = true; + j->used = true; + pthread_mutex_lock(&wg->worker_mutex); + pthread_cond_signal(&wg->worker_cond_var); + pthread_mutex_unlock(&wg->worker_mutex); + } + } + } + + if(!found) { + netdata_mutex_lock(wg->stdout_mutex); + pluginsd_function_json_error_to_stdout(transaction, HTTP_RESP_NOT_FOUND, msg); + netdata_mutex_unlock(wg->stdout_mutex); + } + } +} + +static void *rrd_functions_worker_globals_reader_main(void *arg) { + struct functions_evloop_globals *wg = arg; + + struct { + size_t last_len; // to remember the last pos - do not use a pointer, the buffer may realloc... + bool enabled; + char *transaction; + char *function; + char *timeout_s; + char *access; + char *source; + char *content_type; + } deferred = { 0 }; + + struct buffered_reader reader = { 0 }; + buffered_reader_init(&reader); + BUFFER *buffer = buffer_create(sizeof(reader.read_buffer) + 2, NULL); + + while(!(*wg->plugin_should_exit)) { + if(unlikely(!buffered_reader_next_line(&reader, buffer))) { + buffered_reader_ret_t ret = buffered_reader_read_timeout( + &reader, + fileno((FILE *)stdin), + 2 * 60 * MSEC_PER_SEC, + false + ); + + if(unlikely(ret != BUFFERED_READER_READ_OK && ret != BUFFERED_READER_READ_POLL_TIMEOUT)) + break; + + continue; + } + + if(deferred.enabled) { + char *s = (char *)buffer_tostring(buffer); + + if(strstr(&s[deferred.last_len], PLUGINSD_CALL_FUNCTION_PAYLOAD_END "\n") != NULL) { + if(deferred.last_len > 0) + // remove the trailing newline from the buffer + deferred.last_len--; + + s[deferred.last_len] = '\0'; + buffer->len = deferred.last_len; + buffer->content_type = content_type_string2id(deferred.content_type); + worker_add_job(wg, + PLUGINSD_CALL_FUNCTION_PAYLOAD_BEGIN, deferred.transaction, deferred.function, + deferred.timeout_s, buffer, deferred.access, deferred.source); + buffer_flush(buffer); + + freez(deferred.transaction); + freez(deferred.function); + freez(deferred.timeout_s); + freez(deferred.access); + freez(deferred.source); + freez(deferred.content_type); + memset(&deferred, 0, sizeof(deferred)); + } + else + deferred.last_len = buffer->len; + + continue; + } + + char *words[MAX_FUNCTION_PARAMETERS] = { NULL }; + size_t num_words = quoted_strings_splitter_pluginsd((char *)buffer_tostring(buffer), words, MAX_FUNCTION_PARAMETERS); + + const char *keyword = get_word(words, num_words, 0); + + if(keyword && (strcmp(keyword, PLUGINSD_CALL_FUNCTION) == 0)) { + char *transaction = get_word(words, num_words, 1); + char *timeout_s = get_word(words, num_words, 2); + char *function = get_word(words, num_words, 3); + char *access = get_word(words, num_words, 4); + char *source = get_word(words, num_words, 5); + worker_add_job(wg, keyword, transaction, function, timeout_s, NULL, access, source); + } + else if(keyword && (strcmp(keyword, PLUGINSD_CALL_FUNCTION_PAYLOAD_BEGIN) == 0)) { + char *transaction = get_word(words, num_words, 1); + char *timeout_s = get_word(words, num_words, 2); + char *function = get_word(words, num_words, 3); + char *access = get_word(words, num_words, 4); + char *source = get_word(words, num_words, 5); + char *content_type = get_word(words, num_words, 6); + + deferred.transaction = strdupz(transaction ? transaction : ""); + deferred.timeout_s = strdupz(timeout_s ? timeout_s : ""); + deferred.function = strdupz(function ? function : ""); + deferred.access = strdupz(access ? access : ""); + deferred.source = strdupz(source ? source : ""); + deferred.content_type = strdupz(content_type ? 
content_type : "");
+            deferred.last_len = 0;
+            deferred.enabled = true;
+        }
+        else if(keyword && strcmp(keyword, PLUGINSD_CALL_FUNCTION_CANCEL) == 0) {
+            char *transaction = get_word(words, num_words, 1);
+            const DICTIONARY_ITEM *acquired = dictionary_get_and_acquire_item(wg->worker_queue, transaction);
+            if(acquired) {
+                struct functions_evloop_worker_job *j = dictionary_acquired_item_value(acquired);
+                __atomic_store_n(&j->cancelled, true, __ATOMIC_RELAXED);
+                dictionary_acquired_item_release(wg->worker_queue, acquired);
+                dictionary_del(wg->worker_queue, transaction);
+                dictionary_garbage_collect(wg->worker_queue);
+            }
+            else
+                nd_log(NDLS_COLLECTORS, NDLP_NOTICE, "Received CANCEL for transaction '%s', but it is not available here", transaction);
+        }
+        else if(keyword && strcmp(keyword, PLUGINSD_CALL_FUNCTION_PROGRESS) == 0) {
+            char *transaction = get_word(words, num_words, 1);
+            const DICTIONARY_ITEM *acquired = dictionary_get_and_acquire_item(wg->worker_queue, transaction);
+            if(acquired) {
+                struct functions_evloop_worker_job *j = dictionary_acquired_item_value(acquired);
+
+                functions_stop_monotonic_update_on_progress(&j->stop_monotonic_ut);
+
+                dictionary_acquired_item_release(wg->worker_queue, acquired);
+            }
+            else
+                nd_log(NDLS_COLLECTORS, NDLP_NOTICE, "Received PROGRESS for transaction '%s', but it is not available here", transaction);
+        }
+        else
+            nd_log(NDLS_COLLECTORS, NDLP_NOTICE, "Received unknown command: %s", keyword?keyword:"(unset)");
+
+        buffer_flush(buffer);
+    }
+
+    if(!(*wg->plugin_should_exit))
+        nd_log(NDLS_COLLECTORS, NDLP_ERR, "Read error on stdin");
+
+    *wg->plugin_should_exit = true;
+    exit(1);
+}
+
+void worker_queue_delete_cb(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *data __maybe_unused) {
+    struct functions_evloop_worker_job *j = value;
+    worker_job_cleanup(j);
+}
+
+struct functions_evloop_globals *functions_evloop_init(size_t worker_threads, const char *tag, netdata_mutex_t *stdout_mutex, bool *plugin_should_exit) {
+    struct functions_evloop_globals *wg = callocz(1, sizeof(struct functions_evloop_globals));
+
+    wg->worker_queue = dictionary_create(DICT_OPTION_DONT_OVERWRITE_VALUE);
+    dictionary_register_delete_callback(wg->worker_queue, worker_queue_delete_cb, NULL);
+
+    wg->dyncfg.nodes = dyncfg_nodes_dictionary_create();
+
+    pthread_mutex_init(&wg->worker_mutex, NULL);
+    pthread_cond_init(&wg->worker_cond_var, NULL);
+
+    wg->plugin_should_exit = plugin_should_exit;
+    wg->stdout_mutex = stdout_mutex;
+    wg->workers = worker_threads;
+    wg->worker_threads = callocz(wg->workers, sizeof(ND_THREAD *));
+    wg->tag = tag;
+
+    char tag_buffer[NETDATA_THREAD_TAG_MAX + 1];
+    snprintfz(tag_buffer, NETDATA_THREAD_TAG_MAX, "%s_READER", wg->tag);
+    wg->reader_thread = nd_thread_create(tag_buffer, NETDATA_THREAD_OPTION_DONT_LOG,
+                                         rrd_functions_worker_globals_reader_main, wg);
+
+    for(size_t i = 0; i < wg->workers ; i++) {
+        snprintfz(tag_buffer, NETDATA_THREAD_TAG_MAX, "%s_WORK[%zu]", wg->tag, i+1);
+        wg->worker_threads[i] = nd_thread_create(tag_buffer, NETDATA_THREAD_OPTION_DONT_LOG,
+                                                 rrd_functions_worker_globals_worker_main, wg);
+    }
+
+    functions_evloop_add_function(wg, "config", functions_evloop_config_cb, 120, wg);
+
+    return wg;
+}
+
+void functions_evloop_add_function(struct functions_evloop_globals *wg, const char *function, functions_evloop_worker_execute_t cb, time_t default_timeout, void *data) {
+    struct rrd_functions_expectation *we = callocz(1, sizeof(*we));
+    we->function = function;
+    we->function_length = strlen(we->function);
+    we->cb =
cb; + we->cb_data = data; + we->default_timeout = default_timeout; + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(wg->expectations, we, prev, next); +} + +void functions_evloop_cancel_threads(struct functions_evloop_globals *wg) { + nd_thread_signal_cancel(wg->reader_thread); + + for(size_t i = 0; i < wg->workers ; i++) + nd_thread_signal_cancel(wg->worker_threads[i]); +} + +// ---------------------------------------------------------------------------- + +static void functions_evloop_config_cb(const char *transaction, char *function, usec_t *stop_monotonic_ut, bool *cancelled, + BUFFER *payload, HTTP_ACCESS access, const char *source, void *data) { + struct functions_evloop_globals *wg = data; + + CLEAN_BUFFER *result = buffer_create(1024, NULL); + int code = dyncfg_node_find_and_call(wg->dyncfg.nodes, transaction, function, stop_monotonic_ut, + cancelled, payload, access, source, result); + + netdata_mutex_lock(wg->stdout_mutex); + pluginsd_function_result_begin_to_stdout(transaction, code, content_type_id2string(result->content_type), result->expires); + printf("%s", buffer_tostring(result)); + pluginsd_function_result_end_to_stdout(); + fflush(stdout); + netdata_mutex_unlock(wg->stdout_mutex); +} + +void functions_evloop_dyncfg_add(struct functions_evloop_globals *wg, const char *id, const char *path, + DYNCFG_STATUS status, DYNCFG_TYPE type, DYNCFG_SOURCE_TYPE source_type, + const char *source, DYNCFG_CMDS cmds, + HTTP_ACCESS view_access, HTTP_ACCESS edit_access, + dyncfg_cb_t cb, void *data) { + + if(!dyncfg_is_valid_id(id)) { + nd_log(NDLS_COLLECTORS, NDLP_ERR, "DYNCFG: id '%s' is invalid. Ignoring dynamic configuration for it.", id); + return; + } + + struct dyncfg_node tmp = { + .cmds = cmds, + .type = type, + .cb = cb, + .data = data, + }; + dictionary_set(wg->dyncfg.nodes, id, &tmp, sizeof(tmp)); + + CLEAN_BUFFER *c = buffer_create(100, NULL); + dyncfg_cmds2buffer(cmds, c); + + netdata_mutex_lock(wg->stdout_mutex); + + fprintf(stdout, + PLUGINSD_KEYWORD_CONFIG " '%s' " PLUGINSD_KEYWORD_CONFIG_ACTION_CREATE " '%s' '%s' '%s' '%s' '%s' '%s' "HTTP_ACCESS_FORMAT" "HTTP_ACCESS_FORMAT"\n", + id, + dyncfg_id2status(status), + dyncfg_id2type(type), path, + dyncfg_id2source_type(source_type), + source, + buffer_tostring(c), + (HTTP_ACCESS_FORMAT_CAST)view_access, + (HTTP_ACCESS_FORMAT_CAST)edit_access + ); + fflush(stdout); + + netdata_mutex_unlock(wg->stdout_mutex); +} + +void functions_evloop_dyncfg_del(struct functions_evloop_globals *wg, const char *id) { + if(!dyncfg_is_valid_id(id)) { + nd_log(NDLS_COLLECTORS, NDLP_ERR, "DYNCFG: id '%s' is invalid. Ignoring dynamic configuration for it.", id); + return; + } + + dictionary_del(wg->dyncfg.nodes, id); + + netdata_mutex_lock(wg->stdout_mutex); + + fprintf(stdout, + PLUGINSD_KEYWORD_CONFIG " %s " PLUGINSD_KEYWORD_CONFIG_ACTION_DELETE "\n", + id); + fflush(stdout); + + netdata_mutex_unlock(wg->stdout_mutex); +} + +void functions_evloop_dyncfg_status(struct functions_evloop_globals *wg, const char *id, DYNCFG_STATUS status) { + if(!dyncfg_is_valid_id(id)) { + nd_log(NDLS_COLLECTORS, NDLP_ERR, "DYNCFG: id '%s' is invalid. 
Ignoring dynamic configuration for it.", id); + return; + } + + netdata_mutex_lock(wg->stdout_mutex); + + fprintf(stdout, + PLUGINSD_KEYWORD_CONFIG " %s " PLUGINSD_KEYWORD_CONFIG_ACTION_STATUS " %s\n", + id, dyncfg_id2status(status)); + + fflush(stdout); + + netdata_mutex_unlock(wg->stdout_mutex); +} diff --git a/src/libnetdata/functions_evloop/functions_evloop.h b/src/libnetdata/functions_evloop/functions_evloop.h new file mode 100644 index 00000000..5c575bd1 --- /dev/null +++ b/src/libnetdata/functions_evloop/functions_evloop.h @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_FUNCTIONS_EVLOOP_H +#define NETDATA_FUNCTIONS_EVLOOP_H + +#include "../libnetdata.h" + +#define MAX_FUNCTION_PARAMETERS 1024 +#define PLUGINS_FUNCTIONS_TIMEOUT_DEFAULT 10 // seconds + +// plugins.d 1st version of the external plugins and streaming protocol +#define PLUGINSD_KEYWORD_CHART "CHART" +#define PLUGINSD_KEYWORD_CHART_DEFINITION_END "CHART_DEFINITION_END" +#define PLUGINSD_KEYWORD_DIMENSION "DIMENSION" +#define PLUGINSD_KEYWORD_BEGIN "BEGIN" +#define PLUGINSD_KEYWORD_SET "SET" +#define PLUGINSD_KEYWORD_END "END" +#define PLUGINSD_KEYWORD_FLUSH "FLUSH" +#define PLUGINSD_KEYWORD_DISABLE "DISABLE" +#define PLUGINSD_KEYWORD_VARIABLE "VARIABLE" +#define PLUGINSD_KEYWORD_LABEL "LABEL" +#define PLUGINSD_KEYWORD_OVERWRITE "OVERWRITE" +#define PLUGINSD_KEYWORD_CLABEL "CLABEL" +#define PLUGINSD_KEYWORD_CLABEL_COMMIT "CLABEL_COMMIT" +#define PLUGINSD_KEYWORD_EXIT "EXIT" + +// high-speed versions of BEGIN, SET, END +#define PLUGINSD_KEYWORD_BEGIN_V2 "BEGIN2" +#define PLUGINSD_KEYWORD_SET_V2 "SET2" +#define PLUGINSD_KEYWORD_END_V2 "END2" + +// super high-speed versions of BEGIN, SET, END have this as first parameter +// enabled with the streaming capability STREAM_CAP_SLOTS +#define PLUGINSD_KEYWORD_SLOT "SLOT" // to change the length of this, update pluginsd_extract_chart_slot() too + +// virtual hosts (only for external plugins - for streaming virtual hosts are like all other hosts) +#define PLUGINSD_KEYWORD_HOST_DEFINE "HOST_DEFINE" +#define PLUGINSD_KEYWORD_HOST_DEFINE_END "HOST_DEFINE_END" +#define PLUGINSD_KEYWORD_HOST_LABEL "HOST_LABEL" +#define PLUGINSD_KEYWORD_HOST "HOST" + +// replication +// enabled with STREAM_CAP_REPLICATION +#define PLUGINSD_KEYWORD_REPLAY_CHART "REPLAY_CHART" +#define PLUGINSD_KEYWORD_REPLAY_BEGIN "RBEGIN" +#define PLUGINSD_KEYWORD_REPLAY_SET "RSET" +#define PLUGINSD_KEYWORD_REPLAY_RRDDIM_STATE "RDSTATE" +#define PLUGINSD_KEYWORD_REPLAY_RRDSET_STATE "RSSTATE" +#define PLUGINSD_KEYWORD_REPLAY_END "REND" + +// plugins.d accepts these for functions (from external plugins or streaming children) +// related to STREAM_CAP_FUNCTIONS, STREAM_CAP_PROGRESS +#define PLUGINSD_KEYWORD_FUNCTION "FUNCTION" // define a function +#define PLUGINSD_KEYWORD_FUNCTION_PROGRESS "FUNCTION_PROGRESS" // send updates about function progress +#define PLUGINSD_KEYWORD_FUNCTION_RESULT_BEGIN "FUNCTION_RESULT_BEGIN" // the result of a function transaction +#define PLUGINSD_KEYWORD_FUNCTION_RESULT_END "FUNCTION_RESULT_END" // the end of the result of a func. trans. 
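+
+// A sketch of one complete function transaction as handled by this event loop
+// (the keywords are the ones defined in this file; the transaction id, timeout,
+// function name, access bits, source and expiration below are made-up example
+// values):
+//
+//   received on stdin:    FUNCTION tx-1 10 "example-function" 0xf "user"
+//   sent on stdout:       FUNCTION_RESULT_BEGIN "tx-1" 200 "application/json" 0
+//                         { ...json payload... }
+//                         FUNCTION_RESULT_END
+//
+// FUNCTION_CANCEL tx-1 and FUNCTION_PROGRESS tx-1 may arrive on stdin while the
+// handler runs, to cancel it or to extend its timeout.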
+ +// plugins.d sends these for functions (to external plugins or streaming children) +// related to STREAM_CAP_FUNCTIONS, STREAM_CAP_PROGRESS +#define PLUGINSD_CALL_FUNCTION "FUNCTION" // call a function to a plugin or remote host +#define PLUGINSD_CALL_FUNCTION_PAYLOAD_BEGIN "FUNCTION_PAYLOAD" // call a function with a payload +#define PLUGINSD_CALL_FUNCTION_PAYLOAD_END "FUNCTION_PAYLOAD_END" // function payload ends +#define PLUGINSD_CALL_FUNCTION_CANCEL "FUNCTION_CANCEL" // cancel a running function transaction +#define PLUGINSD_CALL_FUNCTION_PROGRESS "FUNCTION_PROGRESS" // let the function know the user is waiting + +// dyncfg +// enabled with STREAM_CAP_DYNCFG +#define PLUGINSD_KEYWORD_CONFIG "CONFIG" +#define PLUGINSD_KEYWORD_CONFIG_ACTION_CREATE "create" +#define PLUGINSD_KEYWORD_CONFIG_ACTION_DELETE "delete" +#define PLUGINSD_KEYWORD_CONFIG_ACTION_STATUS "status" +#define PLUGINSD_FUNCTION_CONFIG "config" + +typedef void (*functions_evloop_worker_execute_t)(const char *transaction, char *function, usec_t *stop_monotonic_ut, + bool *cancelled, BUFFER *payload, HTTP_ACCESS access, + const char *source, void *data); + +struct functions_evloop_worker_job; +struct functions_evloop_globals *functions_evloop_init(size_t worker_threads, const char *tag, netdata_mutex_t *stdout_mutex, bool *plugin_should_exit); +void functions_evloop_add_function(struct functions_evloop_globals *wg, const char *function, functions_evloop_worker_execute_t cb, time_t default_timeout, void *data); +void functions_evloop_cancel_threads(struct functions_evloop_globals *wg); + +#define FUNCTIONS_EXTENDED_TIME_ON_PROGRESS_UT (10 * USEC_PER_SEC) +static inline void functions_stop_monotonic_update_on_progress(usec_t *stop_monotonic_ut) { + usec_t now_ut = now_monotonic_usec(); + if(now_ut + FUNCTIONS_EXTENDED_TIME_ON_PROGRESS_UT > *stop_monotonic_ut) { + nd_log(NDLS_DAEMON, NDLP_DEBUG, "Extending function timeout due to PROGRESS update..."); + __atomic_store_n(stop_monotonic_ut, now_ut + FUNCTIONS_EXTENDED_TIME_ON_PROGRESS_UT, __ATOMIC_RELAXED); + } + else + nd_log(NDLS_DAEMON, NDLP_DEBUG, "Received PROGRESS update..."); +} + +#define pluginsd_function_result_begin_to_buffer(wb, transaction, code, content_type, expires) \ + buffer_sprintf(wb \ + , PLUGINSD_KEYWORD_FUNCTION_RESULT_BEGIN " \"%s\" %d \"%s\" %ld\n" \ + , (transaction) ? (transaction) : "" \ + , (int)(code) \ + , (content_type) ? (content_type) : "" \ + , (long int)(expires) \ + ) + +#define pluginsd_function_result_end_to_buffer(wb) \ + buffer_strcat(wb, "\n" PLUGINSD_KEYWORD_FUNCTION_RESULT_END "\n") + +#define pluginsd_function_result_begin_to_stdout(transaction, code, content_type, expires) \ + fprintf(stdout \ + , PLUGINSD_KEYWORD_FUNCTION_RESULT_BEGIN " \"%s\" %d \"%s\" %ld\n" \ + , (transaction) ? (transaction) : "" \ + , (int)(code) \ + , (content_type) ? 
(content_type) : "" \
+        , (long int)(expires) \
+        )
+
+#define pluginsd_function_result_end_to_stdout() \
+    fprintf(stdout, "\n" PLUGINSD_KEYWORD_FUNCTION_RESULT_END "\n")
+
+static inline void pluginsd_function_json_error_to_stdout(const char *transaction, int code, const char *msg) {
+    char buffer[PLUGINSD_LINE_MAX + 1];
+    json_escape_string(buffer, msg, PLUGINSD_LINE_MAX);
+
+    pluginsd_function_result_begin_to_stdout(transaction, code, "application/json", now_realtime_sec());
+    fprintf(stdout, "{\"status\":%d,\"error_message\":\"%s\"}", code, buffer);
+    pluginsd_function_result_end_to_stdout();
+    fflush(stdout);
+}
+
+static inline void pluginsd_function_result_to_stdout(const char *transaction, int code, const char *content_type, time_t expires, BUFFER *result) {
+    pluginsd_function_result_begin_to_stdout(transaction, code, content_type, expires);
+    fwrite(buffer_tostring(result), buffer_strlen(result), 1, stdout);
+    pluginsd_function_result_end_to_stdout();
+    fflush(stdout);
+}
+
+static inline void pluginsd_function_progress_to_stdout(const char *transaction, size_t done, size_t all) {
+    fprintf(stdout, PLUGINSD_KEYWORD_FUNCTION_PROGRESS " '%s' %zu %zu\n",
+            transaction, done, all);
+    fflush(stdout);
+}
+
+static inline void send_newline_and_flush(pthread_mutex_t *mutex) {
+    netdata_mutex_lock(mutex);
+    fprintf(stdout, "\n");
+    fflush(stdout);
+    netdata_mutex_unlock(mutex);
+}
+
+void functions_evloop_dyncfg_add(struct functions_evloop_globals *wg, const char *id, const char *path,
+                                 DYNCFG_STATUS status, DYNCFG_TYPE type, DYNCFG_SOURCE_TYPE source_type, const char *source, DYNCFG_CMDS cmds,
+                                 HTTP_ACCESS view_access, HTTP_ACCESS edit_access,
+                                 dyncfg_cb_t cb, void *data);
+
+void functions_evloop_dyncfg_del(struct functions_evloop_globals *wg, const char *id);
+void functions_evloop_dyncfg_status(struct functions_evloop_globals *wg, const char *id, DYNCFG_STATUS status);
+
+#endif //NETDATA_FUNCTIONS_EVLOOP_H
diff --git a/src/libnetdata/gorilla/README.md b/src/libnetdata/gorilla/README.md
new file mode 100644
index 00000000..dc3718d1
--- /dev/null
+++ b/src/libnetdata/gorilla/README.md
@@ -0,0 +1,39 @@
+# Gorilla compression and decompression
+
+This provides an alternative way of representing values stored in database
+pages. Instead of allocating and using a page of fixed size, e.g. 4096 bytes,
+the Gorilla implementation adds support for dynamically sized pages that
+contain a variable number of Gorilla buffers.
+
+Each buffer takes 512 bytes and compresses incoming data using Gorilla
+compression:
+
+- The very first value is stored as it is.
+- For each new value, Gorilla compression doesn't store the value itself. Instead,
+it computes the difference (XOR) between the new value and the previous value.
+- If the XOR result is zero (meaning the new value is identical to the previous
+value), we store just a single bit set to `1`.
+- If the XOR result is not zero (meaning the new value differs from the previous):
+  - We store a `0` bit to indicate the change.
+  - We compute the leading-zero count (LZC) of the XOR result and compare it
+    with the previous LZC. If the two LZCs are equal we store a `1` bit;
+    if they differ we store a `0` bit followed by the new LZC in 5 bits.
+  - In both cases we then store the rest of the XOR result (i.e. without its
+    leading zeros) in the buffer.
+
+A Gorilla page can have multiple Gorilla buffers. If the values of a metric
+are highly compressible, just one Gorilla buffer is able to store all the values
+that otherwise would require a regular 4096 byte page, i.e.
we can use just 512
+bytes instead. In the worst-case scenario (for metrics whose values are not
+compressible at all), a Gorilla page might end up having `9` Gorilla buffers,
+consuming 4608 bytes. In practice, this is pretty rare and does not negate
+the effect of compression for the metrics.
+
+When a Gorilla page is full, i.e. it contains 1024 slots/values, we serialize
+the linked list of Gorilla buffers directly to disk. During deserialization,
+e.g. when performing a DBEngine query, the Gorilla page is loaded from disk and
+its linked-list entries are patched to point to the new memory allocated for
+serving the query results.
+
+Overall, on a real agent the Gorilla compression scheme reduces memory
+consumption by approximately 30%, which can be several GiB of RAM for parents
+having hundreds or even thousands of children streaming to them.
diff --git a/src/libnetdata/gorilla/benchmark.sh b/src/libnetdata/gorilla/benchmark.sh
new file mode 100755
index 00000000..a5d11143
--- /dev/null
+++ b/src/libnetdata/gorilla/benchmark.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+
+set -exu -o pipefail
+
+clang++ \
+    -std=c++11 -Wall -Wextra \
+    -DENABLE_BENCHMARK -O2 -g \
+    -lbenchmark -lbenchmark_main \
+    -o gorilla_benchmark gorilla.cc
+
+./gorilla_benchmark
diff --git a/src/libnetdata/gorilla/fuzzer.sh b/src/libnetdata/gorilla/fuzzer.sh
new file mode 100755
index 00000000..19098a61
--- /dev/null
+++ b/src/libnetdata/gorilla/fuzzer.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+
+set -exu -o pipefail
+
+clang++ \
+    -std=c++11 -Wall -Wextra \
+    -DENABLE_FUZZER -O2 -g \
+    -fsanitize=fuzzer \
+    -o gorilla_fuzzer gorilla.cc
+
+./gorilla_fuzzer -workers=12 -jobs=16
diff --git a/src/libnetdata/gorilla/gorilla.cc b/src/libnetdata/gorilla/gorilla.cc
new file mode 100644
index 00000000..c7601836
--- /dev/null
+++ b/src/libnetdata/gorilla/gorilla.cc
@@ -0,0 +1,522 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "gorilla.h"
+
+#include <cassert>
+#include <climits>
+#include <cstdio>
+#include <cstring>
+
+using std::size_t;
+
+template <typename T>
+static constexpr size_t bit_size() noexcept
+{
+    static_assert((sizeof(T) * CHAR_BIT) == 32 || (sizeof(T) * CHAR_BIT) == 64,
+                  "Word size should be 32 or 64 bits.");
+    return (sizeof(T) * CHAR_BIT);
+}
+
+static void bit_buffer_write(uint32_t *buf, size_t pos, uint32_t v, size_t nbits)
+{
+    assert(nbits > 0 && nbits <= bit_size<uint32_t>());
+
+    const size_t index = pos / bit_size<uint32_t>();
+    const size_t offset = pos % bit_size<uint32_t>();
+
+    pos += nbits;
+
+    if (offset == 0) {
+        buf[index] = v;
+    } else {
+        const size_t remaining_bits = bit_size<uint32_t>() - offset;
+
+        // write the lower part of the value
+        const uint32_t low_bits_mask = ((uint32_t) 1 << remaining_bits) - 1;
+        const uint32_t lowest_bits_in_value = v & low_bits_mask;
+        buf[index] |= (lowest_bits_in_value << offset);
+
+        if (nbits > remaining_bits) {
+            // write the upper part of the value
+            const uint32_t high_bits_mask = ~low_bits_mask;
+            const uint32_t highest_bits_in_value = (v & high_bits_mask) >> (remaining_bits);
+            buf[index + 1] = highest_bits_in_value;
+        }
+    }
+}
+
+static void bit_buffer_read(const uint32_t *buf, size_t pos, uint32_t *v, size_t nbits)
+{
+    assert(nbits > 0 && nbits <= bit_size<uint32_t>());
+
+    const size_t index = pos / bit_size<uint32_t>();
+    const size_t offset = pos % bit_size<uint32_t>();
+
+    pos += nbits;
+
+    if (offset == 0) {
+        *v = (nbits
== bit_size<uint32_t>()) ? + buf[index] : + buf[index] & (((uint32_t) 1 << nbits) - 1); + } else { + const size_t remaining_bits = bit_size<uint32_t>() - offset; + + // extract the lower part of the value + if (nbits < remaining_bits) { + *v = (buf[index] >> offset) & (((uint32_t) 1 << nbits) - 1); + } else { + *v = (buf[index] >> offset) & (((uint32_t) 1 << remaining_bits) - 1); + nbits -= remaining_bits; + *v |= (buf[index + 1] & (((uint32_t) 1 << nbits) - 1)) << remaining_bits; + } + } +} + +gorilla_writer_t gorilla_writer_init(gorilla_buffer_t *gbuf, size_t n) +{ + gorilla_writer_t gw = gorilla_writer_t { + .head_buffer = gbuf, + .last_buffer = NULL, + .prev_number = 0, + .prev_xor_lzc = 0, + .capacity = 0 + }; + + gorilla_writer_add_buffer(&gw, gbuf, n); + return gw; +} + +void gorilla_writer_add_buffer(gorilla_writer_t *gw, gorilla_buffer_t *gbuf, size_t n) +{ + gbuf->header.next = NULL; + gbuf->header.entries = 0; + gbuf->header.nbits = 0; + + uint32_t capacity = (n * bit_size<uint32_t>()) - (sizeof(gorilla_header_t) * CHAR_BIT); + + gw->prev_number = 0; + gw->prev_xor_lzc = 0; + gw->capacity = capacity; + + if (gw->last_buffer) + gw->last_buffer->header.next = gbuf; + + __atomic_store_n(&gw->last_buffer, gbuf, __ATOMIC_RELAXED); +} + +uint32_t gorilla_writer_entries(const gorilla_writer_t *gw) { + uint32_t entries = 0; + + const gorilla_buffer_t *curr_gbuf = __atomic_load_n(&gw->head_buffer, __ATOMIC_SEQ_CST); + do { + const gorilla_buffer_t *next_gbuf = __atomic_load_n(&curr_gbuf->header.next, __ATOMIC_SEQ_CST); + + entries += __atomic_load_n(&curr_gbuf->header.entries, __ATOMIC_SEQ_CST); + + curr_gbuf = next_gbuf; + } while (curr_gbuf); + + return entries; +} + +bool gorilla_writer_write(gorilla_writer_t *gw, uint32_t number) +{ + gorilla_header_t *hdr = &gw->last_buffer->header; + uint32_t *data = gw->last_buffer->data; + + // this is the first number we are writing + if (hdr->entries == 0) { + if (hdr->nbits + bit_size<uint32_t>() >= gw->capacity) + return false; + bit_buffer_write(data, hdr->nbits, number, bit_size<uint32_t>()); + + __atomic_fetch_add(&hdr->nbits, bit_size<uint32_t>(), __ATOMIC_RELAXED); + __atomic_fetch_add(&hdr->entries, 1, __ATOMIC_RELAXED); + gw->prev_number = number; + return true; + } + + // write true/false based on whether we got the same number or not. + if (number == gw->prev_number) { + if (hdr->nbits + 1 >= gw->capacity) + return false; + + bit_buffer_write(data, hdr->nbits, static_cast<uint32_t>(1), 1); + __atomic_fetch_add(&hdr->nbits, 1, __ATOMIC_RELAXED); + __atomic_fetch_add(&hdr->entries, 1, __ATOMIC_RELAXED); + return true; + } + + if (hdr->nbits + 1 >= gw->capacity) + return false; + bit_buffer_write(data, hdr->nbits, static_cast<uint32_t>(0), 1); + __atomic_fetch_add(&hdr->nbits, 1, __ATOMIC_RELAXED); + + uint32_t xor_value = gw->prev_number ^ number; + uint32_t xor_lzc = (bit_size<uint32_t>() == 32) ? __builtin_clz(xor_value) : __builtin_clzll(xor_value); + uint32_t is_xor_lzc_same = (xor_lzc == gw->prev_xor_lzc) ? 1 : 0; + + if (hdr->nbits + 1 >= gw->capacity) + return false; + bit_buffer_write(data, hdr->nbits, is_xor_lzc_same, 1); + __atomic_fetch_add(&hdr->nbits, 1, __ATOMIC_RELAXED); + + if (!is_xor_lzc_same) { + if (hdr->nbits + 1 >= gw->capacity) + return false; + bit_buffer_write(data, hdr->nbits, xor_lzc, (bit_size<uint32_t>() == 32) ? 5 : 6); + __atomic_fetch_add(&hdr->nbits, (bit_size<uint32_t>() == 32) ? 
5 : 6, __ATOMIC_RELAXED); + } + + // write the bits of the XOR'd value without the LZC prefix + if (hdr->nbits + (bit_size<uint32_t>() - xor_lzc) >= gw->capacity) + return false; + bit_buffer_write(data, hdr->nbits, xor_value, bit_size<uint32_t>() - xor_lzc); + __atomic_fetch_add(&hdr->nbits, bit_size<uint32_t>() - xor_lzc, __ATOMIC_RELAXED); + __atomic_fetch_add(&hdr->entries, 1, __ATOMIC_RELAXED); + + gw->prev_number = number; + gw->prev_xor_lzc = xor_lzc; + return true; +} + +gorilla_buffer_t *gorilla_writer_drop_head_buffer(gorilla_writer_t *gw) { + if (!gw->head_buffer) + return NULL; + + gorilla_buffer_t *curr_head = gw->head_buffer; + gorilla_buffer_t *next_head = gw->head_buffer->header.next; + __atomic_store_n(&gw->head_buffer, next_head, __ATOMIC_RELAXED); + return curr_head; +} + +uint32_t gorilla_writer_nbytes(const gorilla_writer_t *gw) +{ + uint32_t nbits = 0; + + const gorilla_buffer_t *curr_gbuf = __atomic_load_n(&gw->head_buffer, __ATOMIC_SEQ_CST); + do { + const gorilla_buffer_t *next_gbuf = __atomic_load_n(&curr_gbuf->header.next, __ATOMIC_SEQ_CST); + + nbits += __atomic_load_n(&curr_gbuf->header.nbits, __ATOMIC_SEQ_CST); + + curr_gbuf = next_gbuf; + } while (curr_gbuf); + + return (nbits + (CHAR_BIT - 1)) / CHAR_BIT; +} + +bool gorilla_writer_serialize(const gorilla_writer_t *gw, uint8_t *dst, uint32_t dst_size) { + const gorilla_buffer_t *curr_gbuf = gw->head_buffer; + + do { + const gorilla_buffer_t *next_gbuf = curr_gbuf->header.next; + + size_t bytes = RRDENG_GORILLA_32BIT_BUFFER_SIZE; + if (bytes > dst_size) + return false; + + memcpy(dst, curr_gbuf, bytes); + dst += bytes; + dst_size -= bytes; + + curr_gbuf = next_gbuf; + } while (curr_gbuf); + + return true; +} + +uint32_t gorilla_buffer_patch(gorilla_buffer_t *gbuf) { + gorilla_buffer_t *curr_gbuf = gbuf; + uint32_t n = curr_gbuf->header.entries; + + while (curr_gbuf->header.next) { + uint32_t *buf = reinterpret_cast<uint32_t *>(gbuf); + gbuf = reinterpret_cast<gorilla_buffer_t *>(&buf[RRDENG_GORILLA_32BIT_BUFFER_SLOTS]); + + assert(((uintptr_t) (gbuf) % sizeof(uintptr_t)) == 0 && + "Gorilla buffer not aligned to uintptr_t"); + + curr_gbuf->header.next = gbuf; + curr_gbuf = curr_gbuf->header.next; + + n += curr_gbuf->header.entries; + } + + return n; +} + +gorilla_reader_t gorilla_writer_get_reader(const gorilla_writer_t *gw) +{ + const gorilla_buffer_t *buffer = __atomic_load_n(&gw->head_buffer, __ATOMIC_SEQ_CST); + + uint32_t entries = __atomic_load_n(&buffer->header.entries, __ATOMIC_SEQ_CST); + uint32_t capacity = __atomic_load_n(&buffer->header.nbits, __ATOMIC_SEQ_CST); + + return gorilla_reader_t { + .buffer = buffer, + .entries = entries, + .index = 0, + .capacity = capacity, + .position = 0, + .prev_number = 0, + .prev_xor_lzc = 0, + .prev_xor = 0, + }; +} + +gorilla_reader_t gorilla_reader_init(gorilla_buffer_t *gbuf) +{ + uint32_t entries = __atomic_load_n(&gbuf->header.entries, __ATOMIC_SEQ_CST); + uint32_t capacity = __atomic_load_n(&gbuf->header.nbits, __ATOMIC_SEQ_CST); + + return gorilla_reader_t { + .buffer = gbuf, + .entries = entries, + .index = 0, + .capacity = capacity, + .position = 0, + .prev_number = 0, + .prev_xor_lzc = 0, + .prev_xor = 0, + }; +} + +bool gorilla_reader_read(gorilla_reader_t *gr, uint32_t *number) +{ + const uint32_t *data = gr->buffer->data; + + if (gr->index + 1 > gr->entries) { + // We don't have any more entries to return. However, the writer + // might have updated the buffer's entries. We need to check once + // more in case more elements were added. 
gr->entries = __atomic_load_n(&gr->buffer->header.entries, __ATOMIC_SEQ_CST);
+        gr->capacity = __atomic_load_n(&gr->buffer->header.nbits, __ATOMIC_SEQ_CST);
+
+        // if the reader's current buffer has not been updated, we need to
+        // check if it has a pointer to a next buffer.
+        if (gr->index + 1 > gr->entries) {
+            gorilla_buffer_t *next_buffer = __atomic_load_n(&gr->buffer->header.next, __ATOMIC_SEQ_CST);
+
+            if (!next_buffer) {
+                // fprintf(stderr, "Consumed reader with %zu entries from buffer %p\n (No more buffers to read from)", gr->entries, gr->buffer);
+                return false;
+            }
+
+            // fprintf(stderr, "Consumed reader with %zu entries from buffer %p\n", gr->entries, gr->buffer);
+            *gr = gorilla_reader_init(next_buffer);
+            return gorilla_reader_read(gr, number);
+        }
+    }
+
+    // read the first number
+    if (gr->index == 0) {
+        bit_buffer_read(data, gr->position, number, bit_size<uint32_t>());
+
+        gr->index++;
+        gr->position += bit_size<uint32_t>();
+        gr->prev_number = *number;
+        return true;
+    }
+
+    // process same-number bit
+    uint32_t is_same_number;
+    bit_buffer_read(data, gr->position, &is_same_number, 1);
+    gr->position++;
+
+    if (is_same_number) {
+        *number = gr->prev_number;
+        gr->index++;
+        return true;
+    }
+
+    // process same-xor-lzc bit
+    uint32_t xor_lzc = gr->prev_xor_lzc;
+
+    uint32_t same_xor_lzc;
+    bit_buffer_read(data, gr->position, &same_xor_lzc, 1);
+    gr->position++;
+
+    if (!same_xor_lzc) {
+        bit_buffer_read(data, gr->position, &xor_lzc, (bit_size<uint32_t>() == 32) ? 5 : 6);
+        gr->position += (bit_size<uint32_t>() == 32) ? 5 : 6;
+    }
+
+    // process the non-lzc suffix
+    uint32_t xor_value = 0;
+    bit_buffer_read(data, gr->position, &xor_value, bit_size<uint32_t>() - xor_lzc);
+    gr->position += bit_size<uint32_t>() - xor_lzc;
+
+    *number = (gr->prev_number ^ xor_value);
+
+    gr->index++;
+    gr->prev_number = *number;
+    gr->prev_xor_lzc = xor_lzc;
+    gr->prev_xor = xor_value;
+
+    return true;
+}
+
+/*
+ * Internal code used for fuzzing the library
+*/
+
+#ifdef ENABLE_FUZZER
+
+#include <vector>
+
+template<typename Word>
+static std::vector<Word> random_vector(const uint8_t *data, size_t size) {
+    std::vector<Word> V;
+
+    V.reserve(1024);
+
+    while (size >= sizeof(Word)) {
+        size -= sizeof(Word);
+
+        Word w;
+        memcpy(&w, &data[size], sizeof(Word));
+        V.push_back(w);
+    }
+
+    return V;
+}
+
+class Storage {
+public:
+    gorilla_buffer_t *alloc_buffer(size_t words) {
+        uint32_t *new_buffer = new uint32_t[words]();
+        assert(((((uintptr_t) new_buffer) % 8u) == 0) && "Unaligned buffer...");
+        Buffers.push_back(new_buffer);
+        return reinterpret_cast<gorilla_buffer_t *>(new_buffer);
+    }
+
+    void free_buffers() {
+        for (uint32_t *buffer : Buffers) {
+            delete[] buffer;
+        }
+    }
+
+private:
+    std::vector<uint32_t *> Buffers;
+};
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+    if (Size < 4)
+        return 0;
+
+    std::vector<uint32_t> RandomData = random_vector<uint32_t>(Data, Size);
+
+    Storage S;
+    size_t words_per_buffer = 8;
+
+    /*
+     * write data
+    */
+    gorilla_buffer_t *first_buffer = S.alloc_buffer(words_per_buffer);
+    gorilla_writer_t gw = gorilla_writer_init(first_buffer, words_per_buffer);
+
+    for (size_t i = 0; i != RandomData.size(); i++) {
+        bool ok = gorilla_writer_write(&gw, RandomData[i]);
+        if (ok)
+            continue;
+
+        // add new buffer
+        gorilla_buffer_t *buffer = S.alloc_buffer(words_per_buffer);
+        gorilla_writer_add_buffer(&gw, buffer, words_per_buffer);
+
+        ok = gorilla_writer_write(&gw, RandomData[i]);
+        assert(ok && "Could not write data to new
buffer!!!"); + } + + + /* + * read data + */ + gorilla_reader_t gr = gorilla_writer_get_reader(&gw); + + for (size_t i = 0; i != RandomData.size(); i++) { + uint32_t number = 0; + bool ok = gorilla_reader_read(&gr, &number); + assert(ok && "Failed to read number from gorilla buffer"); + + assert((number == RandomData[i]) + && "Read wrong number from gorilla buffer"); + } + + S.free_buffers(); + return 0; +} + +#endif /* ENABLE_FUZZER */ + +#ifdef ENABLE_BENCHMARK + +#include <benchmark/benchmark.h> +#include <random> + +static size_t NumItems = 1024; + +static void BM_EncodeU32Numbers(benchmark::State& state) { + std::random_device rd; + std::mt19937 mt(rd()); + std::uniform_int_distribution<uint32_t> dist(0x0, 0x0000FFFF); + + std::vector<uint32_t> RandomData; + for (size_t idx = 0; idx != NumItems; idx++) { + RandomData.push_back(dist(mt)); + } + std::vector<uint32_t> EncodedData(10 * RandomData.capacity(), 0); + + for (auto _ : state) { + gorilla_writer_t gw = gorilla_writer_init( + reinterpret_cast<gorilla_buffer_t *>(EncodedData.data()), + EncodedData.size()); + + for (size_t i = 0; i != RandomData.size(); i++) + benchmark::DoNotOptimize(gorilla_writer_write(&gw, RandomData[i])); + + benchmark::ClobberMemory(); + } + + state.SetItemsProcessed(NumItems * state.iterations()); + state.SetBytesProcessed(NumItems * state.iterations() * sizeof(uint32_t)); +} +BENCHMARK(BM_EncodeU32Numbers)->ThreadRange(1, 16)->UseRealTime(); + +static void BM_DecodeU32Numbers(benchmark::State& state) { + std::random_device rd; + std::mt19937 mt(rd()); + std::uniform_int_distribution<uint32_t> dist(0x0, 0xFFFFFFFF); + + std::vector<uint32_t> RandomData; + for (size_t idx = 0; idx != NumItems; idx++) { + RandomData.push_back(dist(mt)); + } + std::vector<uint32_t> EncodedData(10 * RandomData.capacity(), 0); + std::vector<uint32_t> DecodedData(10 * RandomData.capacity(), 0); + + gorilla_writer_t gw = gorilla_writer_init( + reinterpret_cast<gorilla_buffer_t *>(EncodedData.data()), + EncodedData.size()); + + for (size_t i = 0; i != RandomData.size(); i++) + gorilla_writer_write(&gw, RandomData[i]); + + for (auto _ : state) { + gorilla_reader_t gr = gorilla_reader_init(reinterpret_cast<gorilla_buffer_t *>(EncodedData.data())); + + for (size_t i = 0; i != RandomData.size(); i++) { + uint32_t number = 0; + benchmark::DoNotOptimize(gorilla_reader_read(&gr, &number)); + } + + benchmark::ClobberMemory(); + } + + state.SetItemsProcessed(NumItems * state.iterations()); + state.SetBytesProcessed(NumItems * state.iterations() * sizeof(uint32_t)); +} +BENCHMARK(BM_DecodeU32Numbers)->ThreadRange(1, 16)->UseRealTime(); + +#endif /* ENABLE_BENCHMARK */ diff --git a/src/libnetdata/gorilla/gorilla.h b/src/libnetdata/gorilla/gorilla.h new file mode 100644 index 00000000..7975d85e --- /dev/null +++ b/src/libnetdata/gorilla/gorilla.h @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef GORILLA_H +#define GORILLA_H + +#include <stdbool.h> +#include <stdint.h> +#include <stddef.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct gorilla_buffer; + +typedef struct { + struct gorilla_buffer *next; + uint32_t entries; + uint32_t nbits; +} gorilla_header_t; + +typedef struct gorilla_buffer { + gorilla_header_t header; + uint32_t data[]; +} gorilla_buffer_t; + +typedef struct { + gorilla_buffer_t *head_buffer; + gorilla_buffer_t *last_buffer; + + uint32_t prev_number; + uint32_t prev_xor_lzc; + + // in bits + uint32_t capacity; +} gorilla_writer_t; + +typedef struct { + const gorilla_buffer_t *buffer; + + // 
+
+    // number of values
+    size_t entries;
+    size_t index;
+
+    // in bits
+    size_t capacity; // FIXME: this is not needed on the reader's side
+    size_t position;
+
+    uint32_t prev_number;
+    uint32_t prev_xor_lzc;
+    uint32_t prev_xor;
+} gorilla_reader_t;
+
+gorilla_writer_t gorilla_writer_init(gorilla_buffer_t *gbuf, size_t n);
+void gorilla_writer_add_buffer(gorilla_writer_t *gw, gorilla_buffer_t *gbuf, size_t n);
+bool gorilla_writer_write(gorilla_writer_t *gw, uint32_t number);
+uint32_t gorilla_writer_entries(const gorilla_writer_t *gw);
+
+gorilla_reader_t gorilla_writer_get_reader(const gorilla_writer_t *gw);
+
+gorilla_buffer_t *gorilla_writer_drop_head_buffer(gorilla_writer_t *gw);
+
+uint32_t gorilla_writer_nbytes(const gorilla_writer_t *gw);
+bool gorilla_writer_serialize(const gorilla_writer_t *gw, uint8_t *dst, uint32_t dst_size);
+
+uint32_t gorilla_buffer_patch(gorilla_buffer_t *buf);
+gorilla_reader_t gorilla_reader_init(gorilla_buffer_t *buf);
+bool gorilla_reader_read(gorilla_reader_t *gr, uint32_t *number);
+
+#define RRDENG_GORILLA_32BIT_BUFFER_SLOTS 128
+#define RRDENG_GORILLA_32BIT_BUFFER_SIZE (RRDENG_GORILLA_32BIT_BUFFER_SLOTS * sizeof(uint32_t))
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* GORILLA_H */
diff --git a/src/libnetdata/http/content_type.c b/src/libnetdata/http/content_type.c
new file mode 100644
index 00000000..3e388a1d
--- /dev/null
+++ b/src/libnetdata/http/content_type.c
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "content_type.h"
+
+
+static struct {
+    const char *format;
+    HTTP_CONTENT_TYPE content_type;
+    bool needs_charset;
+    const char *options;
+} content_types[] = {
+    // primary - preferred during id-to-string conversions
+    { .format = "text/html", CT_TEXT_HTML, true },
+    { .format = "text/plain", CT_TEXT_PLAIN, true },
+    { .format = "text/css", CT_TEXT_CSS, true },
+    { .format = "text/yaml", CT_TEXT_YAML, true },
+    { .format = "application/yaml", CT_APPLICATION_YAML, true },
+    { .format = "text/xml", CT_TEXT_XML, true },
+    { .format = "text/xsl", CT_TEXT_XSL, true },
+    { .format = "application/json", CT_APPLICATION_JSON, true },
+    { .format = "application/xml", CT_APPLICATION_XML, true },
+    { .format = "application/javascript", CT_APPLICATION_X_JAVASCRIPT, true },
+    { .format = "application/octet-stream", CT_APPLICATION_OCTET_STREAM, false },
+    { .format = "image/svg+xml", CT_IMAGE_SVG_XML, false },
+    { .format = "application/x-font-truetype", CT_APPLICATION_X_FONT_TRUETYPE, false },
+    { .format = "application/x-font-opentype", CT_APPLICATION_X_FONT_OPENTYPE, false },
+    { .format = "application/font-woff", CT_APPLICATION_FONT_WOFF, false },
+    { .format = "application/font-woff2", CT_APPLICATION_FONT_WOFF2, false },
+    { .format = "application/vnd.ms-fontobject",CT_APPLICATION_VND_MS_FONTOBJ, false },
+    { .format = "image/png", CT_IMAGE_PNG, false },
+    { .format = "image/jpeg", CT_IMAGE_JPG, false },
+    { .format = "image/gif", CT_IMAGE_GIF, false },
+    { .format = "image/x-icon", CT_IMAGE_XICON, false },
+    { .format = "image/bmp", CT_IMAGE_BMP, false },
+    { .format = "image/icns", CT_IMAGE_ICNS, false },
+    { .format = "audio/mpeg", CT_AUDIO_MPEG, false },
+    { .format = "audio/ogg", CT_AUDIO_OGG, false },
+    { .format = "video/mp4", CT_VIDEO_MP4, false },
+    { .format = "application/pdf", CT_APPLICATION_PDF, false },
+    { .format = "application/zip", CT_APPLICATION_ZIP, false },
+
+    // secondary - overlapping with primary
+
+    { .format = "text/plain", CT_PROMETHEUS, false, 
"version=0.0.4" }, + { .format = "prometheus", CT_PROMETHEUS }, + { .format = "text", CT_TEXT_PLAIN }, + { .format = "txt", CT_TEXT_PLAIN }, + { .format = "json", CT_APPLICATION_JSON }, + { .format = "html", CT_TEXT_HTML }, + { .format = "xml", CT_APPLICATION_XML }, + + // terminator + { .format = NULL, CT_TEXT_PLAIN }, +}; + +HTTP_CONTENT_TYPE content_type_string2id(const char *format) { + if(format && *format) { + for (int i = 0; content_types[i].format; i++) + if (strcmp(content_types[i].format, format) == 0) + return content_types[i].content_type; + } + + return CT_TEXT_PLAIN; +} + +const char *content_type_id2string(HTTP_CONTENT_TYPE content_type) { + for (int i = 0; content_types[i].format; i++) + if (content_types[i].content_type == content_type) + return content_types[i].format; + + return "text/plain"; +} + +void http_header_content_type(BUFFER *wb, HTTP_CONTENT_TYPE content_type) { + buffer_strcat(wb, "Content-Type: "); + + for (int i = 0; content_types[i].format; i++) { + if (content_types[i].content_type == content_type) { + buffer_strcat(wb, content_types[i].format); + + if(content_types[i].needs_charset) { + buffer_strcat(wb, "; charset=utf-8"); + } + if(content_types[i].options) { + buffer_strcat(wb, "; "); + buffer_strcat(wb, content_types[i].options); + } + + buffer_strcat(wb, "\r\n"); + + return; + } + } + + buffer_strcat(wb, "text/plain; charset=utf-8\r\n"); +} diff --git a/src/libnetdata/http/content_type.h b/src/libnetdata/http/content_type.h new file mode 100644 index 00000000..b982494d --- /dev/null +++ b/src/libnetdata/http/content_type.h @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_CONTENT_TYPE_H +#define NETDATA_CONTENT_TYPE_H + +typedef enum __attribute__ ((__packed__)) { + CT_NONE = 0, + CT_APPLICATION_JSON, + CT_TEXT_PLAIN, + CT_TEXT_HTML, + CT_APPLICATION_X_JAVASCRIPT, + CT_TEXT_CSS, + CT_TEXT_XML, + CT_APPLICATION_XML, + CT_TEXT_XSL, + CT_APPLICATION_OCTET_STREAM, + CT_APPLICATION_X_FONT_TRUETYPE, + CT_APPLICATION_X_FONT_OPENTYPE, + CT_APPLICATION_FONT_WOFF, + CT_APPLICATION_FONT_WOFF2, + CT_APPLICATION_VND_MS_FONTOBJ, + CT_IMAGE_SVG_XML, + CT_IMAGE_PNG, + CT_IMAGE_JPG, + CT_IMAGE_GIF, + CT_IMAGE_XICON, + CT_IMAGE_ICNS, + CT_IMAGE_BMP, + CT_PROMETHEUS, + CT_AUDIO_MPEG, + CT_AUDIO_OGG, + CT_VIDEO_MP4, + CT_APPLICATION_PDF, + CT_APPLICATION_ZIP, + CT_TEXT_YAML, + CT_APPLICATION_YAML, +} HTTP_CONTENT_TYPE; + +HTTP_CONTENT_TYPE content_type_string2id(const char *format); +const char *content_type_id2string(HTTP_CONTENT_TYPE content_type); + +#include "../libnetdata.h" + +void http_header_content_type(struct web_buffer *wb, HTTP_CONTENT_TYPE type); + +#endif //NETDATA_CONTENT_TYPE_H diff --git a/src/libnetdata/http/http_access.c b/src/libnetdata/http/http_access.c new file mode 100644 index 00000000..5be63bb1 --- /dev/null +++ b/src/libnetdata/http/http_access.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +static struct { + HTTP_USER_ROLE access; + const char *name; +} user_roles[] = { + { .access = HTTP_USER_ROLE_NONE, .name = "none" }, + { .access = HTTP_USER_ROLE_ADMIN, .name = "admin" }, + { .access = HTTP_USER_ROLE_MANAGER, .name = "manager" }, + { .access = HTTP_USER_ROLE_TROUBLESHOOTER, .name = "troubleshooter" }, + { .access = HTTP_USER_ROLE_OBSERVER, .name = "observer" }, + { .access = HTTP_USER_ROLE_MEMBER, .name = "member" }, + { .access = HTTP_USER_ROLE_BILLING, .name = "billing" }, + { .access = HTTP_USER_ROLE_ANY, .name = "any" }, + + { .access = 
HTTP_USER_ROLE_MEMBER, .name = "members" }, + { .access = HTTP_USER_ROLE_ADMIN, .name = "admins" }, + { .access = HTTP_USER_ROLE_ANY, .name = "all" }, + + // terminator + { .access = 0, .name = NULL }, +}; + +HTTP_USER_ROLE http_user_role2id(const char *role) { + if(!role || !*role) + return HTTP_USER_ROLE_MEMBER; + + for(size_t i = 0; user_roles[i].name ;i++) { + if(strcmp(user_roles[i].name, role) == 0) + return user_roles[i].access; + } + + nd_log(NDLS_DAEMON, NDLP_WARNING, "HTTP user role '%s' is not valid", role); + return HTTP_USER_ROLE_NONE; +} + +const char *http_id2user_role(HTTP_USER_ROLE role) { + for(size_t i = 0; user_roles[i].name ;i++) { + if(role == user_roles[i].access) + return user_roles[i].name; + } + + nd_log(NDLS_DAEMON, NDLP_WARNING, "HTTP user role %d is not valid", role); + return "none"; +} + +// -------------------------------------------------------------------------------------------------------------------- + +static struct { + const char *name; + uint32_t hash; + HTTP_ACCESS value; +} http_accesses[] = { + {"none" , 0 , HTTP_ACCESS_NONE} + , {"signed-in" , 0 , HTTP_ACCESS_SIGNED_ID} + , {"same-space" , 0 , HTTP_ACCESS_SAME_SPACE} + , {"commercial" , 0 , HTTP_ACCESS_COMMERCIAL_SPACE} + , {"anonymous-data" , 0 , HTTP_ACCESS_ANONYMOUS_DATA} + , {"sensitive-data" , 0 , HTTP_ACCESS_SENSITIVE_DATA} + , {"view-config" , 0 , HTTP_ACCESS_VIEW_AGENT_CONFIG} + , {"edit-config" , 0 , HTTP_ACCESS_EDIT_AGENT_CONFIG} + , {"view-notifications-config" , 0 , HTTP_ACCESS_VIEW_NOTIFICATIONS_CONFIG} + , {"edit-notifications-config" , 0 , HTTP_ACCESS_EDIT_NOTIFICATIONS_CONFIG} + , {"view-alerts-silencing" , 0 , HTTP_ACCESS_VIEW_ALERTS_SILENCING} + , {"edit-alerts-silencing" , 0 , HTTP_ACCESS_EDIT_ALERTS_SILENCING} + + , {NULL , 0 , 0} +}; + +inline HTTP_ACCESS http_access2id_one(const char *str) { + HTTP_ACCESS ret = 0; + + if(!str || !*str) return ret; + + uint32_t hash = simple_hash(str); + int i; + for(i = 0; http_accesses[i].name ; i++) { + if(unlikely(!http_accesses[i].hash)) + http_accesses[i].hash = simple_hash(http_accesses[i].name); + + if (unlikely(hash == http_accesses[i].hash && !strcmp(str, http_accesses[i].name))) { + ret |= http_accesses[i].value; + break; + } + } + + return ret; +} + +inline HTTP_ACCESS http_access2id(char *str) { + HTTP_ACCESS ret = 0; + char *tok; + + while(str && *str && (tok = strsep_skip_consecutive_separators(&str, ", |"))) { + if(!*tok) continue; + ret |= http_access2id_one(tok); + } + + return ret; +} + +void http_access2buffer_json_array(BUFFER *wb, const char *key, HTTP_ACCESS access) { + buffer_json_member_add_array(wb, key); + + HTTP_ACCESS used = 0; // to prevent adding duplicates + for(int i = 0; http_accesses[i].name ; i++) { + if (unlikely((http_accesses[i].value & access) && !(http_accesses[i].value & used))) { + const char *name = http_accesses[i].name; + used |= http_accesses[i].value; + + buffer_json_add_array_item_string(wb, name); + } + } + + buffer_json_array_close(wb); +} + +void http_access2txt(char *buf, size_t size, const char *separator, HTTP_ACCESS access) { + char *write = buf; + char *end = &buf[size - 1]; + + HTTP_ACCESS used = 0; // to prevent adding duplicates + int added = 0; + for(int i = 0; http_accesses[i].name ; i++) { + if (unlikely((http_accesses[i].value & access) && !(http_accesses[i].value & used))) { + const char *name = http_accesses[i].name; + used |= http_accesses[i].value; + + if(added && write < end) { + const char *s = separator; + while(*s && write < end) + *write++ = *s++; + } + + while(*name && 
write < end) + *write++ = *name++; + + added++; + } + } + *write = *end = '\0'; +} + +HTTP_ACCESS http_access_from_hex_mapping_old_roles(const char *str) { + if(!str || !*str) + return HTTP_ACCESS_NONE; + + if(strcmp(str, "any") == 0 || strcmp(str, "all") == 0) + return HTTP_ACCESS_MAP_OLD_ANY; + + if(strcmp(str, "member") == 0 || strcmp(str, "members") == 0) + return HTTP_ACCESS_MAP_OLD_MEMBER; + + else if(strcmp(str, "admin") == 0 || strcmp(str, "admins") == 0) + return HTTP_ACCESS_MAP_OLD_ADMIN; + + return (HTTP_ACCESS)strtoull(str, NULL, 16) & HTTP_ACCESS_ALL; +} + +HTTP_ACCESS http_access_from_hex(const char *str) { + if(!str || !*str) + return HTTP_ACCESS_NONE; + + return (HTTP_ACCESS)strtoull(str, NULL, 16) & HTTP_ACCESS_ALL; +} + +HTTP_ACCESS http_access_from_source(const char *str) { + if(!str || !*str) + return HTTP_ACCESS_NONE; + + HTTP_ACCESS access = HTTP_ACCESS_NONE; + + const char *permissions = strstr(str, "permissions="); + if(permissions) + access = (HTTP_ACCESS)strtoull(permissions + 12, NULL, 16) & HTTP_ACCESS_ALL; + + return access; +} + +bool log_cb_http_access_to_hex(BUFFER *wb, void *data) { + HTTP_ACCESS access = *((HTTP_ACCESS *)data); + buffer_sprintf(wb, HTTP_ACCESS_FORMAT, (HTTP_ACCESS_FORMAT_CAST)access); + return true; +} diff --git a/src/libnetdata/http/http_access.h b/src/libnetdata/http/http_access.h new file mode 100644 index 00000000..afc2e1dc --- /dev/null +++ b/src/libnetdata/http/http_access.h @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_HTTP_ACCESS_H +#define NETDATA_HTTP_ACCESS_H + +typedef enum __attribute__((packed)) { + HTTP_USER_ROLE_NONE = 0, + HTTP_USER_ROLE_ADMIN = 1, + HTTP_USER_ROLE_MANAGER = 2, + HTTP_USER_ROLE_TROUBLESHOOTER = 3, + HTTP_USER_ROLE_OBSERVER = 4, + HTTP_USER_ROLE_MEMBER = 5, + HTTP_USER_ROLE_BILLING = 6, + HTTP_USER_ROLE_ANY = 7, + + // keep this list so that lower numbers are more strict access levels +} HTTP_USER_ROLE; +const char *http_id2user_role(HTTP_USER_ROLE role); +HTTP_USER_ROLE http_user_role2id(const char *role); + +typedef enum __attribute__((packed)) { + HTTP_ACCESS_NONE = 0, // adm man trb obs mem bil + HTTP_ACCESS_SIGNED_ID = (1 << 0), // User is authenticated A A A A A A + HTTP_ACCESS_SAME_SPACE = (1 << 1), // NC user+agent = same space A A A A A A + HTTP_ACCESS_COMMERCIAL_SPACE = (1 << 2), // NC P P P P P P + HTTP_ACCESS_ANONYMOUS_DATA = (1 << 3), // NC room:Read A A A SR SR - + HTTP_ACCESS_SENSITIVE_DATA = (1 << 4), // NC agent:ViewSensitiveData A A A - SR - + HTTP_ACCESS_VIEW_AGENT_CONFIG = (1 << 5), // NC agent:ReadDynCfg P P - - - - + HTTP_ACCESS_EDIT_AGENT_CONFIG = (1 << 6), // NC agent:EditDynCfg P P - - - - + HTTP_ACCESS_VIEW_NOTIFICATIONS_CONFIG = (1 << 7), // NC agent:ViewNotificationsConfig P - - - - - + HTTP_ACCESS_EDIT_NOTIFICATIONS_CONFIG = (1 << 8), // NC agent:EditNotificationsConfig P - - - - - + HTTP_ACCESS_VIEW_ALERTS_SILENCING = (1 << 9), // NC space:GetSystemSilencingRules A A A - A - + HTTP_ACCESS_EDIT_ALERTS_SILENCING = (1 << 10), // NC space:CreateSystemSilencingRule P P - - P - +} HTTP_ACCESS; // --------------------- + // A = always + // P = commercial plan + // SR = same room (Us+Ag) + +#define HTTP_ACCESS_FORMAT "0x%" PRIx32 +#define HTTP_ACCESS_FORMAT_CAST uint32_t + +#define HTTP_ACCESS_ALL (HTTP_ACCESS)( \ + HTTP_ACCESS_SIGNED_ID \ + | HTTP_ACCESS_SAME_SPACE \ + | HTTP_ACCESS_COMMERCIAL_SPACE \ + | HTTP_ACCESS_ANONYMOUS_DATA \ + | HTTP_ACCESS_SENSITIVE_DATA \ + | HTTP_ACCESS_VIEW_AGENT_CONFIG \ + | HTTP_ACCESS_EDIT_AGENT_CONFIG \ + | 
HTTP_ACCESS_VIEW_NOTIFICATIONS_CONFIG \ + | HTTP_ACCESS_EDIT_NOTIFICATIONS_CONFIG \ + | HTTP_ACCESS_VIEW_ALERTS_SILENCING \ + | HTTP_ACCESS_EDIT_ALERTS_SILENCING \ +) + +#define HTTP_ACCESS_MAP_OLD_ANY (HTTP_ACCESS)(HTTP_ACCESS_ANONYMOUS_DATA) + +#define HTTP_ACCESS_MAP_OLD_MEMBER (HTTP_ACCESS)( \ + HTTP_ACCESS_SIGNED_ID \ + | HTTP_ACCESS_SAME_SPACE \ + | HTTP_ACCESS_ANONYMOUS_DATA | HTTP_ACCESS_SENSITIVE_DATA) + +#define HTTP_ACCESS_MAP_OLD_ADMIN (HTTP_ACCESS)( \ + HTTP_ACCESS_SIGNED_ID \ + | HTTP_ACCESS_SAME_SPACE \ + | HTTP_ACCESS_ANONYMOUS_DATA | HTTP_ACCESS_SENSITIVE_DATA | HTTP_ACCESS_VIEW_AGENT_CONFIG \ + | HTTP_ACCESS_EDIT_AGENT_CONFIG \ +) + +HTTP_ACCESS http_access2id_one(const char *str); +HTTP_ACCESS http_access2id(char *str); +struct web_buffer; +void http_access2buffer_json_array(struct web_buffer *wb, const char *key, HTTP_ACCESS access); +void http_access2txt(char *buf, size_t size, const char *separator, HTTP_ACCESS access); +HTTP_ACCESS http_access_from_hex(const char *str); +HTTP_ACCESS http_access_from_hex_mapping_old_roles(const char *str); +HTTP_ACCESS http_access_from_source(const char *str); +bool log_cb_http_access_to_hex(struct web_buffer *wb, void *data); + +#define HTTP_ACCESS_PERMISSION_DENIED_HTTP_CODE(access) ((access & HTTP_ACCESS_SIGNED_ID) ? HTTP_RESP_FORBIDDEN : HTTP_RESP_PRECOND_FAIL) + +typedef enum __attribute__((packed)) { + HTTP_ACL_NONE = (0), + + HTTP_ACL_NOCHECK = (1 << 0), // Don't check anything - adding this to an endpoint, disables ACL checking + + // transports + HTTP_ACL_API = (1 << 1), // from the internal web server (TCP port) + HTTP_ACL_API_UDP = (1 << 2), // from the internal web server (UDP port) + HTTP_ACL_API_UNIX = (1 << 3), // from the internal web server (UNIX socket) + HTTP_ACL_H2O = (1 << 4), // from the h2o web server + HTTP_ACL_ACLK = (1 << 5), // from ACLK + HTTP_ACL_WEBRTC = (1 << 6), // from WebRTC + + // HTTP_ACL_API takes the following additional ACLs, based on pattern matching of the client IP + HTTP_ACL_DASHBOARD = (1 << 10), + HTTP_ACL_REGISTRY = (1 << 11), + HTTP_ACL_BADGES = (1 << 12), + HTTP_ACL_MANAGEMENT = (1 << 13), + HTTP_ACL_STREAMING = (1 << 14), + HTTP_ACL_NETDATACONF = (1 << 15), + + // SSL related + HTTP_ACL_SSL_OPTIONAL = (1 << 28), + HTTP_ACL_SSL_FORCE = (1 << 29), + HTTP_ACL_SSL_DEFAULT = (1 << 30), +} HTTP_ACL; + +#define HTTP_ACL_TRANSPORTS (HTTP_ACL)( \ + HTTP_ACL_API \ + | HTTP_ACL_API_UDP \ + | HTTP_ACL_API_UNIX \ + | HTTP_ACL_H2O \ + | HTTP_ACL_ACLK \ + | HTTP_ACL_WEBRTC \ +) + +#define HTTP_ACL_TRANSPORTS_WITHOUT_CLIENT_IP_VALIDATION (HTTP_ACL)( \ + HTTP_ACL_ACLK \ + | HTTP_ACL_WEBRTC \ +) + +#define HTTP_ACL_ALL_FEATURES (HTTP_ACL)( \ + HTTP_ACL_DASHBOARD \ + | HTTP_ACL_REGISTRY \ + | HTTP_ACL_BADGES \ + | HTTP_ACL_MANAGEMENT \ + | HTTP_ACL_STREAMING \ + | HTTP_ACL_NETDATACONF \ +) + +#ifdef NETDATA_DEV_MODE +#define ACL_DEV_OPEN_ACCESS HTTP_ACL_NOCHECK +#else +#define ACL_DEV_OPEN_ACCESS 0 +#endif + +#define http_can_access_dashboard(w) ((w)->acl & HTTP_ACL_DASHBOARD) +#define http_can_access_registry(w) ((w)->acl & HTTP_ACL_REGISTRY) +#define http_can_access_badges(w) ((w)->acl & HTTP_ACL_BADGES) +#define http_can_access_mgmt(w) ((w)->acl & HTTP_ACL_MANAGEMENT) +#define http_can_access_stream(w) ((w)->acl & HTTP_ACL_STREAMING) +#define http_can_access_netdataconf(w) ((w)->acl & HTTP_ACL_NETDATACONF) +#define http_is_using_ssl_optional(w) ((w)->port_acl & HTTP_ACL_SSL_OPTIONAL) +#define http_is_using_ssl_force(w) ((w)->port_acl & HTTP_ACL_SSL_FORCE) +#define http_is_using_ssl_default(w) 
((w)->port_acl & HTTP_ACL_SSL_DEFAULT) + +#endif //NETDATA_HTTP_ACCESS_H diff --git a/src/libnetdata/http/http_defs.c b/src/libnetdata/http/http_defs.c new file mode 100644 index 00000000..ef7621a6 --- /dev/null +++ b/src/libnetdata/http/http_defs.c @@ -0,0 +1,245 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +ENUM_STR_MAP_DEFINE(HTTP_REQUEST_MODE) = +{ + { .name = "OPTIONS", .id = HTTP_REQUEST_MODE_OPTIONS }, + { .name = "GET", .id = HTTP_REQUEST_MODE_GET }, + { .name = "FILECOPY", .id = HTTP_REQUEST_MODE_FILECOPY }, + { .name = "POST", .id = HTTP_REQUEST_MODE_POST }, + { .name = "PUT", .id = HTTP_REQUEST_MODE_PUT }, + { .name = "DELETE", .id = HTTP_REQUEST_MODE_DELETE }, + { .name = "STREAM", .id = HTTP_REQUEST_MODE_STREAM }, + + // terminator + { .name = NULL, .id = 0 } +}; + +ENUM_STR_DEFINE_FUNCTIONS(HTTP_REQUEST_MODE, 0, "UNKNOWN"); + +const char *http_response_code2string(int code) { + switch(code) { + case 100: + return "Continue"; + case 101: + return "Switching Protocols"; + case 102: + return "Processing"; + case 103: + return "Early Hints"; + + case 200: + return "OK"; + case 201: + return "Created"; + case 202: + return "Accepted"; + case 203: + return "Non-Authoritative Information"; + case 204: + return "No Content"; + case 205: + return "Reset Content"; + case 206: + return "Partial Content"; + case 207: + return "Multi-Status"; + case 208: + return "Already Reported"; + case 226: + return "IM Used"; + + case 300: + return "Multiple Choices"; + case 301: + return "Moved Permanently"; + case 302: + return "Found"; + case 303: + return "See Other"; + case 304: + return "Not Modified"; + case 305: + return "Use Proxy"; + case 306: + return "Switch Proxy"; + case 307: + return "Temporary Redirect"; + case 308: + return "Permanent Redirect"; + + case 400: + return "Bad Request"; + case 401: + return "Unauthorized"; + case 402: + return "Payment Required"; + case 403: + return "Forbidden"; + case 404: + return "Not Found"; + case 405: + return "Method Not Allowed"; + case 406: + return "Not Acceptable"; + case 407: + return "Proxy Authentication Required"; + case 408: + return "Request Timeout"; + case 409: + return "Conflict"; + case 410: + return "Gone"; + case 411: + return "Length Required"; + case 412: + return "Precondition Failed"; + case 413: + return "Payload Too Large"; + case 414: + return "URI Too Long"; + case 415: + return "Unsupported Media Type"; + case 416: + return "Range Not Satisfiable"; + case 417: + return "Expectation Failed"; + case 418: + return "I'm a teapot"; + case 421: + return "Misdirected Request"; + case 422: + return "Unprocessable Entity"; + case 423: + return "Locked"; + case 424: + return "Failed Dependency"; + case 425: + return "Too Early"; + case 426: + return "Upgrade Required"; + case 428: + return "Precondition Required"; + case 429: + return "Too Many Requests"; + case 431: + return "Request Header Fields Too Large"; + case 451: + return "Unavailable For Legal Reasons"; + case 499: // nginx's extension to the standard + return "Client Closed Request"; + + case 500: + return "Internal Server Error"; + case 501: + return "Not Implemented"; + case 502: + return "Bad Gateway"; + case 503: + return "Service Unavailable"; + case 504: + return "Gateway Timeout"; + case 505: + return "HTTP Version Not Supported"; + case 506: + return "Variant Also Negotiates"; + case 507: + return "Insufficient Storage"; + case 508: + return "Loop Detected"; + case 510: + return "Not Extended"; + case 511: + return "Network 
Authentication Required"; + + default: + if(code >= 100 && code < 200) + return "Informational"; + + if(code >= 200 && code < 300) + return "Successful"; + + if(code >= 300 && code < 400) + return "Redirection"; + + if(code >= 400 && code < 500) + return "Client Error"; + + if(code >= 500 && code < 600) + return "Server Error"; + + return "Undefined Error"; + } +} + + +static struct { + const char *extension; + uint32_t hash; + HTTP_CONTENT_TYPE contenttype; +} mime_types[] = { + { "html" , 0 , CT_TEXT_HTML } + , { "js" , 0 , CT_APPLICATION_X_JAVASCRIPT } + , { "css" , 0 , CT_TEXT_CSS } + , { "xml" , 0 , CT_TEXT_XML } + , { "xsl" , 0 , CT_TEXT_XSL } + , { "txt" , 0 , CT_TEXT_PLAIN } + , { "svg" , 0 , CT_IMAGE_SVG_XML } + , { "ttf" , 0 , CT_APPLICATION_X_FONT_TRUETYPE } + , { "otf" , 0 , CT_APPLICATION_X_FONT_OPENTYPE } + , { "woff2", 0 , CT_APPLICATION_FONT_WOFF2 } + , { "woff" , 0 , CT_APPLICATION_FONT_WOFF } + , { "eot" , 0 , CT_APPLICATION_VND_MS_FONTOBJ } + , { "png" , 0 , CT_IMAGE_PNG } + , { "jpg" , 0 , CT_IMAGE_JPG } + , { "jpeg" , 0 , CT_IMAGE_JPG } + , { "gif" , 0 , CT_IMAGE_GIF } + , { "bmp" , 0 , CT_IMAGE_BMP } + , { "ico" , 0 , CT_IMAGE_XICON } + , { "icns" , 0 , CT_IMAGE_ICNS } + + // terminator + , { NULL , 0 , 0 } +}; + +HTTP_CONTENT_TYPE contenttype_for_filename(const char *filename) { + // netdata_log_info("checking filename '%s'", filename); + + static int initialized = 0; + int i; + + if(unlikely(!initialized)) { + for (i = 0; mime_types[i].extension; i++) + mime_types[i].hash = simple_hash(mime_types[i].extension); + + initialized = 1; + } + + const char *s = filename, *last_dot = NULL; + + // find the last dot + while(*s) { + if(unlikely(*s == '.')) last_dot = s; + s++; + } + + if(unlikely(!last_dot || !*last_dot || !last_dot[1])) { + // netdata_log_info("no extension for filename '%s'", filename); + return CT_APPLICATION_OCTET_STREAM; + } + last_dot++; + + // netdata_log_info("extension for filename '%s' is '%s'", filename, last_dot); + + uint32_t hash = simple_hash(last_dot); + for(i = 0; mime_types[i].extension ; i++) { + if(unlikely(hash == mime_types[i].hash && !strcmp(last_dot, mime_types[i].extension))) { + // netdata_log_info("matched extension for filename '%s': '%s'", filename, last_dot); + return mime_types[i].contenttype; + } + } + + // netdata_log_info("not matched extension for filename '%s': '%s'", filename, last_dot); + return CT_APPLICATION_OCTET_STREAM; +} diff --git a/src/libnetdata/http/http_defs.h b/src/libnetdata/http/http_defs.h new file mode 100644 index 00000000..e1e26863 --- /dev/null +++ b/src/libnetdata/http/http_defs.h @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_HTTP_DEFS_H +#define NETDATA_HTTP_DEFS_H + +#define HTTP_1_1 " HTTP/1.1" +#define HTTP_HDR_END "\r\n\r\n" +#define HTTP_ENDL "\r\n" + +// HTTP_CODES 1XX +#define HTTP_RESP_SWITCH_PROTO 101 + +// HTTP_CODES 2XX Success +#define HTTP_RESP_OK 200 +#define HTTP_RESP_ACCEPTED 202 + +// HTTP_CODES 3XX Redirections +#define HTTP_RESP_MOVED_PERM 301 +#define HTTP_RESP_NOT_MODIFIED 304 +#define HTTP_RESP_REDIR_TEMP 307 +#define HTTP_RESP_REDIR_PERM 308 +#define HTTP_RESP_HTTPS_UPGRADE 399 + +// HTTP_CODES 4XX Client Errors +#define HTTP_RESP_BAD_REQUEST 400 +#define HTTP_RESP_NOT_FOUND 404 +#define HTTP_RESP_METHOD_NOT_ALLOWED 405 +#define HTTP_RESP_CONFLICT 409 +#define HTTP_RESP_CONTENT_TOO_LONG 413 + +#define HTTP_RESP_UNAUTHORIZED 401 // do not use 401 when responding to users - it is used by authenticating proxies +#define HTTP_RESP_FORBIDDEN 403 // 
not enough permissions to access this resource
+#define HTTP_RESP_PRECOND_FAIL 412 // An authorization bearer is required, but it was not found in the request
+#define HTTP_RESP_UNAVAILABLE_FOR_LEGAL_REASONS 451 // Unavailable For Legal Reasons, we use it instead of 403 when access is forbidden due to an ACL.
+
+#define HTTP_RESP_CLIENT_CLOSED_REQUEST 499 // nginx's extension to the standard
+
+// HTTP_CODES 5XX Server Errors
+#define HTTP_RESP_INTERNAL_SERVER_ERROR 500
+#define HTTP_RESP_NOT_IMPLEMENTED 501
+#define HTTP_RESP_SERVICE_UNAVAILABLE 503
+#define HTTP_RESP_GATEWAY_TIMEOUT 504
+#define HTTP_RESP_BACKEND_RESPONSE_INVALID 591
+
+typedef enum __attribute__((__packed__)) {
+    HTTP_REQUEST_MODE_NONE     = 0,
+    HTTP_REQUEST_MODE_GET      = 1,
+    HTTP_REQUEST_MODE_POST     = 2,
+    HTTP_REQUEST_MODE_PUT      = 3,
+    HTTP_REQUEST_MODE_DELETE   = 4,
+    HTTP_REQUEST_MODE_FILECOPY = 5,
+    HTTP_REQUEST_MODE_OPTIONS  = 6,
+    HTTP_REQUEST_MODE_STREAM   = 7,
+} HTTP_REQUEST_MODE;
+
+ENUM_STR_DEFINE_FUNCTIONS_EXTERN(HTTP_REQUEST_MODE);
+
+const char *http_response_code2string(int code);
+HTTP_CONTENT_TYPE contenttype_for_filename(const char *filename);
+
+#endif /* NETDATA_HTTP_DEFS_H */
diff --git a/src/libnetdata/inlined.h b/src/libnetdata/inlined.h
new file mode 100644
index 00000000..6b71590c
--- /dev/null
+++ b/src/libnetdata/inlined.h
@@ -0,0 +1,695 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef NETDATA_INLINED_H
+#define NETDATA_INLINED_H 1
+
+#include "libnetdata.h"
+
+#ifdef KERNEL_32BIT
+typedef uint32_t kernel_uint_t;
+#define str2kernel_uint_t(string) str2uint32_t(string, NULL)
+#define KERNEL_UINT_FORMAT "%u"
+#else
+typedef uint64_t kernel_uint_t;
+#define str2kernel_uint_t(string) str2uint64_t(string, NULL)
+#define KERNEL_UINT_FORMAT "%" PRIu64
+#endif
+
+#define str2pid_t(string) str2uint32_t(string, NULL)
+
+
+// for faster execution, allow the compiler to inline
+// these functions that are called thousands of times per second
+
+static inline uint32_t djb2_hash32(const char* name) {
+    unsigned char *s = (unsigned char *) name;
+    uint32_t hash = 5381;
+    while (*s)
+        hash = ((hash << 5) + hash) + (uint32_t) *s++; // hash * 33 + char
+    return hash;
+}
+
+static inline uint32_t pluginsd_parser_hash32(const char *name) {
+    unsigned char *s = (unsigned char *) name;
+    uint32_t hash = 0;
+    while (*s) {
+        hash <<= 5;
+        hash += *s++ - ' ';
+    }
+    return hash;
+}
+
+// https://stackoverflow.com/a/107657
+static inline uint32_t larson_hash32(const char *name) {
+    unsigned char *s = (unsigned char *) name;
+    uint32_t hash = 0;
+    while (*s)
+        hash = hash * 101 + (uint32_t) *s++;
+    return hash;
+}
+
+// http://isthe.com/chongo/tech/comp/fnv/
+static inline uint32_t fnv1_hash32(const char *name) {
+    unsigned char *s = (unsigned char *) name;
+    uint32_t hash = 0x811c9dc5;
+    while (*s) {
+        hash *= 0x01000193; // 16777619
+        hash ^= (uint32_t) *s++;
+    }
+    return hash;
+}
+
+// http://isthe.com/chongo/tech/comp/fnv/
+static inline uint32_t fnv1a_hash32(const char *name) {
+    unsigned char *s = (unsigned char *) name;
+    uint32_t hash = 0x811c9dc5;
+    while (*s) {
+        hash ^= (uint32_t) *s++;
+        hash *= 0x01000193; // 16777619
+    }
+    return hash;
+}
+
+static inline uint32_t fnv1a_uhash32(const char *name) {
+    unsigned char *s = (unsigned char *) name;
+    uint32_t hash = 0x811c9dc5, c;
+    while ((c = *s++)) {
+        if (unlikely(c >= 'A' && c <= 'Z')) c += 'a' - 'A';
+        hash ^= c;
+        hash *= 0x01000193; // 16777619
+    }
+    return hash;
+}
+
+#define simple_hash(s) fnv1a_hash32(s)
+#define simple_uhash(s) fnv1a_uhash32(s)
+
+static 
uint32_t murmur32(uint32_t k) __attribute__((const)); +static inline uint32_t murmur32(uint32_t k) { + k ^= k >> 16; + k *= 0x85ebca6b; + k ^= k >> 13; + k *= 0xc2b2ae35; + k ^= k >> 16; + + return k; +} + +static uint64_t murmur64(uint64_t k) __attribute__((const)); +static inline uint64_t murmur64(uint64_t k) { + k ^= k >> 33; + k *= 0xff51afd7ed558ccdUL; + k ^= k >> 33; + k *= 0xc4ceb9fe1a85ec53UL; + k ^= k >> 33; + + return k; +} + +static inline size_t indexing_partition(Word_t ptr, Word_t modulo) __attribute__((const)); +static inline size_t indexing_partition(Word_t ptr, Word_t modulo) { +#ifdef ENV64BIT + uint64_t hash = murmur64(ptr); + return hash % modulo; +#else + uint32_t hash = murmur32(ptr); + return hash % modulo; +#endif +} + +static inline unsigned int str2u(const char *s) { + unsigned int n = 0; + + while(*s >= '0' && *s <= '9') + n = n * 10 + (*s++ - '0'); + + return n; +} + +static inline int str2i(const char *s) { + if(unlikely(*s == '-')) { + s++; + return -(int) str2u(s); + } + else { + if(unlikely(*s == '+')) s++; + return (int) str2u(s); + } +} + +static inline unsigned long str2ul(const char *s) { + unsigned long n = 0; + + while(*s >= '0' && *s <= '9') + n = n * 10 + (*s++ - '0'); + + return n; +} + +static inline long str2l(const char *s) { + if(unlikely(*s == '-')) { + s++; + return -(long) str2ul(s); + } + else { + if(unlikely(*s == '+')) s++; + return (long) str2ul(s); + } +} + +static inline uint32_t str2uint32_t(const char *s, char **endptr) { + uint32_t n = 0; + + while(*s >= '0' && *s <= '9') + n = n * 10 + (*s++ - '0'); + + if(unlikely(endptr)) + *endptr = (char *)s; + + return n; +} + +static inline uint64_t str2uint64_t(const char *s, char **endptr) { + uint64_t n = 0; + +#ifdef ENV32BIT + unsigned long n32 = 0; + while (*s >= '0' && *s <= '9' && n32 < (ULONG_MAX / 10)) + n32 = n32 * 10 + (*s++ - '0'); + + n = n32; +#endif + + while(*s >= '0' && *s <= '9') + n = n * 10 + (*s++ - '0'); + + if(unlikely(endptr)) + *endptr = (char *)s; + + return n; +} + +static inline unsigned long long int str2ull(const char *s, char **endptr) { + return str2uint64_t(s, endptr); +} + +static inline long long str2ll(const char *s, char **endptr) { + if(unlikely(*s == '-')) { + s++; + return -(long long) str2uint64_t(s, endptr); + } + else { + if(unlikely(*s == '+')) s++; + return (long long) str2uint64_t(s, endptr); + } +} + +static inline uint32_t str2uint32_hex(const char *src, char **endptr) { + uint32_t num = 0; + const unsigned char *s = (const unsigned char *)src; + unsigned char c; + + while ((c = hex_value_from_ascii[(uint8_t)*s]) != 255) { + num = (num << 4) | c; + s++; + } + + if(endptr) + *endptr = (char *)s; + + return num; +} + +static inline uint64_t str2uint64_hex(const char *src, char **endptr) { + uint64_t num = 0; + const unsigned char *s = (const unsigned char *)src; + unsigned char c; + + while ((c = hex_value_from_ascii[(uint8_t)*s]) != 255) { + num = (num << 4) | c; + s++; + } + + if(endptr) + *endptr = (char *)s; + + return num; +} + +static inline uint64_t str2uint64_base64(const char *src, char **endptr) { + uint64_t num = 0; + const unsigned char *s = (const unsigned char *)src; + unsigned char c; + + while ((c = base64_value_from_ascii[*s]) != 255) { + num = (num << 6) | c; + s++; + } + + if(endptr) + *endptr = (char *)s; + + return num; +} + +static inline NETDATA_DOUBLE str2ndd_parse_double_decimal_digits_internal(const char *src, int *digits) { + const char *s = src; + NETDATA_DOUBLE n = 0.0; + + while(*s >= '0' && *s <= '9') { + + // this 
works for both 32-bit and 64-bit systems
+        unsigned long ni = 0;
+        unsigned exponent = 0;
+        while (*s >= '0' && *s <= '9' && ni < (ULONG_MAX / 10)) {
+            ni = (ni * 10) + (*s++ - '0');
+            exponent++;
+        }
+
+        n = n * powndd(10.0, exponent) + (NETDATA_DOUBLE)ni;
+    }
+
+    *digits = (int)(s - src);
+    return n;
+}
+
+static inline NETDATA_DOUBLE str2ndd(const char *src, char **endptr) {
+    const char *s = src;
+
+    NETDATA_DOUBLE sign = 1.0;
+    NETDATA_DOUBLE result;
+    int integral_digits = 0;
+
+    NETDATA_DOUBLE fractional = 0.0;
+    int fractional_digits = 0;
+
+    NETDATA_DOUBLE exponent = 0.0;
+    int exponent_digits = 0;
+
+    switch(*s) {
+        case '-':
+            s++;
+            sign = -1.0;
+            break;
+
+        case '+':
+            s++;
+            break;
+
+        case 'n':
+            if(s[1] == 'a' && s[2] == 'n') {
+                if(endptr) *endptr = (char *)&s[3];
+                return NAN;
+            }
+            if(s[1] == 'u' && s[2] == 'l' && s[3] == 'l') {
+                if(endptr) *endptr = (char *)&s[4]; // consume all 4 characters of "null"
+                return NAN;
+            }
+            break;
+
+        case 'i':
+            if(s[1] == 'n' && s[2] == 'f') {
+                if(endptr) *endptr = (char *)&s[3];
+                return INFINITY;
+            }
+            break;
+
+        default:
+            break;
+    }
+
+    result = str2ndd_parse_double_decimal_digits_internal(s, &integral_digits);
+    s += integral_digits;
+
+    if(unlikely(*s == '.')) {
+        s++;
+        fractional = str2ndd_parse_double_decimal_digits_internal(s, &fractional_digits);
+        s += fractional_digits;
+    }
+
+    if (unlikely(*s == 'e' || *s == 'E')) {
+        const char *e_ptr = s;
+        s++;
+
+        int exponent_sign = 1;
+        if (*s == '-') {
+            exponent_sign = -1;
+            s++;
+        }
+        else if(*s == '+')
+            s++;
+
+        exponent = str2ndd_parse_double_decimal_digits_internal(s, &exponent_digits);
+        if(unlikely(!exponent_digits)) {
+            exponent = 0;
+            s = e_ptr;
+        }
+        else {
+            s += exponent_digits;
+            exponent *= exponent_sign;
+        }
+    }
+
+    if(unlikely(endptr))
+        *endptr = (char *)s;
+
+    if (unlikely(exponent_digits))
+        result *= powndd(10.0, exponent);
+
+    if (unlikely(fractional_digits))
+        result += fractional / powndd(10.0, fractional_digits) * (exponent_digits ? powndd(10.0, exponent) : 1.0);
+
+    return sign * result;
+}
+
+static inline unsigned long long str2ull_encoded(const char *s) {
+    if(*s == IEEE754_UINT64_B64_PREFIX[0])
+        return str2uint64_base64(s + sizeof(IEEE754_UINT64_B64_PREFIX) - 1, NULL);
+
+    if(s[0] == HEX_PREFIX[0] && s[1] == HEX_PREFIX[1])
+        return str2uint64_hex(s + 2, NULL);
+
+    return str2uint64_t(s, NULL);
+}
+
+static inline long long str2ll_encoded(const char *s) {
+    if(*s == '-')
+        return -(long long) str2ull_encoded(&s[1]);
+    else
+        return (long long) str2ull_encoded(s);
+}
+
+static inline NETDATA_DOUBLE str2ndd_encoded(const char *src, char **endptr) {
+    if (*src == IEEE754_DOUBLE_B64_PREFIX[0]) {
+        // double parsing from base64
+        uint64_t n = str2uint64_base64(src + sizeof(IEEE754_DOUBLE_B64_PREFIX) - 1, endptr);
+        NETDATA_DOUBLE *ptr = (NETDATA_DOUBLE *) (&n);
+        return *ptr;
+    }
+
+    if (*src == IEEE754_DOUBLE_HEX_PREFIX[0]) {
+        // double parsing from hex
+        uint64_t n = str2uint64_hex(src + sizeof(IEEE754_DOUBLE_HEX_PREFIX) - 1, endptr);
+        NETDATA_DOUBLE *ptr = (NETDATA_DOUBLE *) (&n);
+        return *ptr;
+    }
+
+    double sign = 1.0;
+
+    if(*src == '-') {
+        sign = -1.0;
+        src++;
+    }
+
+    if(unlikely(*src == IEEE754_UINT64_B64_PREFIX[0]))
+        return (NETDATA_DOUBLE) str2uint64_base64(src + sizeof(IEEE754_UINT64_B64_PREFIX) - 1, endptr) * sign;
+
+    if(unlikely(*src == HEX_PREFIX[0] && src[1] == HEX_PREFIX[1]))
+        return (NETDATA_DOUBLE) str2uint64_hex(src + sizeof(HEX_PREFIX) - 1, endptr) * sign;
+
+    return str2ndd(src, endptr) * sign;
+}
+
+static inline char *strncpyz(char *dst, const char *src, size_t dst_size_minus_1) {
+    char *p = dst;
+
+    while (*src && dst_size_minus_1--)
+        *dst++ = *src++;
+
+    *dst = '\0';
+
+    return p;
+}
+
+static inline void sanitize_json_string(char *dst, const char *src, size_t dst_size) {
+    while (*src != '\0' && dst_size > 1) {
+        if (*src < 0x1F) {
+            *dst++ = '_';
+            src++;
+            dst_size--;
+        }
+        else if (*src == '\\' || *src == '\"') {
+            *dst++ = '\\';
+            *dst++ = *src++;
+            dst_size -= 2;
+        }
+        else {
+            *dst++ = *src++;
+            dst_size--;
+        }
+    }
+    *dst = '\0';
+}
+
+static inline bool sanitize_command_argument_string(char *dst, const char *src, size_t dst_size) {
+    if(dst_size)
+        *dst = '\0';
+
+    // skip leading dashes
+    while (*src == '-')
+        src++;
+
+    while (*src != '\0') {
+        if (dst_size < 1)
+            return false;
+
+        if (iscntrl((uint8_t)*src) || *src == '$') {
+            // remove control characters and characters that are expanded by bash
+            *dst++ = '_';
+            dst_size--;
+        }
+        else if (*src == '\'' || *src == '`') {
+            // escape single quotes and backticks
+            if (dst_size < 4)
+                return false;
+
+            dst[0] = '\''; dst[1] = '\\'; dst[2] = '\''; dst[3] = '\'';
+
+            dst += 4;
+            dst_size -= 4;
+        }
+        else {
+            *dst++ = *src;
+            dst_size--;
+        }
+
+        src++;
+    }
+
+    // make sure we have space to terminate the string
+    if (dst_size == 0)
+        return false;
+
+    *dst = '\0';
+
+    return true;
+}
+
+static inline int read_txt_file(const char *filename, char *buffer, size_t size) {
+    if(unlikely(!size)) return 3;
+
+    int fd = open(filename, O_RDONLY | O_CLOEXEC, 0666);
+    if(unlikely(fd == -1)) {
+        buffer[0] = '\0';
+        return 1;
+    }
+
+    ssize_t r = read(fd, buffer, size - 1); // leave space for the final zero
+    if(unlikely(r == -1)) {
+        buffer[0] = '\0';
+        close(fd);
+        return 2;
+    }
+    buffer[r] = '\0';
+
+    close(fd);
+    return 0;
+}
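+
+/*
+ * Illustrative sketch (not part of the build): the read_* helpers that
+ * follow share one pattern - they zero the output on failure and return a
+ * non-zero error code, so callers can test a single value:
+ *
+ *     unsigned long long value;
+ *     if (read_single_number_file("/proc/sys/kernel/pid_max", &value) == 0)
+ *         use(value); // use() is a hypothetical consumer
+ */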
+static inline int read_proc_cmdline(const char *filename, char *buffer, size_t size) {
+    if (unlikely(!size)) return 3;
+
+    int fd = open(filename, O_RDONLY | O_CLOEXEC, 0666);
+    if (unlikely(fd == -1)) {
+        buffer[0] = '\0';
+        return 1;
+    }
+
+    ssize_t r = read(fd, buffer, size - 1); // Leave space for final null character
+    if (unlikely(r == -1)) {
+        buffer[0] = '\0';
+        close(fd);
+        return 2;
+    }
+
+    if (r > 0) {
+        // Replace null characters with spaces, except for the last one
+        for (ssize_t i = 0; i < r - 1; i++) {
+            if (buffer[i] == '\0') {
+                buffer[i] = ' ';
+            }
+        }
+        buffer[r] = '\0'; // Null-terminate the string
+    }
+    else {
+        buffer[0] = '\0'; // Empty cmdline
+    }
+
+    close(fd);
+    return 0;
+}
+
+static inline int read_single_number_file(const char *filename, unsigned long long *result) {
+    char buffer[30 + 1];
+
+    int ret = read_txt_file(filename, buffer, sizeof(buffer));
+    if(unlikely(ret)) {
+        *result = 0;
+        return ret;
+    }
+
+    buffer[30] = '\0';
+    *result = str2ull(buffer, NULL);
+    return 0;
+}
+
+static inline int read_single_signed_number_file(const char *filename, long long *result) {
+    char buffer[30 + 1];
+
+    int ret = read_txt_file(filename, buffer, sizeof(buffer));
+    if(unlikely(ret)) {
+        *result = 0;
+        return ret;
+    }
+
+    buffer[30] = '\0';
+    *result = atoll(buffer);
+    return 0;
+}
+
+static inline int read_single_base64_or_hex_number_file(const char *filename, unsigned long long *result) {
+    char buffer[30 + 1];
+
+    int ret = read_txt_file(filename, buffer, sizeof(buffer));
+    if(unlikely(ret)) {
+        *result = 0;
+        return ret;
+    }
+
+    buffer[30] = '\0';
+
+    if(likely(buffer[0])){
+        *result = str2ull_encoded(buffer);
+        return 0;
+    }
+    else {
+        *result = 0;
+        return -1;
+    }
+}
+
+static inline char *strsep_skip_consecutive_separators(char **ptr, char *s) {
+    char *p = (char *)"";
+    while (p && !p[0] && *ptr) p = strsep(ptr, s);
+    return (p);
+}
+
+// remove leading and trailing spaces; may return NULL
+static inline char *trim(char *s) {
+    // skip leading spaces
+    while (*s && isspace((uint8_t)*s)) s++;
+    if (!*s) return NULL;
+
+    // skip trailing spaces
+    // this approach is faster: it writes only a single NUL character
+    ssize_t l = (ssize_t)strlen(s);
+    if (--l >= 0) {
+        char *p = s + l;
+        while (p > s && isspace((uint8_t)*p)) p--;
+        *++p = '\0';
+    }
+
+    if (!*s) return NULL;
+
+    return s;
+}
+
+// like trim(), but also remove duplicate spaces inside the string; may return NULL
+static inline char *trim_all(char *buffer) {
+    char *d = buffer, *s = buffer;
+
+    // skip spaces
+    while(isspace((uint8_t)*s)) s++;
+
+    while(*s) {
+        // copy the non-space part
+        while(*s && !isspace((uint8_t)*s)) *d++ = *s++;
+
+        // add a space if we have to
+        if(*s && isspace((uint8_t)*s)) {
+            *d++ = ' ';
+            s++;
+        }
+
+        // skip spaces
+        while(isspace((uint8_t)*s)) s++;
+    }
+
+    *d = '\0';
+
+    if(d > buffer) {
+        d--;
+        if(isspace((uint8_t)*d)) *d = '\0';
+    }
+
+    if(!buffer[0]) return NULL;
+    return buffer;
+}
+
+static inline bool streq(const char *a, const char *b) {
+    if (a == b)
+        return true;
+
+    if (a == NULL || b == NULL)
+        return false;
+
+    return strcmp(a, b) == 0;
+}
+
+static inline bool strstartswith(const char *string, const char *prefix) {
+    if (string == NULL || prefix == NULL)
+        return false;
+
+    size_t string_len = strlen(string);
+    size_t prefix_len = strlen(prefix);
+
+    if (prefix_len > string_len)
+        return false;
+
+    return strncmp(string, prefix, prefix_len) == 0;
+}
+
+static inline bool strendswith(const char *string, const char *suffix) {
+    if (string == NULL || suffix == NULL)
+        return false;
+
+    size_t string_len = strlen(string);
+    size_t suffix_len = strlen(suffix);
+
+    if (suffix_len > string_len)
+        return false;
+
+    return strcmp(string + string_len - suffix_len, suffix) == 0;
+}
+
+static inline bool strendswith_lengths(const char *string, size_t string_len, const char *suffix, size_t suffix_len) {
+    if (string == NULL || suffix == NULL)
+        return false;
+
+    if (suffix_len > string_len)
+        return false;
+
+    return strcmp(string + string_len - suffix_len, suffix) == 0;
+}
+
+#endif //NETDATA_INLINED_H
diff --git a/src/libnetdata/json/README.md b/src/libnetdata/json/README.md
new file mode 100644
index 00000000..9ae5ff38
--- /dev/null
+++ b/src/libnetdata/json/README.md
@@ -0,0 +1,14 @@
+<!--
+title: "json"
+custom_edit_url: https://github.com/netdata/netdata/edit/master/src/libnetdata/json/README.md
+sidebar_label: "json"
+learn_status: "Published"
+learn_topic_type: "Tasks"
+learn_rel_path: "Developers/libnetdata"
+-->
+
+# json
+
+`json` contains a parser for JSON strings, based on `jsmn` (<https://github.com/zserge/jsmn>). In case you have the JSON-C library installed, the installation script will prefer it; you can also force its use with `--enable-jsonc` at build time.
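+
+As a minimal, illustrative sketch of the `jsmn` build path (using only the API declared in `jsmn.h`):
+
+```c
+#include <string.h>
+#include "jsmn.h"
+
+int example(void) {
+    char js[] = "{\"key\": \"value\"}";
+    jsmn_parser parser;
+    jsmntok_t tokens[8];
+
+    jsmn_init(&parser);
+
+    // returns the number of parsed tokens, or a negative jsmnerr_t on failure
+    jsmnerr_t rc = jsmn_parse(&parser, js, strlen(js), tokens, 8);
+
+    // on success: tokens[0] is the object, tokens[1] is "key", tokens[2] is "value"
+    return rc < 0 ? 1 : 0;
+}
+```
+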
diff --git a/src/libnetdata/json/jsmn.c b/src/libnetdata/json/jsmn.c
new file mode 100644
index 00000000..2f48bd65
--- /dev/null
+++ b/src/libnetdata/json/jsmn.c
@@ -0,0 +1,328 @@
+#include <stdlib.h>
+
+#include "jsmn.h"
+
+/**
+ * Alloc token
+ *
+ * Allocates a fresh unused token from the token pool.
+ *
+ * @param parser the controller
+ * @param tokens the token array being filled
+ * @param num_tokens the total number of tokens.
+ *
+ * @return it returns the next free token, or NULL when the pool is exhausted.
+ */
+static jsmntok_t *jsmn_alloc_token(jsmn_parser *parser,
+                                   jsmntok_t *tokens, size_t num_tokens) {
+    jsmntok_t *tok;
+    if (parser->toknext >= num_tokens) {
+        return NULL;
+    }
+    tok = &tokens[parser->toknext++];
+    tok->start = tok->end = -1;
+    tok->size = 0;
+#ifdef JSMN_PARENT_LINKS
+    tok->parent = -1;
+#endif
+    return tok;
+}
+
+/**
+ * Fill Token
+ *
+ * Fills token type and boundaries.
+ * + * @param token the structure to set the values + * @param type is the token type + * @param start is the first position of the value + * @param end is the end of the value + */ +static void jsmn_fill_token(jsmntok_t *token, jsmntype_t type, + int start, int end) { + token->type = type; + token->start = start; + token->end = end; + token->size = 0; +} + +/** + * Parse primitive + * + * Fills next available token with JSON primitive. + * + * @param parser is the control structure + * @param js is the json string + * @param type is the token type + */ +static jsmnerr_t jsmn_parse_primitive(jsmn_parser *parser, const char *js, + size_t len, jsmntok_t *tokens, size_t num_tokens) { + jsmntok_t *token; + int start; + + start = parser->pos; + + for (; parser->pos < len && js[parser->pos] != '\0'; parser->pos++) { + switch (js[parser->pos]) { +#ifndef JSMN_STRICT + /* In strict mode primitive must be followed by "," or "}" or "]" */ + case ':': +#endif + case '\t' : case '\r' : case '\n' : case ' ' : + case ',' : case ']' : case '}' : + goto found; + } + if (js[parser->pos] < 32 || js[parser->pos] >= 127) { + parser->pos = start; + return JSMN_ERROR_INVAL; + } + } +#ifdef JSMN_STRICT + /* In strict mode primitive must be followed by a comma/object/array */ + parser->pos = start; + return JSMN_ERROR_PART; +#endif + + found: + if (tokens == NULL) { + parser->pos--; + return 0; + } + token = jsmn_alloc_token(parser, tokens, num_tokens); + if (token == NULL) { + parser->pos = start; + return JSMN_ERROR_NOMEM; + } + jsmn_fill_token(token, JSMN_PRIMITIVE, start, parser->pos); +#ifdef JSMN_PARENT_LINKS + token->parent = parser->toksuper; +#endif + parser->pos--; + return 0; +} + +/** + * Parse string + * + * Fills next token with JSON string. + * + * @param parser is the control structure + * @param js is the json string + * @param len is the js length + * @param tokens is structure with the tokens mapped. 
+ * @param num_tokens is the total number of tokens
+ *
+ * @return It returns 0 on success and another integer otherwise
+ */
+static jsmnerr_t jsmn_parse_string(jsmn_parser *parser, const char *js,
+                                   size_t len, jsmntok_t *tokens, size_t num_tokens) {
+    jsmntok_t *token;
+
+    int start = parser->pos;
+
+    parser->pos++;
+
+    /* Skip starting quote */
+    for (; parser->pos < len && js[parser->pos] != '\0'; parser->pos++) {
+        char c = js[parser->pos];
+
+        /* Quote: end of string */
+        if (c == '\"') {
+            if (tokens == NULL) {
+                return 0;
+            }
+            token = jsmn_alloc_token(parser, tokens, num_tokens);
+            if (token == NULL) {
+                parser->pos = start;
+                return JSMN_ERROR_NOMEM;
+            }
+            jsmn_fill_token(token, JSMN_STRING, start+1, parser->pos);
+#ifdef JSMN_PARENT_LINKS
+            token->parent = parser->toksuper;
+#endif
+            return 0;
+        }
+
+        /* Backslash: Quoted symbol expected */
+        if (c == '\\') {
+            parser->pos++;
+            switch (js[parser->pos]) {
+                /* Allowed escaped symbols */
+                case '\"': case '/' : case '\\' : case 'b' :
+                case 'f' : case 'r' : case 'n'  : case 't' :
+                    break;
+                /* Allows escaped symbol \uXXXX */
+                case 'u':
+                    parser->pos++;
+                    int i = 0;
+                    for(; i < 4 && js[parser->pos] != '\0'; i++) {
+                        /* If it isn't a hex character we have an error */
+                        if(!((js[parser->pos] >= 48 && js[parser->pos] <= 57) ||   /* 0-9 */
+                             (js[parser->pos] >= 65 && js[parser->pos] <= 70) ||   /* A-F */
+                             (js[parser->pos] >= 97 && js[parser->pos] <= 102))) { /* a-f */
+                            parser->pos = start;
+                            return JSMN_ERROR_INVAL;
+                        }
+                        parser->pos++;
+                    }
+                    parser->pos--;
+                    break;
+                /* Unexpected symbol */
+                default:
+                    parser->pos = start;
+                    return JSMN_ERROR_INVAL;
+            }
+        }
+    }
+    parser->pos = start;
+    return JSMN_ERROR_PART;
+}
+
+/**
+ * JSMN Parse
+ *
+ * Parse JSON string and fill tokens.
+ *
+ * @param parser the parser state used to parse
+ * @param js the string to parse
+ * @param len the string length
+ * @param tokens the place to map the tokens
+ * @param num_tokens the number of tokens present in the tokens structure.
+ *
+ * @return It returns the number of tokens present in the string on success or a negative number otherwise
+ */
+jsmnerr_t jsmn_parse(jsmn_parser *parser, const char *js, size_t len,
+                     jsmntok_t *tokens, unsigned int num_tokens) {
+    jsmnerr_t r;
+    int i;
+    jsmntok_t *token;
+    int count = 0;
+
+    for (; parser->pos < len && js[parser->pos] != '\0'; parser->pos++) {
+        char c;
+        jsmntype_t type;
+
+        c = js[parser->pos];
+        switch (c) {
+            case '{': case '[':
+                count++;
+                if (tokens == NULL) {
+                    break;
+                }
+                token = jsmn_alloc_token(parser, tokens, num_tokens);
+                if (token == NULL)
+                    return JSMN_ERROR_NOMEM;
+                if (parser->toksuper != -1) {
+                    tokens[parser->toksuper].size++;
+#ifdef JSMN_PARENT_LINKS
+                    token->parent = parser->toksuper;
+#endif
+                }
+                token->type = (c == '{' ? JSMN_OBJECT : JSMN_ARRAY);
+                token->start = parser->pos;
+                parser->toksuper = parser->toknext - 1;
+                break;
+            case '}': case ']':
+                if (tokens == NULL)
+                    break;
+                type = (c == '}' ? 
JSMN_OBJECT : JSMN_ARRAY); +#ifdef JSMN_PARENT_LINKS + if (parser->toknext < 1) { + return JSMN_ERROR_INVAL; + } + token = &tokens[parser->toknext - 1]; + for (;;) { + if (token->start != -1 && token->end == -1) { + if (token->type != type) { + return JSMN_ERROR_INVAL; + } + token->end = parser->pos + 1; + parser->toksuper = token->parent; + break; + } + if (token->parent == -1) { + break; + } + token = &tokens[token->parent]; + } +#else + for (i = parser->toknext - 1; i >= 0; i--) { + token = &tokens[i]; + if (token->start != -1 && token->end == -1) { + if (token->type != type) { + return JSMN_ERROR_INVAL; + } + parser->toksuper = -1; + token->end = parser->pos + 1; + break; + } + } + /* Error if unmatched closing bracket */ + if (i == -1) return JSMN_ERROR_INVAL; + for (; i >= 0; i--) { + token = &tokens[i]; + if (token->start != -1 && token->end == -1) { + parser->toksuper = i; + break; + } + } +#endif + break; + case '\"': + r = jsmn_parse_string(parser, js, len, tokens, num_tokens); + if (r < 0) return r; + count++; + if (parser->toksuper != -1 && tokens != NULL) + tokens[parser->toksuper].size++; + break; + case '\t' : case '\r' : case '\n' : case ':' : case ',': case ' ': + break; +#ifdef JSMN_STRICT + /* In strict mode primitives are: numbers and booleans */ + case '-': case '0': case '1' : case '2': case '3' : case '4': + case '5': case '6': case '7' : case '8': case '9': + case 't': case 'f': case 'n' : +#else + /* In non-strict mode every unquoted value is a primitive */ + default: +#endif + r = jsmn_parse_primitive(parser, js, len, tokens, num_tokens); + if (r < 0) return r; + count++; + if (parser->toksuper != -1 && tokens != NULL) + tokens[parser->toksuper].size++; + break; + +#ifdef JSMN_STRICT + /* Unexpected char in strict mode */ + default: + return JSMN_ERROR_INVAL; +#endif + } + } + + if (tokens) { + for (i = parser->toknext - 1; i >= 0; i--) { + /* Unmatched opened object or array */ + if (tokens[i].start != -1 && tokens[i].end == -1) { + return JSMN_ERROR_PART; + } + } + } + + return count; +} + +/** + * JSMN Init + * + * Creates a new parser based over a given buffer with an array of tokens + * available. + * + * @param parser is the structure with values to reset + */ +void jsmn_init(jsmn_parser *parser) { + parser->pos = 0; + parser->toknext = 0; + parser->toksuper = -1; +}
\ No newline at end of file
diff --git a/src/libnetdata/json/jsmn.h b/src/libnetdata/json/jsmn.h
new file mode 100644
index 00000000..beff586c
--- /dev/null
+++ b/src/libnetdata/json/jsmn.h
@@ -0,0 +1,75 @@
+#ifndef __JSMN_H_
+#define __JSMN_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stddef.h>
+/**
+ * JSON type identifier. Basic types are:
+ *  o Object
+ *  o Array
+ *  o String
+ *  o Other primitive: number, boolean (true/false) or null
+ */
+typedef enum {
+    JSMN_PRIMITIVE = 0,
+    JSMN_OBJECT = 1,
+    JSMN_ARRAY = 2,
+    JSMN_STRING = 3
+} jsmntype_t;
+
+typedef enum {
+    /* Not enough tokens were provided */
+    JSMN_ERROR_NOMEM = -1,
+    /* Invalid character inside JSON string */
+    JSMN_ERROR_INVAL = -2,
+    /* The string is not a full JSON packet, more bytes expected */
+    JSMN_ERROR_PART = -3,
+} jsmnerr_t;
+
+/**
+ * JSON token description.
+ *
+ * @param type   type (object, array, string etc.)
+ * @param start  start position in JSON data string
+ * @param end    end position in JSON data string
+ */
+typedef struct {
+    jsmntype_t type;
+    int start;
+    int end;
+    int size;
+#ifdef JSMN_PARENT_LINKS
+    int parent;
+#endif
+} jsmntok_t;
+
+/**
+ * JSON parser. Contains an array of token blocks available. Also stores
+ * the string being parsed now and current position in that string
+ */
+typedef struct {
+    unsigned int pos; /* offset in the JSON string */
+    unsigned int toknext; /* next token to allocate */
+    int toksuper; /* superior token node, e.g. parent object or array */
+} jsmn_parser;
+
+/**
+ * Create JSON parser over an array of tokens
+ */
+void jsmn_init(jsmn_parser *parser);
+
+/**
+ * Run JSON parser. It parses a JSON data string into an array of tokens, each describing
+ * a single JSON object.
+ */
+jsmnerr_t jsmn_parse(jsmn_parser *parser, const char *js, size_t len,
+                     jsmntok_t *tokens, unsigned int num_tokens);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __JSMN_H_ */
\ No newline at end of file diff --git a/src/libnetdata/json/json-c-parser-inline.h b/src/libnetdata/json/json-c-parser-inline.h new file mode 100644 index 00000000..543612a2 --- /dev/null +++ b/src/libnetdata/json/json-c-parser-inline.h @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_JSON_C_PARSER_INLINE_H +#define NETDATA_JSON_C_PARSER_INLINE_H + +#define JSONC_PARSE_BOOL_OR_ERROR_AND_RETURN(jobj, path, member, dst, error, required) do { \ + json_object *_j; \ + if (json_object_object_get_ex(jobj, member, &_j) && json_object_is_type(_j, json_type_boolean)) \ + dst = json_object_get_boolean(_j); \ + else if(required) { \ + buffer_sprintf(error, "missing or invalid type for '%s.%s' boolean", path, member); \ + return false; \ + } \ +} while(0) + +#define JSONC_PARSE_TXT2STRING_OR_ERROR_AND_RETURN(jobj, path, member, dst, error, required) do { \ + json_object *_j; \ + if (json_object_object_get_ex(jobj, member, &_j) && json_object_is_type(_j, json_type_string)) { \ + string_freez(dst); \ + dst = string_strdupz(json_object_get_string(_j)); \ + } \ + else if(required) { \ + buffer_sprintf(error, "missing or invalid type for '%s.%s' string", path, member); \ + return false; \ + } \ +} while(0) + +#define JSONC_PARSE_TXT2BUFFER_OR_ERROR_AND_RETURN(jobj, path, member, dst, error, required) do { \ + json_object *_j; \ + if (json_object_object_get_ex(jobj, member, &_j) && json_object_is_type(_j, json_type_string)) { \ + const char *_s = json_object_get_string(_j); \ + if(!_s || !*_s) { \ + buffer_free(dst); \ + dst = NULL; \ + } \ + else { \ + if (dst) \ + buffer_flush(dst); \ + else \ + dst = buffer_create(0, NULL); \ + if (_s && *_s) \ + buffer_strcat(dst, _s); \ + } \ + } \ + else if(required) { \ + buffer_sprintf(error, "missing or invalid type for '%s.%s' string", path, member); \ + return false; \ + } \ +} while(0) + +#define JSONC_PARSE_TXT2PATTERN_OR_ERROR_AND_RETURN(jobj, path, member, dst, error, required) do { \ + json_object *_j; \ + if (json_object_object_get_ex(jobj, member, &_j) && json_object_is_type(_j, json_type_string)) { \ + string_freez(dst); \ + const char *_v = json_object_get_string(_j); \ + if(strcmp(_v, "*") == 0) \ + dst = NULL; \ + else \ + dst = string_strdupz(_v); \ + } \ + else if(required) { \ + buffer_sprintf(error, "missing or invalid type for '%s.%s' string", path, member); \ + return false; \ + } \ +} while(0) + +#define JSONC_PARSE_TXT2EXPRESSION_OR_ERROR_AND_RETURN(jobj, path, member, dst, error, required) do { \ + json_object *_j; \ + if (json_object_object_get_ex(jobj, member, &_j) && json_object_is_type(_j, json_type_string)) { \ + const char *_t = json_object_get_string(_j); \ + if(_t && *_t && strcmp(_t, "*") != 0) { \ + const char *_failed_at = NULL; \ + int _err = 0; \ + expression_free(dst); \ + dst = expression_parse(_t, &_failed_at, &_err); \ + if(!dst) { \ + buffer_sprintf(error, "expression '%s.%s' has a non-parseable expression '%s': %s at '%s'", \ + path, member, _t, expression_strerror(_err), _failed_at); \ + return false; \ + } \ + } \ + } \ + else if(required) { \ + buffer_sprintf(error, "missing or invalid type for '%s.%s' expression", path, member); \ + return false; \ + } \ +} while(0) + +#define JSONC_PARSE_ARRAY_OF_TXT2BITMAP_OR_ERROR_AND_RETURN(jobj, path, member, converter, dst, error, required) do { \ + json_object *_jarray; \ + if (json_object_object_get_ex(jobj, member, &_jarray) && json_object_is_type(_jarray, json_type_array)) { \ + size_t _num_options = json_object_array_length(_jarray); \ + dst 
= 0; \ + for (size_t _i = 0; _i < _num_options; ++_i) { \ + json_object *_joption = json_object_array_get_idx(_jarray, _i); \ + if (!json_object_is_type(_joption, json_type_string)) { \ + buffer_sprintf(error, "invalid type for '%s.%s' at index %zu", path, member, _i); \ + return false; \ + } \ + const char *_option_str = json_object_get_string(_joption); \ + typeof(dst) _bit = converter(_option_str); \ + if (_bit == 0) { \ + buffer_sprintf(error, "unknown option '%s' in '%s.%s' at index %zu", _option_str, path, member, _i); \ + return false; \ + } \ + dst |= _bit; \ + } \ + } else if(required) { \ + buffer_sprintf(error, "missing or invalid type for '%s.%s' array", path, member); \ + return false; \ + } \ +} while(0) + + +#define JSONC_PARSE_TXT2ENUM_OR_ERROR_AND_RETURN(jobj, path, member, converter, dst, error, required) do { \ + json_object *_j; \ + if (json_object_object_get_ex(jobj, member, &_j) && json_object_is_type(_j, json_type_string)) \ + dst = converter(json_object_get_string(_j)); \ + else if(required) { \ + buffer_sprintf(error, "missing or invalid type (expected text value) for '%s.%s' enum", path, member); \ + return false; \ + } \ +} while(0) + +#define JSONC_PARSE_INT_OR_ERROR_AND_RETURN(jobj, path, member, dst, error, required) do { \ + json_object *_j; \ + if (json_object_object_get_ex(jobj, member, &_j)) { \ + if (_j != NULL && json_object_is_type(_j, json_type_int)) \ + dst = json_object_get_int(_j); \ + else if (_j != NULL && json_object_is_type(_j, json_type_double)) \ + dst = (typeof(dst))json_object_get_double(_j); \ + else if (_j == NULL) \ + dst = 0; \ + else { \ + buffer_sprintf(error, "not supported type (expected int) for '%s.%s'", path, member); \ + return false; \ + } \ + } else if(required) { \ + buffer_sprintf(error, "missing or invalid type (expected double value or null) for '%s.%s'", path, member); \ + return false; \ + } \ +} while(0) + +#define JSONC_PARSE_DOUBLE_OR_ERROR_AND_RETURN(jobj, path, member, dst, error, required) do { \ + json_object *_j; \ + if (json_object_object_get_ex(jobj, member, &_j)) { \ + if (_j != NULL && json_object_is_type(_j, json_type_double)) \ + dst = json_object_get_double(_j); \ + else if (_j != NULL && json_object_is_type(_j, json_type_int)) \ + dst = (typeof(dst))json_object_get_int(_j); \ + else if (_j == NULL) \ + dst = NAN; \ + else { \ + buffer_sprintf(error, "not supported type (expected double) for '%s.%s'", path, member); \ + return false; \ + } \ + } else if(required) { \ + buffer_sprintf(error, "missing or invalid type (expected double value or null) for '%s.%s'", path, member); \ + return false; \ + } \ +} while(0) + +#define JSONC_PARSE_SUBOBJECT(jobj, path, member, dst, callback, error, required) do { \ + json_object *_j; \ + if (json_object_object_get_ex(jobj, member, &_j)) { \ + char _new_path[strlen(path) + strlen(member) + 2]; \ + snprintfz(_new_path, sizeof(_new_path), "%s%s%s", path, *path?".":"", member); \ + if (!callback(_j, _new_path, dst, error, required)) { \ + return false; \ + } \ + } else if(required) { \ + buffer_sprintf(error, "missing '%s.%s' object", path, member); \ + return false; \ + } \ +} while(0) + +#endif //NETDATA_JSON_C_PARSER_INLINE_H diff --git a/src/libnetdata/json/json.c b/src/libnetdata/json/json.c new file mode 100644 index 00000000..a50f6b54 --- /dev/null +++ b/src/libnetdata/json/json.c @@ -0,0 +1,557 @@ +#include "jsmn.h" +#include "../libnetdata.h" +#include "json.h" +#include "libnetdata/libnetdata.h" +#include "health/health.h" + +#define JSON_TOKENS 1024 + +int 
json_tokens = JSON_TOKENS; + +/** + * JSON Tokenise + * + * Map the given string into tokens. + * + * @param js is the string used to create the tokens + * @param len is the string length + * @param count the number of tokens present in the string + * + * @return the JSON parsed into tokens + */ +#ifdef ENABLE_JSONC +json_object *json_tokenise(char *js) { + if(!js) { + netdata_log_error("JSON: json string is empty."); + return NULL; + } + + json_object *token = json_tokener_parse(js); + if(!token) { + netdata_log_error("JSON: Invalid json string."); + return NULL; + } + + return token; +} +#else +jsmntok_t *json_tokenise(char *js, size_t len, size_t *count) +{ + int n = json_tokens; + if(!js || !len) { + netdata_log_error("JSON: json string is empty."); + return NULL; + } + + jsmn_parser parser; + jsmn_init(&parser); + + jsmntok_t *tokens = mallocz(sizeof(jsmntok_t) * n); + if(!tokens) return NULL; + + int ret = jsmn_parse(&parser, js, len, tokens, n); + while (ret == JSMN_ERROR_NOMEM) { + n *= 2; + jsmntok_t *new = reallocz(tokens, sizeof(jsmntok_t) * n); + if(!new) { + freez(tokens); + return NULL; + } + tokens = new; + ret = jsmn_parse(&parser, js, len, tokens, n); + } + + if (ret == JSMN_ERROR_INVAL) { + netdata_log_error("JSON: Invalid json string."); + freez(tokens); + return NULL; + } + else if (ret == JSMN_ERROR_PART) { + netdata_log_error("JSON: Truncated JSON string."); + freez(tokens); + return NULL; + } + + if(count) *count = (size_t)ret; + + if(json_tokens < n) json_tokens = n; + return tokens; +} +#endif + +/** + * Callback Print + * + * Default callback: formats the JSON entry into a buffer and writes it to the log. + * + * @param e a pointer to a structure holding the complete information about the JSON entry. + * + * @return It always returns 0 + */ +int json_callback_print(JSON_ENTRY *e) +{ + BUFFER *wb=buffer_create(300, NULL); + + buffer_sprintf(wb,"%s = ", e->name); + char txt[50]; + switch(e->type) { + case JSON_OBJECT: + e->callback_function = json_callback_print; + buffer_strcat(wb,"OBJECT"); + break; + + case JSON_ARRAY: + e->callback_function = json_callback_print; + sprintf(txt,"ARRAY[%lu]", (long unsigned int) e->data.items); + buffer_strcat(wb, txt); + break; + + case JSON_STRING: + buffer_strcat(wb, e->data.string); + break; + + case JSON_NUMBER: + sprintf(txt, NETDATA_DOUBLE_FORMAT_AUTO, e->data.number); + buffer_strcat(wb,txt); + break; + + case JSON_BOOLEAN: + buffer_strcat(wb, e->data.boolean?"TRUE":"FALSE"); + break; + + case JSON_NULL: + buffer_strcat(wb,"NULL"); + break; + } + netdata_log_info("JSON: %s", buffer_tostring(wb)); + buffer_free(wb); + return 0; +} + +/** + * JSONC Set String + * + * Set the string value of the structure JSON_ENTRY. 
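 * Note (editor's): the key is copied into the entry, truncated to
 * JSON_NAME_LEN bytes, while the value pointer is stored as-is (the string
 * is not duplicated), so it is only valid while the underlying json_object lives.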
+ * + * @param e the output structure + */ +static inline void json_jsonc_set_string(JSON_ENTRY *e,char *key,const char *value) { + size_t len = strlen(key); + if(len > JSON_NAME_LEN) + len = JSON_NAME_LEN; + e->type = JSON_STRING; + memcpy(e->name,key,len); + e->name[len] = 0x00; + e->data.string = (char *) value; +} + + +#ifdef ENABLE_JSONC +/** + * JSONC set Boolean + * + * Set the boolean value of the structure JSON_ENTRY + * + * @param e the output structure + * @param value the input value + */ +static inline void json_jsonc_set_boolean(JSON_ENTRY *e,int value) { + e->type = JSON_BOOLEAN; + e->data.boolean = value; +} + +static inline void json_jsonc_set_integer(JSON_ENTRY *e, char *key, int64_t value) { + size_t len = strlen(key); + if(len > JSON_NAME_LEN) + len = JSON_NAME_LEN; + e->type = JSON_NUMBER; + memcpy(e->name, key, len); + e->name[len] = 0; + e->data.number = (NETDATA_DOUBLE)value; +} + +/** + * Parse Array + * + * Parse the array object. + * + * @param ptr the pointer for the object that we will parse. + * @param callback_data additional data to be used together the callback function + * @param callback_function function used to create a silencer. + */ +static inline void json_jsonc_parse_array(json_object *ptr, void *callback_data,int (*callback_function)(struct json_entry *)) { + int end = json_object_array_length(ptr); + JSON_ENTRY e; + + if(end) { + int i; + i = 0; + + enum json_type type; + do { + json_object *jvalue = json_object_array_get_idx(ptr, i); + if(jvalue) { + e.callback_data = callback_data; + e.type = JSON_OBJECT; + callback_function(&e); + json_object_object_foreach(jvalue, key, val) { + type = json_object_get_type(val); + if (type == json_type_array) { + e.type = JSON_ARRAY; + json_jsonc_parse_array(val, callback_data, callback_function); + } else if (type == json_type_object) { + json_walk(val,callback_data,callback_function); + } else if (type == json_type_string) { + json_jsonc_set_string(&e,key,json_object_get_string(val)); + callback_function(&e); + } else if (type == json_type_boolean) { + json_jsonc_set_boolean(&e,json_object_get_boolean(val)); + callback_function(&e); + } + } + } + + } while (++i < end); + } +} +#else + +/** + * Walk string + * + * Set JSON_ENTRY to string and map the values from jsmntok_t. + * + * @param js the original string + * @param t the tokens + * @param start the first position + * @param e the output structure. + * + * @return It always return 1 + */ +size_t json_walk_string(char *js, jsmntok_t *t, size_t start, JSON_ENTRY *e) +{ + char old = js[t[start].end]; + js[t[start].end] = '\0'; + e->original_string = &js[t[start].start]; + + e->type = JSON_STRING; + e->data.string = e->original_string; + if(e->callback_function) e->callback_function(e); + js[t[start].end] = old; + return 1; +} + +/** + * Walk Primitive + * + * Define the data type of the string + * + * @param js the original string + * @param t the tokens + * @param start the first position + * @param e the output structure. 
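 * Note (editor's): the type is inferred from the first character only:
 * digits, '-' and '.' parse as JSON_NUMBER, 't'/'T' and 'f'/'F' as
 * JSON_BOOLEAN, and anything else (e.g. 'n' for null) becomes JSON_NULL.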
+ * + * @return It always return 1 + */ +size_t json_walk_primitive(char *js, jsmntok_t *t, size_t start, JSON_ENTRY *e) +{ + char old = js[t[start].end]; + js[t[start].end] = '\0'; + e->original_string = &js[t[start].start]; + + switch(e->original_string[0]) { + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': + case '8': case '9': case '-': case '.': + e->type = JSON_NUMBER; + e->data.number = strtold(e->original_string, NULL); + break; + + case 't': case 'T': + e->type = JSON_BOOLEAN; + e->data.boolean = 1; + break; + + case 'f': case 'F': + e->type = JSON_BOOLEAN; + e->data.boolean = 0; + break; + + case 'n': case 'N': + default: + e->type = JSON_NULL; + break; + } + if(e->callback_function) e->callback_function(e); + js[t[start].end] = old; + return 1; +} + +/** + * Array + * + * Measure the array length + * + * @param js the original string + * @param t the tokens + * @param nest the length of structure t + * @param start the first position + * @param e the structure with values and callback to be used inside the function. + * + * @return It returns the array length + */ +size_t json_walk_array(char *js, jsmntok_t *t, size_t nest, size_t start, JSON_ENTRY *e) +{ + JSON_ENTRY ne; + + char old = js[t[start].end]; + js[t[start].end] = '\0'; + ne.original_string = &js[t[start].start]; + + memcpy(&ne, e, sizeof(JSON_ENTRY)); + ne.type = JSON_ARRAY; + ne.data.items = t[start].size; + ne.callback_function = e->callback_function; + ne.name[0]='\0'; + ne.fullname[0]='\0'; + if(e->callback_function) e->callback_function(&ne); + js[t[start].end] = old; + + size_t i, init = start, size = t[start].size; + + start++; + for(i = 0; i < size ; i++) { + ne.pos = i; + if (strlen(e->name) > JSON_NAME_LEN - 24 || strlen(e->fullname) > JSON_FULLNAME_LEN -24) { + netdata_log_info("JSON: JSON walk_array ignoring element with name:%s fullname:%s",e->name, e->fullname); + continue; + } + snprintfz(ne.name, JSON_NAME_LEN, "%s[%lu]", e->name, i); + snprintfz(ne.fullname, JSON_FULLNAME_LEN, "%s[%lu]", e->fullname, i); + + switch(t[start].type) { + case JSMN_PRIMITIVE: + start += json_walk_primitive(js, t, start, &ne); + break; + + case JSMN_OBJECT: + start += json_walk_object(js, t, nest + 1, start, &ne); + break; + + case JSMN_ARRAY: + start += json_walk_array(js, t, nest + 1, start, &ne); + break; + + case JSMN_STRING: + start += json_walk_string(js, t, start, &ne); + break; + } + } + return start - init; +} + +/** + * Object + * + * Measure the Object length + * + * @param js the original string + * @param t the tokens + * @param nest the length of structure t + * @param start the first position + * @param e the output structure. 
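 * Note (editor's): inside an object, string tokens alternate between member
 * keys and values; the internal `key` flag tracks which one is expected next.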
+ * + * @return It returns the Object length + */ +size_t json_walk_object(char *js, jsmntok_t *t, size_t nest, size_t start, JSON_ENTRY *e) +{ + JSON_ENTRY ne = { + .name = "", + .fullname = "", + .callback_data = NULL, + .callback_function = NULL + }; + + char old = js[t[start].end]; + js[t[start].end] = '\0'; + ne.original_string = &js[t[start].start]; + memcpy(&ne, e, sizeof(JSON_ENTRY)); + ne.type = JSON_OBJECT; + ne.callback_function = e->callback_function; + if(e->callback_function) e->callback_function(&ne); + js[t[start].end] = old; + + int key = 1; + size_t i, init = start, size = t[start].size; + + start++; + for(i = 0; i < size ; i++) { + switch(t[start].type) { + case JSMN_PRIMITIVE: + start += json_walk_primitive(js, t, start, &ne); + key = 1; + break; + + case JSMN_OBJECT: + start += json_walk_object(js, t, nest + 1, start, &ne); + key = 1; + break; + + case JSMN_ARRAY: + start += json_walk_array(js, t, nest + 1, start, &ne); + key = 1; + break; + + case JSMN_STRING: + default: + if(key) { + int len = t[start].end - t[start].start; + if (unlikely(len>JSON_NAME_LEN)) len=JSON_NAME_LEN; + strncpy(ne.name, &js[t[start].start], len); + ne.name[len] = '\0'; + len=strlen(e->fullname) + strlen(e->fullname[0]?".":"") + strlen(ne.name); + char *c = mallocz((len+1)*sizeof(char)); + sprintf(c,"%s%s%s", e->fullname, e->fullname[0]?".":"", ne.name); + if (unlikely(len>JSON_FULLNAME_LEN)) len=JSON_FULLNAME_LEN; + strncpy(ne.fullname, c, len); + freez(c); + start++; + key = 0; + } + else { + start += json_walk_string(js, t, start, &ne); + key = 1; + } + break; + } + } + return start - init; +} +#endif + +/** + * Tree + * + * Call the correct walk function according its type. + * + * @param t the json object to work + * @param callback_data additional data to be used together the callback function + * @param callback_function function used to create a silencer. + * + * @return It always return 1 + */ +#ifdef ENABLE_JSONC +size_t json_walk(json_object *t, void *callback_data, int (*callback_function)(struct json_entry *)) { + JSON_ENTRY e; + + e.callback_data = callback_data; + enum json_type type; + json_object_object_foreach(t, key, val) { + type = json_object_get_type(val); + if (type == json_type_array) { + e.type = JSON_ARRAY; + json_jsonc_parse_array(val,NULL,health_silencers_json_read_callback); + } else if (type == json_type_object) { + e.type = JSON_OBJECT; + } else if (type == json_type_string) { + json_jsonc_set_string(&e,key,json_object_get_string(val)); + callback_function(&e); + } else if (type == json_type_boolean) { + json_jsonc_set_boolean(&e,json_object_get_boolean(val)); + callback_function(&e); + } else if (type == json_type_int) { + json_jsonc_set_integer(&e,key,json_object_get_int64(val)); + callback_function(&e); + } + } + + return 1; +} +#else +/** + * Tree + * + * Call the correct walk function according its type. + * + * @param js the original string + * @param t the tokens + * @param callback_data additional data to be used together the callback function + * @param callback_function function used to create a silencer. 
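 * Note (editor's): only the type of the root token is examined here; a root
 * that is a bare primitive or string is ignored rather than walked.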
+ * + * @return It always return 1 + */ +size_t json_walk_tree(char *js, jsmntok_t *t, void *callback_data, int (*callback_function)(struct json_entry *)) +{ + JSON_ENTRY e = { + .name = "", + .fullname = "", + .callback_data = callback_data, + .callback_function = callback_function + }; + + switch (t[0].type) { + case JSMN_OBJECT: + e.type = JSON_OBJECT; + json_walk_object(js, t, 0, 0, &e); + break; + + case JSMN_ARRAY: + e.type = JSON_ARRAY; + json_walk_array(js, t, 0, 0, &e); + break; + + case JSMN_PRIMITIVE: + case JSMN_STRING: + break; + } + + return 1; +} +#endif + +/** + * JSON Parse + * + * Parse the json message with the callback function + * + * @param js the string that the callback function will parse + * @param callback_data additional data to be used together the callback function + * @param callback_function function used to create a silencer. + * + * @return JSON_OK case everything happened as expected, JSON_CANNOT_PARSE case there were errors in the + * parsing process and JSON_CANNOT_DOWNLOAD case the string given(js) is NULL. + */ +int json_parse(char *js, void *callback_data, int (*callback_function)(JSON_ENTRY *)) +{ + if(js) { +#ifdef ENABLE_JSONC + json_object *tokens = json_tokenise(js); +#else + size_t count; + jsmntok_t *tokens = json_tokenise(js, strlen(js), &count); +#endif + + if(tokens) { +#ifdef ENABLE_JSONC + json_walk(tokens, callback_data, callback_function); + json_object_put(tokens); +#else + json_walk_tree(js, tokens, callback_data, callback_function); + freez(tokens); +#endif + return JSON_OK; + } + + return JSON_CANNOT_PARSE; + } + + return JSON_CANNOT_DOWNLOAD; +} + +/* +int json_test(char *str) +{ + return json_parse(str, NULL, json_callback_print); +} + */ + diff --git a/src/libnetdata/json/json.h b/src/libnetdata/json/json.h new file mode 100644 index 00000000..5c3459ed --- /dev/null +++ b/src/libnetdata/json/json.h @@ -0,0 +1,80 @@ +#ifndef CHECKIN_JSON_H +#define CHECKIN_JSON_H 1 + +#if ENABLE_JSONC +#include <json-c/json.h> +// fix an older json-c bug +// https://github.com/json-c/json-c/issues/135 +#ifdef error_description +#undef error_description +#endif // error_description +#endif // ENABLE_JSONC + +#include "jsmn.h" + +//https://www.ibm.com/support/knowledgecenter/en/SS9H2Y_7.6.0/com.ibm.dp.doc/json_parserlimits.html +#define JSON_NAME_LEN 256 +#define JSON_FULLNAME_LEN 1024 + +typedef enum { + JSON_OBJECT = 0, + JSON_ARRAY = 1, + JSON_STRING = 2, + JSON_NUMBER = 3, + JSON_BOOLEAN = 4, + JSON_NULL = 5, +} JSON_ENTRY_TYPE; + +typedef struct json_entry { + JSON_ENTRY_TYPE type; + char name[JSON_NAME_LEN + 1]; + char fullname[JSON_FULLNAME_LEN + 1]; + union { + char *string; // type == JSON_STRING + NETDATA_DOUBLE number; // type == JSON_NUMBER + int boolean; // type == JSON_BOOLEAN + size_t items; // type == JSON_ARRAY + } data; + size_t pos; // the position of this item in its parent + + char *original_string; + + void *callback_data; + int (*callback_function)(struct json_entry *); +} JSON_ENTRY; + +// ---------------------------------------------------------------------------- +// public functions + +#define JSON_OK 0 +#define JSON_CANNOT_DOWNLOAD 1 +#define JSON_CANNOT_PARSE 2 + +int json_parse(char *js, void *callback_data, int (*callback_function)(JSON_ENTRY *)); + + +// ---------------------------------------------------------------------------- +// private functions + +#ifdef ENABLE_JSONC +json_object *json_tokenise(char *js); +size_t json_walk(json_object *t, void *callback_data, int (*callback_function)(struct json_entry *)); 
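/*
 * Editor's sketch (not part of the original header): minimal use of the
 * public json_parse() API declared above. The callback receives one
 * JSON_ENTRY per member; the names below (print_strings, buf) are
 * illustrative, and <stdio.h> is assumed:
 *
 *     static int print_strings(JSON_ENTRY *e) {
 *         if(e->type == JSON_STRING)
 *             printf("%s = %s\n", e->name, e->data.string);
 *         return 0;
 *     }
 *
 *     char buf[] = "{\"status\":\"ok\",\"code\":200}";
 *     if(json_parse(buf, NULL, print_strings) != JSON_OK)
 *         fprintf(stderr, "JSON: cannot parse\n");
 *
 * Pass a writable buffer: the jsmn code path temporarily NUL-terminates
 * tokens in place while walking them.
 */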
+#else +jsmntok_t *json_tokenise(char *js, size_t len, size_t *count); +size_t json_walk_tree(char *js, jsmntok_t *t, void *callback_data, int (*callback_function)(struct json_entry *)); +#endif + +size_t json_walk_object(char *js, jsmntok_t *t, size_t nest, size_t start, JSON_ENTRY *e); +size_t json_walk_array(char *js, jsmntok_t *t, size_t nest, size_t start, JSON_ENTRY *e); +size_t json_walk_string(char *js, jsmntok_t *t, size_t start, JSON_ENTRY *e); +size_t json_walk_primitive(char *js, jsmntok_t *t, size_t start, JSON_ENTRY *e); + +int json_callback_print(JSON_ENTRY *e); + +static inline void cleanup_json_object_pp(struct json_object **jobj) { + if(*jobj) + json_object_put(*jobj); +} +#define CLEAN_JSON_OBJECT _cleanup_(cleanup_json_object_pp) struct json_object + +#endif // CHECKIN_JSON_H diff --git a/src/libnetdata/july/README.md b/src/libnetdata/july/README.md new file mode 100644 index 00000000..72c862aa --- /dev/null +++ b/src/libnetdata/july/README.md @@ -0,0 +1,14 @@ +<!-- +custom_edit_url: https://github.com/netdata/netdata/edit/master/src/libnetdata/july/README.md +sidebar_label: "July interface" +learn_status: "Published" +learn_topic_type: "Tasks" +learn_rel_path: "Developers/libnetdata" +--> + + +# July + +An interface similar to `Judy` that uses minimal allocations (that can be cached) +for items that are mainly appended (just a few insertions in the middle) + diff --git a/src/libnetdata/july/july.c b/src/libnetdata/july/july.c new file mode 100644 index 00000000..56b8494b --- /dev/null +++ b/src/libnetdata/july/july.c @@ -0,0 +1,453 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "july.h" + +#define JULYL_MIN_ENTRIES 10 + +struct JulyL_item { + Word_t index; + void *value; +}; + +struct JulyL { + size_t entries; + size_t used; + + // statistics + size_t bytes; + size_t bytes_moved; + size_t reallocs; + + struct { + struct JulyL *prev; + struct JulyL *next; + } cache; + + struct JulyL_item array[]; +}; + +// ---------------------------------------------------------------------------- +// JulyL cache + +static struct { + struct { + SPINLOCK spinlock; + struct JulyL *available_items; + size_t available; + } protected; + + struct { + size_t bytes; + size_t allocated; + size_t bytes_moved; + size_t reallocs; + } atomics; +} julyl_globals = { + .protected = { + .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .available_items = NULL, + .available = 0, + }, + .atomics = { + .bytes = 0, + .allocated = 0, + .bytes_moved = 0, + .reallocs = 0, + }, +}; + +void julyl_cleanup1(void) { + struct JulyL *item = NULL; + + if(!spinlock_trylock(&julyl_globals.protected.spinlock)) + return; + + if(julyl_globals.protected.available_items && julyl_globals.protected.available > 10) { + item = julyl_globals.protected.available_items; + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(julyl_globals.protected.available_items, item, cache.prev, cache.next); + julyl_globals.protected.available--; + } + + spinlock_unlock(&julyl_globals.protected.spinlock); + + if(item) { + size_t bytes = item->bytes; + freez(item); + __atomic_sub_fetch(&julyl_globals.atomics.bytes, bytes, __ATOMIC_RELAXED); + __atomic_sub_fetch(&julyl_globals.atomics.allocated, 1, __ATOMIC_RELAXED); + } +} + +struct JulyL *julyl_get(void) { + struct JulyL *j; + + spinlock_lock(&julyl_globals.protected.spinlock); + + j = julyl_globals.protected.available_items; + if(likely(j)) { + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(julyl_globals.protected.available_items, j, cache.prev, cache.next); + julyl_globals.protected.available--; + } + + 
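    // editor's note: the lock is released before any allocation; when the
    // cache was empty, j is NULL and a fresh JulyL is allocated below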
spinlock_unlock(&julyl_globals.protected.spinlock); + + if(unlikely(!j)) { + size_t bytes = sizeof(struct JulyL) + JULYL_MIN_ENTRIES * sizeof(struct JulyL_item); + j = mallocz(bytes); + j->bytes = bytes; + j->entries = JULYL_MIN_ENTRIES; + __atomic_add_fetch(&julyl_globals.atomics.bytes, bytes, __ATOMIC_RELAXED); + __atomic_add_fetch(&julyl_globals.atomics.allocated, 1, __ATOMIC_RELAXED); + } + + j->used = 0; + j->bytes_moved = 0; + j->reallocs = 0; + j->cache.next = j->cache.prev = NULL; + return j; +} + +static void julyl_release(struct JulyL *j) { + if(unlikely(!j)) return; + + __atomic_add_fetch(&julyl_globals.atomics.bytes_moved, j->bytes_moved, __ATOMIC_RELAXED); + __atomic_add_fetch(&julyl_globals.atomics.reallocs, j->reallocs, __ATOMIC_RELAXED); + + spinlock_lock(&julyl_globals.protected.spinlock); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(julyl_globals.protected.available_items, j, cache.prev, cache.next); + julyl_globals.protected.available++; + spinlock_unlock(&julyl_globals.protected.spinlock); +} + +size_t julyl_cache_size(void) { + return __atomic_load_n(&julyl_globals.atomics.bytes, __ATOMIC_RELAXED); +} + +size_t julyl_bytes_moved(void) { + return __atomic_load_n(&julyl_globals.atomics.bytes_moved, __ATOMIC_RELAXED); +} + +// ---------------------------------------------------------------------------- +// JulyL + +size_t JulyLGet_binary_search_position_of_index(const struct JulyL *July, Word_t Index) { + // return the position of the first item >= Index + + size_t left = 0; + size_t right = July->used; + while(left < right) { + size_t middle = (left + right) >> 1; + + if(July->array[middle].index > Index) + right = middle; + + else + left = middle + 1; + } + + internal_fatal(left > July->used, "JULY: invalid position returned"); + + if(left > 0 && July->array[left - 1].index == Index) + return left - 1; + + internal_fatal( (left < July->used && July->array[left].index < Index) || + (left > 0 && July->array[left - 1].index >= Index) + , "JULY: wrong item returned"); + + return left; +} + +PPvoid_t JulyLGet(Pcvoid_t PArray, Word_t Index, PJError_t PJError __maybe_unused) { + const struct JulyL *July = PArray; + if(!July) + return NULL; + + size_t pos = JulyLGet_binary_search_position_of_index(July, Index); + + if(unlikely(pos >= July->used || July->array[pos].index != Index)) + return NULL; + + return (PPvoid_t)&July->array[pos].value; +} + +PPvoid_t JulyLIns(PPvoid_t PPArray, Word_t Index, PJError_t PJError __maybe_unused) { + struct JulyL *July = *PPArray; + if(unlikely(!July)) { + July = julyl_get(); + July->used = 0; + *PPArray = July; + } + + size_t pos = JulyLGet_binary_search_position_of_index(July, Index); + + if((pos == July->used || July->array[pos].index != Index)) { + // we have to add this entry + + if (unlikely(July->used == July->entries)) { + // we have to expand the array + size_t bytes = sizeof(struct JulyL) + July->entries * 2 * sizeof(struct JulyL_item); + __atomic_add_fetch(&julyl_globals.atomics.bytes, bytes - July->bytes, __ATOMIC_RELAXED); + July = reallocz(July, bytes); + July->bytes = bytes; + July->entries *= 2; + July->reallocs++; + *PPArray = July; + } + + if (unlikely(pos != July->used)) { + // we have to shift some members to make room + size_t size = (July->used - pos) * sizeof(struct JulyL_item); + memmove(&July->array[pos + 1], &July->array[pos], size); + July->bytes_moved += size; + } + + July->used++; + July->array[pos].value = NULL; + July->array[pos].index = Index; + } + + return &July->array[pos].value; +} + +PPvoid_t JulyLFirst(Pcvoid_t 
PArray, Word_t *Index, PJError_t PJError __maybe_unused) { + const struct JulyL *July = PArray; + if(!July) + return NULL; + + size_t pos = JulyLGet_binary_search_position_of_index(July, *Index); + // pos is >= Index + + if(unlikely(pos == July->used)) + return NULL; + + *Index = July->array[pos].index; + return (PPvoid_t)&July->array[pos].value; +} + +PPvoid_t JulyLNext(Pcvoid_t PArray, Word_t *Index, PJError_t PJError __maybe_unused) { + const struct JulyL *July = PArray; + if(!July) + return NULL; + + size_t pos = JulyLGet_binary_search_position_of_index(July, *Index); + // pos is >= Index + + if(unlikely(pos == July->used)) + return NULL; + + if(July->array[pos].index == *Index) { + pos++; + + if(unlikely(pos == July->used)) + return NULL; + } + + *Index = July->array[pos].index; + return (PPvoid_t)&July->array[pos].value; +} + +PPvoid_t JulyLLast(Pcvoid_t PArray, Word_t *Index, PJError_t PJError __maybe_unused) { + const struct JulyL *July = PArray; + if(!July) + return NULL; + + size_t pos = JulyLGet_binary_search_position_of_index(July, *Index); + // pos is >= Index + + if(pos > 0 && (pos == July->used || July->array[pos].index > *Index)) + pos--; + + if(unlikely(pos == 0 && July->array[0].index > *Index)) + return NULL; + + *Index = July->array[pos].index; + return (PPvoid_t)&July->array[pos].value; +} + +PPvoid_t JulyLPrev(Pcvoid_t PArray, Word_t *Index, PJError_t PJError __maybe_unused) { + const struct JulyL *July = PArray; + if(!July) + return NULL; + + size_t pos = JulyLGet_binary_search_position_of_index(July, *Index); + // pos is >= Index + + if(unlikely(pos == 0 || July->used == 0)) + return NULL; + + // get the previous one + pos--; + + *Index = July->array[pos].index; + return (PPvoid_t)&July->array[pos].value; +} + +Word_t JulyLFreeArray(PPvoid_t PPArray, PJError_t PJError __maybe_unused) { + struct JulyL *July = *PPArray; + if(unlikely(!July)) + return 0; + + size_t bytes = July->bytes; + julyl_release(July); + *PPArray = NULL; + return bytes; +} + +// ---------------------------------------------------------------------------- +// unittest + +#define item_index(i) (((i) * 2) + 100) + +int julytest(void) { + Word_t entries = 10000; + Pvoid_t array = NULL; + + // test additions + for(Word_t i = 0; i < entries ;i++) { + Pvoid_t *PValue = JulyLIns(&array, item_index(i), PJE0); + if(!PValue) + fatal("JULY: cannot insert item %lu", item_index(i)); + + *PValue = (void *)(item_index(i)); + } + + // test successful finds + for(Word_t i = 0; i < entries ;i++) { + Pvoid_t *PValue = JulyLGet(array, item_index(i), PJE0); + if(!PValue) + fatal("JULY: cannot find item %lu", item_index(i)); + + if(*PValue != (void *)(item_index(i))) + fatal("JULY: item %lu has the value %lu", item_index(i), (unsigned long)(*PValue)); + } + + // test finding the first item + for(Word_t i = 0; i < entries ;i++) { + Word_t index = item_index(i); + Pvoid_t *PValue = JulyLFirst(array, &index, PJE0); + if(!PValue) + fatal("JULY: cannot find first item %lu", item_index(i)); + + if(*PValue != (void *)(item_index(i))) + fatal("JULY: item %lu has the value %lu", item_index(i), (unsigned long)(*PValue)); + + if(index != item_index(i)) + fatal("JULY: item %lu has index %lu", item_index(i), index); + } + + // test finding the next item + for(Word_t i = 0; i < entries - 1 ;i++) { + Word_t index = item_index(i); + Pvoid_t *PValue = JulyLNext(array, &index, PJE0); + if(!PValue) + fatal("JULY: cannot find next item %lu", item_index(i)); + + if(*PValue != (void *)(item_index(i + 1))) + fatal("JULY: item %lu next has 
the value %lu", item_index(i), (unsigned long)(*PValue)); + + if(index != item_index(i + 1)) + fatal("JULY: item %lu next has index %lu", item_index(i), index); + } + + // test finding the last item + for(Word_t i = 0; i < entries ;i++) { + Word_t index = item_index(i); + Pvoid_t *PValue = JulyLLast(array, &index, PJE0); + if(!PValue) + fatal("JULY: cannot find last item %lu", item_index(i)); + + if(*PValue != (void *)(item_index(i))) + fatal("JULY: item %lu has the value %lu", item_index(i), (unsigned long)(*PValue)); + + if(index != item_index(i)) + fatal("JULY: item %lu has index %lu", item_index(i), index); + } + + // test finding the prev item + for(Word_t i = 1; i < entries ;i++) { + Word_t index = item_index(i); + Pvoid_t *PValue = JulyLPrev(array, &index, PJE0); + if(!PValue) + fatal("JULY: cannot find prev item %lu", item_index(i)); + + if(*PValue != (void *)(item_index(i - 1))) + fatal("JULY: item %lu prev has the value %lu", item_index(i), (unsigned long)(*PValue)); + + if(index != item_index(i - 1)) + fatal("JULY: item %lu prev has index %lu", item_index(i), index); + } + + // test full traversal forward + { + Word_t i = 0; + Word_t index = 0; + bool first = true; + Pvoid_t *PValue; + while((PValue = JulyLFirstThenNext(array, &index, &first))) { + if(*PValue != (void *)(item_index(i))) + fatal("JULY: item %lu traversal has the value %lu", item_index(i), (unsigned long)(*PValue)); + + if(index != item_index(i)) + fatal("JULY: item %lu traversal has index %lu", item_index(i), index); + + i++; + } + + if(i != entries) + fatal("JULY: expected to forward traverse %lu entries, but traversed %lu", entries, i); + } + + // test full traversal backward + { + Word_t i = 0; + Word_t index = (Word_t)(-1); + bool first = true; + Pvoid_t *PValue; + while((PValue = JulyLLastThenPrev(array, &index, &first))) { + if(*PValue != (void *)(item_index(entries - i - 1))) + fatal("JULY: item %lu traversal has the value %lu", item_index(i), (unsigned long)(*PValue)); + + if(index != item_index(entries - i - 1)) + fatal("JULY: item %lu traversal has index %lu", item_index(i), index); + + i++; + } + + if(i != entries) + fatal("JULY: expected to back traverse %lu entries, but traversed %lu", entries, i); + } + + // test finding non-existing first item + for(Word_t i = 0; i < entries ;i++) { + Word_t index = item_index(i) - 1; + Pvoid_t *PValue = JulyLFirst(array, &index, PJE0); + if(!PValue) + fatal("JULY: cannot find first item %lu", item_index(i) - 1); + + if(*PValue != (void *)(item_index(i))) + fatal("JULY: item %lu has the value %lu", item_index(i), (unsigned long)(*PValue)); + + if(index != item_index(i)) + fatal("JULY: item %lu has index %lu", item_index(i), index); + } + + // test finding non-existing last item + for(Word_t i = 0; i < entries ;i++) { + Word_t index = item_index(i) + 1; + Pvoid_t *PValue = JulyLLast(array, &index, PJE0); + if(!PValue) + fatal("JULY: cannot find last item %lu", item_index(i) + 1); + + if(*PValue != (void *)(item_index(i))) + fatal("JULY: item %lu has the value %lu", item_index(i), (unsigned long)(*PValue)); + + if(index != item_index(i)) + fatal("JULY: item %lu has index %lu", item_index(i), index); + } + + JulyLFreeArray(&array, PJE0); + + return 0; +} + + diff --git a/src/libnetdata/july/july.h b/src/libnetdata/july/july.h new file mode 100644 index 00000000..672ed44e --- /dev/null +++ b/src/libnetdata/july/july.h @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_JULY_H +#define NETDATA_JULY_H 1 + +#include "../libnetdata.h" + +// 
#define PDC_USE_JULYL 1 + +PPvoid_t JulyLGet(Pcvoid_t PArray, Word_t Index, PJError_t PJError); +PPvoid_t JulyLIns(PPvoid_t PPArray, Word_t Index, PJError_t PJError); +PPvoid_t JulyLFirst(Pcvoid_t PArray, Word_t *Index, PJError_t PJError); +PPvoid_t JulyLNext(Pcvoid_t PArray, Word_t *Index, PJError_t PJError); +PPvoid_t JulyLLast(Pcvoid_t PArray, Word_t *Index, PJError_t PJError); +PPvoid_t JulyLPrev(Pcvoid_t PArray, Word_t *Index, PJError_t PJError); +Word_t JulyLFreeArray(PPvoid_t PPArray, PJError_t PJError); + +static inline PPvoid_t JulyLFirstThenNext(Pcvoid_t PArray, Word_t * PIndex, bool *first) { + if(unlikely(*first)) { + *first = false; + return JulyLFirst(PArray, PIndex, PJE0); + } + + return JulyLNext(PArray, PIndex, PJE0); +} + +static inline PPvoid_t JulyLLastThenPrev(Pcvoid_t PArray, Word_t * PIndex, bool *first) { + if(unlikely(*first)) { + *first = false; + return JulyLLast(PArray, PIndex, PJE0); + } + + return JulyLPrev(PArray, PIndex, PJE0); +} + +void julyl_cleanup1(void); +size_t julyl_cache_size(void); +size_t julyl_bytes_moved(void); + +#endif // NETDATA_JULY_H diff --git a/src/libnetdata/libjudy/src/Judy.h b/src/libnetdata/libjudy/src/Judy.h new file mode 100644 index 00000000..adfb5b53 --- /dev/null +++ b/src/libnetdata/libjudy/src/Judy.h @@ -0,0 +1,622 @@ +#ifndef _JUDY_INCLUDED +#define _JUDY_INCLUDED +// _________________ +// +// Copyright (C) 2000 - 2002 Hewlett-Packard Company +// +// This program is free software; you can redistribute it and/or modify it +// under the term of the GNU Lesser General Public License as published by the +// Free Software Foundation; either version 2 of the License, or (at your +// option) any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License +// for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this program; if not, write to the Free Software Foundation, +// Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// _________________ + +// @(#) $Revision: 4.52 $ $Source: /judy/src/Judy.h $ +// +// HEADER FILE FOR EXPORTED FEATURES IN JUDY LIBRARY, libJudy.* +// +// See the manual entries for details. +// +// Note: This header file uses old-style comments on #-directive lines and +// avoids "()" on macro names in comments for compatibility with older cc -Aa +// and some tools on some platforms. + + +// PLATFORM-SPECIFIC + +#ifdef JU_WIN /* =============================================== */ + +typedef __int8 int8_t; +typedef __int16 int16_t; +typedef __int32 int32_t; +typedef __int64 int64_t; + +typedef unsigned __int8 uint8_t; +typedef unsigned __int16 uint16_t; +typedef unsigned __int32 uint32_t; +typedef unsigned __int64 uint64_t; + +#else /* ================ ! JU_WIN ============================= */ + +// ISO C99: 7.8 Format conversion of integer types <inttypes.h> +#include <inttypes.h> /* if this FAILS, try #include <stdint.h> */ + +// ISO C99: 7.18 Integer types uint*_t +//#include <stdint.h> + +#endif /* ================ ! 
JU_WIN ============================= */ + +// ISO C99 Standard: 7.20 General utilities +#include <stdlib.h> + +// ISO C99 Standard: 7.10/5.2.4.2.1 Sizes of integer types +#include <limits.h> + +#ifdef __cplusplus /* support use by C++ code */ +extern "C" { +#endif + + +// **************************************************************************** +// DECLARE SOME BASE TYPES IN CASE THEY ARE MISSING: +// +// These base types include "const" where appropriate, but only where of +// interest to the caller. For example, a caller cares that a variable passed +// by reference will not be modified, such as, "const void * Pindex", but not +// that the called function internally does not modify the pointer itself, such +// as, "void * const Pindex". +// +// Note that its OK to pass a Pvoid_t to a Pcvoid_t; the latter is the same, +// only constant. Callers need to do this so they can also pass & Pvoid_t to +// PPvoid_t (non-constant). + +#ifndef _PCVOID_T +#define _PCVOID_T +typedef const void * Pcvoid_t; +#endif + +#ifndef _PVOID_T +#define _PVOID_T +typedef void * Pvoid_t; +typedef void ** PPvoid_t; +#endif + +#ifndef _WORD_T +#define _WORD_T +typedef unsigned long Word_t, * PWord_t; // expect 32-bit or 64-bit words. +#endif + +#ifndef NULL +#define NULL 0 +#endif + + +// **************************************************************************** +// SUPPORT FOR ERROR HANDLING: +// +// Judy error numbers: +// +// Note: These are an enum so theres a related typedef, but the numbers are +// spelled out so you can map a number back to its name. + +typedef enum // uint8_t -- but C does not support this type of enum. +{ + +// Note: JU_ERRNO_NONE and JU_ERRNO_FULL are not real errors. They specify +// conditions which are otherwise impossible return values from 32-bit +// Judy1Count, which has 2^32 + 1 valid returns (0..2^32) plus one error +// return. These pseudo-errors support the return values that cannot otherwise +// be unambiguously represented in a 32-bit word, and will never occur on a +// 64-bit system. + + JU_ERRNO_NONE = 0, + JU_ERRNO_FULL = 1, + JU_ERRNO_NFMAX = JU_ERRNO_FULL, + +// JU_ERRNO_NOMEM comes from malloc(3C) when Judy cannot obtain needed memory. +// The system errno value is also set to ENOMEM. This error can be recoverable +// if the calling application frees other memory. +// +// TBD: Currently there is no guarantee the Judy array has no memory leaks +// upon JU_ERRNO_NOMEM. + + JU_ERRNO_NOMEM = 2, + +// Problems with parameters from the calling program: +// +// JU_ERRNO_NULLPPARRAY means PPArray was null; perhaps PArray was passed where +// &PArray was intended. Similarly, JU_ERRNO_NULLPINDEX means PIndex was null; +// perhaps &Index was intended. Also, JU_ERRNO_NONNULLPARRAY, +// JU_ERRNO_NULLPVALUE, and JU_ERRNO_UNSORTED, all added later (hence with +// higher numbers), mean: A non-null array was passed in where a null pointer +// was required; PValue was null; and unsorted indexes were detected. + + JU_ERRNO_NULLPPARRAY = 3, // see above. + JU_ERRNO_NONNULLPARRAY = 10, // see above. + JU_ERRNO_NULLPINDEX = 4, // see above. + JU_ERRNO_NULLPVALUE = 11, // see above. + JU_ERRNO_NOTJUDY1 = 5, // PArray is not to a Judy1 array. + JU_ERRNO_NOTJUDYL = 6, // PArray is not to a JudyL array. + JU_ERRNO_NOTJUDYSL = 7, // PArray is not to a JudySL array. + JU_ERRNO_UNSORTED = 12, // see above. 
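// (Editor's note: the parameter errors above are generally recoverable --
// the caller can correct the arguments and retry.)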
+ +// Errors below this point are not recoverable; further tries to access the +// Judy array might result in EFAULT and a core dump: +// +// JU_ERRNO_OVERRUN occurs when Judy detects, upon reallocation, that a block +// of memory in its own freelist was modified since being freed. + + JU_ERRNO_OVERRUN = 8, + +// JU_ERRNO_CORRUPT occurs when Judy detects an impossible value in a Judy data +// structure: +// +// Note: The Judy data structure contains some redundant elements that support +// this type of checking. + + JU_ERRNO_CORRUPT = 9 + +// Warning: At least some C or C++ compilers do not tolerate a trailing comma +// above here. At least we know of one case, in aCC; see JAGad58928. + +} JU_Errno_t; + + +// Judy errno structure: +// +// WARNING: For compatibility with possible future changes, the fields of this +// struct should not be referenced directly. Instead use the macros supplied +// below. + +// This structure should be declared on the stack in a threaded process. + +typedef struct J_UDY_ERROR_STRUCT +{ + JU_Errno_t je_Errno; // one of the enums above. + int je_ErrID; // often an internal source line number. + Word_t je_reserved[4]; // for future backward compatibility. + +} JError_t, * PJError_t; + + +// Related macros: +// +// Fields from error struct: + +#define JU_ERRNO(PJError) ((PJError)->je_Errno) +#define JU_ERRID(PJError) ((PJError)->je_ErrID) + +// For checking return values from various Judy functions: +// +// Note: Define JERR as -1, not as the seemingly more portable (Word_t) +// (~0UL), to avoid a compiler "overflow in implicit constant conversion" +// warning. + +#define JERR (-1) /* functions returning int or Word_t */ +#define PJERR ((Pvoid_t) (~0UL)) /* mainly for use here, see below */ +#define PPJERR ((PPvoid_t) (~0UL)) /* functions that return PPvoid_t */ + +// Convenience macro for when detailed error information (PJError_t) is not +// desired by the caller; a purposely short name: + +#define PJE0 ((PJError_t) NULL) + + +// **************************************************************************** +// JUDY FUNCTIONS: +// +// P_JE is a shorthand for use below: + +#define P_JE PJError_t PJError + +// **************************************************************************** +// JUDY1 FUNCTIONS: + +extern int Judy1Test( Pcvoid_t PArray, Word_t Index, P_JE); +extern int Judy1Set( PPvoid_t PPArray, Word_t Index, P_JE); +extern int Judy1SetArray( PPvoid_t PPArray, Word_t Count, + const Word_t * const PIndex, + P_JE); +extern int Judy1Unset( PPvoid_t PPArray, Word_t Index, P_JE); +extern Word_t Judy1Count( Pcvoid_t PArray, Word_t Index1, + Word_t Index2, P_JE); +extern int Judy1ByCount( Pcvoid_t PArray, Word_t Count, + Word_t * PIndex, P_JE); +extern Word_t Judy1FreeArray( PPvoid_t PPArray, P_JE); +extern Word_t Judy1MemUsed( Pcvoid_t PArray); +extern Word_t Judy1MemActive( Pcvoid_t PArray); +extern int Judy1First( Pcvoid_t PArray, Word_t * PIndex, P_JE); +extern int Judy1Next( Pcvoid_t PArray, Word_t * PIndex, P_JE); +extern int Judy1Last( Pcvoid_t PArray, Word_t * PIndex, P_JE); +extern int Judy1Prev( Pcvoid_t PArray, Word_t * PIndex, P_JE); +extern int Judy1FirstEmpty( Pcvoid_t PArray, Word_t * PIndex, P_JE); +extern int Judy1NextEmpty( Pcvoid_t PArray, Word_t * PIndex, P_JE); +extern int Judy1LastEmpty( Pcvoid_t PArray, Word_t * PIndex, P_JE); +extern int Judy1PrevEmpty( Pcvoid_t PArray, Word_t * PIndex, P_JE); + +extern PPvoid_t JudyLGet( Pcvoid_t PArray, Word_t Index, P_JE); +extern PPvoid_t JudyLIns( PPvoid_t PPArray, Word_t Index, P_JE); +extern int 
JudyLInsArray( PPvoid_t PPArray, Word_t Count, + const Word_t * const PIndex, + const Word_t * const PValue, + +// **************************************************************************** +// JUDYL FUNCTIONS: + P_JE); +extern int JudyLDel( PPvoid_t PPArray, Word_t Index, P_JE); +extern Word_t JudyLCount( Pcvoid_t PArray, Word_t Index1, + Word_t Index2, P_JE); +extern PPvoid_t JudyLByCount( Pcvoid_t PArray, Word_t Count, + Word_t * PIndex, P_JE); +extern Word_t JudyLFreeArray( PPvoid_t PPArray, P_JE); +extern Word_t JudyLMemUsed( Pcvoid_t PArray); +extern Word_t JudyLMemActive( Pcvoid_t PArray); +extern PPvoid_t JudyLFirst( Pcvoid_t PArray, Word_t * PIndex, P_JE); +extern PPvoid_t JudyLNext( Pcvoid_t PArray, Word_t * PIndex, P_JE); +extern PPvoid_t JudyLLast( Pcvoid_t PArray, Word_t * PIndex, P_JE); +extern PPvoid_t JudyLPrev( Pcvoid_t PArray, Word_t * PIndex, P_JE); +extern int JudyLFirstEmpty( Pcvoid_t PArray, Word_t * PIndex, P_JE); +extern int JudyLNextEmpty( Pcvoid_t PArray, Word_t * PIndex, P_JE); +extern int JudyLLastEmpty( Pcvoid_t PArray, Word_t * PIndex, P_JE); +extern int JudyLPrevEmpty( Pcvoid_t PArray, Word_t * PIndex, P_JE); + +// **************************************************************************** +// JUDYSL FUNCTIONS: + +extern PPvoid_t JudySLGet( Pcvoid_t, const uint8_t * Index, P_JE); +extern PPvoid_t JudySLIns( PPvoid_t, const uint8_t * Index, P_JE); +extern int JudySLDel( PPvoid_t, const uint8_t * Index, P_JE); +extern Word_t JudySLFreeArray( PPvoid_t, P_JE); +extern PPvoid_t JudySLFirst( Pcvoid_t, uint8_t * Index, P_JE); +extern PPvoid_t JudySLNext( Pcvoid_t, uint8_t * Index, P_JE); +extern PPvoid_t JudySLLast( Pcvoid_t, uint8_t * Index, P_JE); +extern PPvoid_t JudySLPrev( Pcvoid_t, uint8_t * Index, P_JE); + +// **************************************************************************** +// JUDYHSL FUNCTIONS: + +extern PPvoid_t JudyHSGet( Pcvoid_t, void *, Word_t); +extern PPvoid_t JudyHSIns( PPvoid_t, void *, Word_t, P_JE); +extern int JudyHSDel( PPvoid_t, void *, Word_t, P_JE); +extern Word_t JudyHSFreeArray( PPvoid_t, P_JE); + +extern const char *Judy1MallocSizes; +extern const char *JudyLMallocSizes; + +// **************************************************************************** +// JUDY memory interface to malloc() FUNCTIONS: + +extern Word_t JudyMalloc(Word_t); // words reqd => words allocd. +extern Word_t JudyMallocVirtual(Word_t); // words reqd => words allocd. +extern void JudyFree(Pvoid_t, Word_t); // free, size in words. +extern void JudyFreeVirtual(Pvoid_t, Word_t); // free, size in words. + +#define JLAP_INVALID 0x1 /* flag to mark pointer "not a Judy array" */ + +// **************************************************************************** +// MACRO EQUIVALENTS FOR JUDY FUNCTIONS: +// +// The following macros, such as J1T, are shorthands for calling Judy functions +// with parameter address-of and detailed error checking included. Since they +// are macros, the error checking code is replicated each time the macro is +// used, but it runs fast in the normal case of no error. +// +// If the caller does not like the way the default JUDYERROR macro handles +// errors (such as an exit(1) call when out of memory), they may define their +// own before the "#include <Judy.h>". A routine such as HandleJudyError +// could do checking on specific error numbers and print a different message +// dependent on the error. The following is one example: +// +// Note: the back-slashes are removed because some compilers will not accept +// them in comments. 
+// +// void HandleJudyError(uint8_t *, int, uint8_t *, int, int); +// #define JUDYERROR(CallerFile, CallerLine, JudyFunc, JudyErrno, JudyErrID) +// { +// HandleJudyError(CallerFile, CallerLine, JudyFunc, JudyErrno, JudyErrID); +// } +// +// The routine HandleJudyError could do checking on specific error numbers and +// print a different message dependent on the error. +// +// The macro receives five parameters that are: +// +// 1. CallerFile: Source filename where a Judy call returned a serious error. +// 2. CallerLine: Line number in that source file. +// 3. JudyFunc: Name of Judy function reporting the error. +// 4. JudyErrno: One of the JU_ERRNO* values enumerated above. +// 5. JudyErrID: The je_ErrID field described above. + +#ifndef JUDYERROR_NOTEST +#ifndef JUDYERROR /* supply a default error macro */ +#include <stdio.h> + +#define JUDYERROR(CallerFile, CallerLine, JudyFunc, JudyErrno, JudyErrID) \ + { \ + (void) fprintf(stderr, "File '%s', line %d: %s(), " \ + "JU_ERRNO_* == %d, ID == %d\n", \ + CallerFile, CallerLine, \ + JudyFunc, JudyErrno, JudyErrID); \ + exit(1); \ + } + +#endif /* JUDYERROR */ +#endif /* JUDYERROR_NOTEST */ + +// If the JUDYERROR macro is not desired at all, then the following eliminates +// it. However, the return code from each Judy function (that is, the first +// parameter of each macro) must be checked by the caller to assure that an +// error did not occur. +// +// Example: +// +// #define JUDYERROR_NOTEST 1 +// #include <Judy.h> +// +// or use this cc option at compile time: +// +// cc -DJUDYERROR_NOTEST ... +// +// Example code: +// +// J1S(Rc, PArray, Index); +// if (Rc == JERR) goto ...error +// +// or: +// +// JLI(PValue, PArray, Index); +// if (PValue == PJERR) goto ...error + + +// Internal shorthand macros for writing the J1S, etc. macros: + +#ifdef JUDYERROR_NOTEST /* ============================================ */ + +// "Judy Set Error": + +#define J_SE(FuncName,Errno) ((void) 0) + +// Note: In each J_*() case below, the digit is the number of key parameters +// to the Judy*() call. Just assign the Func result to the callers Rc value +// without a cast because none is required, and this keeps the API simpler. +// However, a family of different J_*() macros is needed to support the +// different numbers of key parameters (0,1,2) and the Func return type. +// +// In the names below, "I" = integer result; "P" = pointer result. Note, the +// Funcs for J_*P() return PPvoid_t, but cast this to a Pvoid_t for flexible, +// error-free assignment, and then compare to PJERR. + +#define J_0I(Rc,PArray,Func,FuncName) \ + { (Rc) = Func(PArray, PJE0); } + +#define J_1I(Rc,PArray,Index,Func,FuncName) \ + { (Rc) = Func(PArray, Index, PJE0); } + +#define J_1P(PV,PArray,Index,Func,FuncName) \ + { (PV) = (Pvoid_t) Func(PArray, Index, PJE0); } + +#define J_2I(Rc,PArray,Index,Arg2,Func,FuncName) \ + { (Rc) = Func(PArray, Index, Arg2, PJE0); } + +#define J_2C(Rc,PArray,Index1,Index2,Func,FuncName) \ + { (Rc) = Func(PArray, Index1, Index2, PJE0); } + +#define J_2P(PV,PArray,Index,Arg2,Func,FuncName) \ + { (PV) = (Pvoid_t) Func(PArray, Index, Arg2, PJE0); } + +// Variations for Judy*Set/InsArray functions: + +#define J_2AI(Rc,PArray,Count,PIndex,Func,FuncName) \ + { (Rc) = Func(PArray, Count, PIndex, PJE0); } +#define J_3AI(Rc,PArray,Count,PIndex,PValue,Func,FuncName) \ + { (Rc) = Func(PArray, Count, PIndex, PValue, PJE0); } + +#else /* ================ ! 
JUDYERROR_NOTEST ============================= */ + +#define J_E(FuncName,PJE) \ + JUDYERROR(__FILE__, __LINE__, FuncName, JU_ERRNO(PJE), JU_ERRID(PJE)) + +#define J_SE(FuncName,Errno) \ + { \ + JError_t J_Error; \ + JU_ERRNO(&J_Error) = (Errno); \ + JU_ERRID(&J_Error) = __LINE__; \ + J_E(FuncName, &J_Error); \ + } + +// Note: In each J_*() case below, the digit is the number of key parameters +// to the Judy*() call. Just assign the Func result to the callers Rc value +// without a cast because none is required, and this keeps the API simpler. +// However, a family of different J_*() macros is needed to support the +// different numbers of key parameters (0,1,2) and the Func return type. +// +// In the names below, "I" = integer result; "P" = pointer result. Note, the +// Funcs for J_*P() return PPvoid_t, but cast this to a Pvoid_t for flexible, +// error-free assignment, and then compare to PJERR. + +#define J_0I(Rc,PArray,Func,FuncName) \ + { \ + JError_t J_Error; \ + if (((Rc) = Func(PArray, &J_Error)) == JERR) \ + J_E(FuncName, &J_Error); \ + } + +#define J_1I(Rc,PArray,Index,Func,FuncName) \ + { \ + JError_t J_Error; \ + if (((Rc) = Func(PArray, Index, &J_Error)) == JERR) \ + J_E(FuncName, &J_Error); \ + } + +#define J_1P(Rc,PArray,Index,Func,FuncName) \ + { \ + JError_t J_Error; \ + if (((Rc) = (Pvoid_t) Func(PArray, Index, &J_Error)) == PJERR) \ + J_E(FuncName, &J_Error); \ + } + +#define J_2I(Rc,PArray,Index,Arg2,Func,FuncName) \ + { \ + JError_t J_Error; \ + if (((Rc) = Func(PArray, Index, Arg2, &J_Error)) == JERR) \ + J_E(FuncName, &J_Error); \ + } + +// Variation for Judy*Count functions, which return 0, not JERR, for error (and +// also for other non-error cases): +// +// Note: JU_ERRNO_NFMAX should only apply to 32-bit Judy1, but this header +// file lacks the necessary ifdefs to make it go away otherwise, so always +// check against it. + +#define J_2C(Rc,PArray,Index1,Index2,Func,FuncName) \ + { \ + JError_t J_Error; \ + if ((((Rc) = Func(PArray, Index1, Index2, &J_Error)) == 0) \ + && (JU_ERRNO(&J_Error) > JU_ERRNO_NFMAX)) \ + { \ + J_E(FuncName, &J_Error); \ + } \ + } + +#define J_2P(PV,PArray,Index,Arg2,Func,FuncName) \ + { \ + JError_t J_Error; \ + if (((PV) = (Pvoid_t) Func(PArray, Index, Arg2, &J_Error)) \ + == PJERR) J_E(FuncName, &J_Error); \ + } + +// Variations for Judy*Set/InsArray functions: + +#define J_2AI(Rc,PArray,Count,PIndex,Func,FuncName) \ + { \ + JError_t J_Error; \ + if (((Rc) = Func(PArray, Count, PIndex, &J_Error)) == JERR) \ + J_E(FuncName, &J_Error); \ + } + +#define J_3AI(Rc,PArray,Count,PIndex,PValue,Func,FuncName) \ + { \ + JError_t J_Error; \ + if (((Rc) = Func(PArray, Count, PIndex, PValue, &J_Error)) \ + == JERR) J_E(FuncName, &J_Error); \ + } + +#endif /* ================ ! JUDYERROR_NOTEST ============================= */ + +// Some of the macros are special cases that use inlined shortcuts for speed +// with root-level leaves: + +// This is a slower version with current processors, but in the future... 
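// Editor's example (a sketch, not from the original header): typical JudyL
// usage through the shorthand macros defined below. All identifiers are
// illustrative; under the default JUDYERROR macro above, a failed insert
// prints a diagnostic and exits.
//
//     Pvoid_t PArray = (Pvoid_t) NULL;
//     PWord_t PValue;
//     Word_t  Index, Rc;
//
//     JLI(PValue, PArray, 42);          // insert Index 42
//     *PValue = 123;                    // store its value
//     JLG(PValue, PArray, 42);          // lookup; PValue == NULL if absent
//
//     Index = 0;                        // ordered walk
//     JLF(PValue, PArray, Index);       // first Index >= 0
//     while (PValue != NULL) {
//         // ... use Index and *PValue ...
//         JLN(PValue, PArray, Index);   // next Index
//     }
//     JLFA(Rc, PArray);                 // free the array; Rc = bytes freed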
+ +#define J1T(Rc,PArray,Index) \ + (Rc) = Judy1Test((Pvoid_t)(PArray), Index, PJE0) + +#define J1S( Rc, PArray, Index) \ + J_1I(Rc, (&(PArray)), Index, Judy1Set, "Judy1Set") +#define J1SA(Rc, PArray, Count, PIndex) \ + J_2AI(Rc,(&(PArray)), Count, PIndex, Judy1SetArray, "Judy1SetArray") +#define J1U( Rc, PArray, Index) \ + J_1I(Rc, (&(PArray)), Index, Judy1Unset, "Judy1Unset") +#define J1F( Rc, PArray, Index) \ + J_1I(Rc, PArray, &(Index), Judy1First, "Judy1First") +#define J1N( Rc, PArray, Index) \ + J_1I(Rc, PArray, &(Index), Judy1Next, "Judy1Next") +#define J1L( Rc, PArray, Index) \ + J_1I(Rc, PArray, &(Index), Judy1Last, "Judy1Last") +#define J1P( Rc, PArray, Index) \ + J_1I(Rc, PArray, &(Index), Judy1Prev, "Judy1Prev") +#define J1FE(Rc, PArray, Index) \ + J_1I(Rc, PArray, &(Index), Judy1FirstEmpty, "Judy1FirstEmpty") +#define J1NE(Rc, PArray, Index) \ + J_1I(Rc, PArray, &(Index), Judy1NextEmpty, "Judy1NextEmpty") +#define J1LE(Rc, PArray, Index) \ + J_1I(Rc, PArray, &(Index), Judy1LastEmpty, "Judy1LastEmpty") +#define J1PE(Rc, PArray, Index) \ + J_1I(Rc, PArray, &(Index), Judy1PrevEmpty, "Judy1PrevEmpty") +#define J1C( Rc, PArray, Index1, Index2) \ + J_2C(Rc, PArray, Index1, Index2, Judy1Count, "Judy1Count") +#define J1BC(Rc, PArray, Count, Index) \ + J_2I(Rc, PArray, Count, &(Index), Judy1ByCount, "Judy1ByCount") +#define J1FA(Rc, PArray) \ + J_0I(Rc, (&(PArray)), Judy1FreeArray, "Judy1FreeArray") +#define J1MU(Rc, PArray) \ + (Rc) = Judy1MemUsed(PArray) + +#define JLG(PV,PArray,Index) \ + (PV) = (Pvoid_t)JudyLGet((Pvoid_t)PArray, Index, PJE0) + +#define JLI( PV, PArray, Index) \ + J_1P(PV, (&(PArray)), Index, JudyLIns, "JudyLIns") + +#define JLIA(Rc, PArray, Count, PIndex, PValue) \ + J_3AI(Rc,(&(PArray)), Count, PIndex, PValue, JudyLInsArray, \ + "JudyLInsArray") +#define JLD( Rc, PArray, Index) \ + J_1I(Rc, (&(PArray)), Index, JudyLDel, "JudyLDel") + +#define JLF( PV, PArray, Index) \ + J_1P(PV, PArray, &(Index), JudyLFirst, "JudyLFirst") + +#define JLN( PV, PArray, Index) \ + J_1P(PV, PArray, &(Index), JudyLNext, "JudyLNext") + +#define JLL( PV, PArray, Index) \ + J_1P(PV, PArray, &(Index), JudyLLast, "JudyLLast") +#define JLP( PV, PArray, Index) \ + J_1P(PV, PArray, &(Index), JudyLPrev, "JudyLPrev") +#define JLFE(Rc, PArray, Index) \ + J_1I(Rc, PArray, &(Index), JudyLFirstEmpty, "JudyLFirstEmpty") +#define JLNE(Rc, PArray, Index) \ + J_1I(Rc, PArray, &(Index), JudyLNextEmpty, "JudyLNextEmpty") +#define JLLE(Rc, PArray, Index) \ + J_1I(Rc, PArray, &(Index), JudyLLastEmpty, "JudyLLastEmpty") +#define JLPE(Rc, PArray, Index) \ + J_1I(Rc, PArray, &(Index), JudyLPrevEmpty, "JudyLPrevEmpty") +#define JLC( Rc, PArray, Index1, Index2) \ + J_2C(Rc, PArray, Index1, Index2, JudyLCount, "JudyLCount") +#define JLBC(PV, PArray, Count, Index) \ + J_2P(PV, PArray, Count, &(Index), JudyLByCount, "JudyLByCount") +#define JLFA(Rc, PArray) \ + J_0I(Rc, (&(PArray)), JudyLFreeArray, "JudyLFreeArray") +#define JLMU(Rc, PArray) \ + (Rc) = JudyLMemUsed(PArray) + +#define JHSI(PV, PArray, PIndex, Count) \ + J_2P(PV, (&(PArray)), PIndex, Count, JudyHSIns, "JudyHSIns") +#define JHSG(PV, PArray, PIndex, Count) \ + (PV) = (Pvoid_t) JudyHSGet(PArray, PIndex, Count) +#define JHSD(Rc, PArray, PIndex, Count) \ + J_2I(Rc, (&(PArray)), PIndex, Count, JudyHSDel, "JudyHSDel") +#define JHSFA(Rc, PArray) \ + J_0I(Rc, (&(PArray)), JudyHSFreeArray, "JudyHSFreeArray") + +#define JSLG( PV, PArray, Index) \ + J_1P( PV, PArray, Index, JudySLGet, "JudySLGet") +#define JSLI( PV, PArray, Index) \ + J_1P( PV, (&(PArray)), 
Index, JudySLIns, "JudySLIns") +#define JSLD( Rc, PArray, Index) \ + J_1I( Rc, (&(PArray)), Index, JudySLDel, "JudySLDel") +#define JSLF( PV, PArray, Index) \ + J_1P( PV, PArray, Index, JudySLFirst, "JudySLFirst") +#define JSLN( PV, PArray, Index) \ + J_1P( PV, PArray, Index, JudySLNext, "JudySLNext") +#define JSLL( PV, PArray, Index) \ + J_1P( PV, PArray, Index, JudySLLast, "JudySLLast") +#define JSLP( PV, PArray, Index) \ + J_1P( PV, PArray, Index, JudySLPrev, "JudySLPrev") +#define JSLFA(Rc, PArray) \ + J_0I( Rc, (&(PArray)), JudySLFreeArray, "JudySLFreeArray") + +#ifdef __cplusplus +} +#endif +#endif /* ! _JUDY_INCLUDED */ diff --git a/src/libnetdata/libjudy/src/JudyCommon/JudyMalloc.c b/src/libnetdata/libjudy/src/JudyCommon/JudyMalloc.c new file mode 100644 index 00000000..09a20e39 --- /dev/null +++ b/src/libnetdata/libjudy/src/JudyCommon/JudyMalloc.c @@ -0,0 +1,87 @@ +// Copyright (C) 2000 - 2002 Hewlett-Packard Company +// +// This program is free software; you can redistribute it and/or modify it +// under the term of the GNU Lesser General Public License as published by the +// Free Software Foundation; either version 2 of the License, or (at your +// option) any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License +// for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this program; if not, write to the Free Software Foundation, +// Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// _________________ + +// @(#) $Revision: 4.33 $ $Source: /judy/src/JudyCommon/JudyMalloc.c $ +// ************************************************************************ // +// JUDY - Memory Allocater // +// -by- // +// Douglas L. Baskins // +// Hewlett Packard // +// Fort Collins, Co // +// (970) 229-2027 // +// // +// ************************************************************************ // + +// JUDY INCLUDE FILES +#include "Judy.h" + +// **************************************************************************** +// J U D Y M A L L O C +// +// Allocate RAM. This is the single location in Judy code that calls +// malloc(3C). Note: JPM accounting occurs at a higher level. + +Word_t JudyMalloc( + Word_t Words) +{ + Word_t Addr; + + Addr = (Word_t) malloc(Words * sizeof(Word_t)); + return(Addr); + +} // JudyMalloc() + + +// **************************************************************************** +// J U D Y F R E E + +void JudyFree( + void * PWord, + Word_t Words) +{ + (void) Words; + free(PWord); + +} // JudyFree() + + +// **************************************************************************** +// J U D Y M A L L O C +// +// Higher-level "wrapper" for allocating objects that need not be in RAM, +// although at this time they are in fact only in RAM. Later we hope that some +// entire subtrees (at a JPM or branch) can be "virtual", so their allocations +// and frees should go through this level. 
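// Editor's note: sizes at this layer are counted in words, not bytes;
// JudyMalloc() above scales by sizeof(Word_t) before calling malloc(3C).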
+ +Word_t JudyMallocVirtual( + Word_t Words) +{ + return(JudyMalloc(Words)); + +} // JudyMallocVirtual() + + +// **************************************************************************** +// J U D Y F R E E + +void JudyFreeVirtual( + void * PWord, + Word_t Words) +{ + JudyFree(PWord, Words); + +} // JudyFreeVirtual() diff --git a/src/libnetdata/libjudy/src/JudyCommon/JudyPrivate.h b/src/libnetdata/libjudy/src/JudyCommon/JudyPrivate.h new file mode 100644 index 00000000..350631f0 --- /dev/null +++ b/src/libnetdata/libjudy/src/JudyCommon/JudyPrivate.h @@ -0,0 +1,1613 @@ +#ifndef _JUDYPRIVATE_INCLUDED +#define _JUDYPRIVATE_INCLUDED +// _________________ +// +// Copyright (C) 2000 - 2002 Hewlett-Packard Company +// +// This program is free software; you can redistribute it and/or modify it +// under the term of the GNU Lesser General Public License as published by the +// Free Software Foundation; either version 2 of the License, or (at your +// option) any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License +// for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this program; if not, write to the Free Software Foundation, +// Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// _________________ + +// @(#) $Revision: 4.77 $ $Source: /judy/src/JudyCommon/JudyPrivate.h $ +// +// Header file for all Judy sources, for global but private (non-exported) +// declarations. + +#include "Judy.h" + +// **************************************************************************** +// A VERY BRIEF EXPLANATION OF A JUDY ARRAY +// +// A Judy array is, effectively, a digital tree (or Trie) with 256 element +// branches (nodes), and with "compression tricks" applied to low-population +// branches or leaves to save a lot of memory at the cost of relatively little +// CPU time or cache fills. +// +// In the actual implementation, a Judy array is level-less, and traversing the +// "tree" actually means following the states in a state machine (SM) as +// directed by the Index. A Judy array is referred to here as an "SM", rather +// than as a "tree"; having "states", rather than "levels". +// +// Each branch or leaf in the SM decodes a portion ("digit") of the original +// Index; with 256-way branches there are 8 bits per digit. There are 3 kinds +// of branches, called: Linear, Bitmap and Uncompressed, of which the first 2 +// are compressed to contain no NULL entries. +// +// An Uncompressed branch has a 1.0 cache line fill cost to decode 8 bits of +// (digit, part of an Index), but it might contain many NULL entries, and is +// therefore inefficient with memory if lightly populated. +// +// A Linear branch has a ~1.75 cache line fill cost when at maximum population. +// A Bitmap branch has ~2.0 cache line fills. Linear and Bitmap branches are +// converted to Uncompressed branches when the additional memory can be +// amortized with larger populations. Higher-state branches have higher +// priority to be converted. +// +// Linear branches can hold 28 elements (based on detailed analysis) -- thus 28 +// expanses. A Linear branch is converted to a Bitmap branch when the 29th +// expanse is required. +// +// A Bitmap branch could hold 256 expanses, but is forced to convert to an +// Uncompressed branch when 185 expanses are required. 
Hopefully, it is
+// converted before that because of population growth (again, based on detailed
+// analysis and heuristics in the code).
+//
+// A path through the SM terminates at a leaf when the Index (or key)
+// population in the expanse below a pointer will fit into 1 or 2 cache lines
+// (~31..255 Indexes).  A maximum-population Leaf has ~1.5 cache line fill
+// cost.
+//
+// Leaves are sorted arrays of Indexes, where the Index Sizes (IS) are:  0, 1,
+// 8, 16, 24, 32, [40, 48, 56, 64] bits.  The IS depends on the "density"
+// (population/expanse) of the values in the Leaf.  Zero bits are possible if
+// population == expanse in the SM (that is, a full small expanse).
+//
+// Elements of branches are called Judy Pointers (JPs).  Each JP object points
+// to the next object in the SM; in addition, a JP can decode an additional
+// 2[6] bytes of an Index, but at the cost of "narrowing" the expanse
+// represented by the next object in the SM.  A "narrow" JP (one which has
+// decode bytes/digits) is a way of skipping states in the SM.
+//
+// Although counterintuitive, we think a Judy SM is optimal when the Leaves are
+// stored at MINIMUM compression (narrowing, or use of Decode bytes).  If more
+// aggressive compression were used, decompression of a leaf would be required
+// to insert an index.  Additional compression would save a little memory but
+// not help performance significantly.
+
+
+#ifdef A_PICTURE_IS_WORTH_1000_WORDS
+*******************************************************************************
+
+JUDY 32-BIT STATE MACHINE (SM) EXAMPLE, FOR INDEX = 0x02040103
+
+The Index used in this example is purposely chosen to allow small, simple
+examples below; each 1-byte "digit" from the Index has a small numeric value
+that fits in one column.  In the drawing below:
+
+   JRP  == Judy Root Pointer;
+
+    C   == 1 byte of a 1..3 byte Population (count of Indexes) below this
+           pointer.  Since this is shared with the Decode field, the combined
+           sizes must be 3[7], that is, 1 word less 1 byte for the JP Type.
+
+   The 1-byte field jp_Type is represented as:
+
+   1..3 == Number of bytes in the population (Pop0) word of the Branch or Leaf
+           below the pointer (note: 1..7 on 64-bit); indicates:
+           - number of bytes in Decode field == 3 - this number;
+           - number of bytes remaining to decode.
+           Note:  The maximum is 3, not 4, because the 1st byte of the Index is
+           always decoded digitally in the top branch.
+   -B-  == JP points to a Branch (there are many kinds of Branches).
+   -L-  == JP points to a Leaf (there are many kinds of Leaves).
+
+   (2)  == Digit of Index decoded by position offset in branch (really
+           0..0xff).
+
+    4*  == Digit of Index necessary for decoding a "narrow" pointer, in a
+           Decode field; replaces 1 missing branch (really 0..0xff).
+
+    4+  == Digit of Index NOT necessary for decoding a "narrow" pointer, but
+           used for fast traversal of the SM by Judy1Test() and JudyLGet()
+           (see the code) (really 0..0xff).
+
+    0   == Byte in a JP's Pop0 field that is always ignored, because a leaf
+           can never contain more than 256 Indexes (Pop0 <= 255).
+
+  +-----  == A Branch or Leaf; drawn open-ended to remind you that it could
+  |          have up to 256 columns.
+  +-----
+
+  |
+  |   == Pointer to next Branch or Leaf.
+  V
+
+  |
+  O    == A state is skipped by using a "narrow" pointer.
+  |
+
+ < 1 > == Digit (Index) shown as an example is not necessarily in the
+          position shown; is sorted in order with neighbor Indexes.
+          (Really 0..0xff.)
+ +Note that this example shows every possibly topology to reach a leaf in a +32-bit Judy SM, although this is a very subtle point! + + STATE or` + LEVEL + +---+ +---+ +---+ +---+ +---+ +---+ +---+ +---+ + |RJP| |RJP| |RJP| |RJP| |RJP| |RJP| |RJP| |RJP| + L---+ B---+ B---+ B---+ B---+ B---+ B---+ B---+ + | | | | | | | | + | | | | | | | | + V V (2) V (2) V (2) V (2) V (2) V (2) V (2) + +------ +------ +------ +------ +------ +------ +------ +------ +Four |< 2 > | 0 | 4* | C | 4* | 4* | C | C +byte |< 4 > | 0 | 0 | C | 1* | C | C | C 4 +Index|< 1 > | C | C | C | C | C | C | C +Leaf |< 3 > | 3 | 2 | 3 | 1 | 2 | 3 | 3 + +------ +--L--- +--L--- +--B--- +--L--- +--B--- +--B--- +--B--- + | | | | | | | + / | / | | / / + / | / | | / / + | | | | | | | + V | V (4) | | V (4) V (4) + +------ | +------ | | +------ +------ + Three |< 4 > | | 4+ | | | 4+ | 4+ + byte Index|< 1 > O | 0 O O | 1* | C 3 + Leaf |< 3 > | | C | | | C | C + +------ | | 2 | | | 1 | 2 + / +----L- | | +----L- +----B- + / | | | | | + | / | / / / + | / | / / / + | / | | / / + | / | | / / + | | | | | | + V V | V(1) | V(1) + +------ +------ | +------ | +------ + Two byte |< 1 > |< 1 > | | 4+ | | 4+ + Index Leaf |< 3 > |< 3 > O | 1+ O | 1+ 2 + +------ +------ / | C | | C + / | 1 | | 1 + | +-L---- | +-L---- + | | | | + | / | / + | | | | + V V V V + +------ +------ +------ +------ + One byte Index Leaf |< 3 > |< 3 > |< 3 > |< 3 > 1 + +------ +------ +------ +------ + + +#endif // A_PICTURE_IS_WORTH_1000_WORDS + + +// **************************************************************************** +// MISCELLANEOUS GLOBALS: +// +// PLATFORM-SPECIFIC CONVENIENCE MACROS: +// +// These are derived from context (set by cc or in system header files) or +// based on JU_<PLATFORM> macros from make_includes/platform.*.mk. We decided +// on 011018 that any macro reliably derivable from context (cc or headers) for +// ALL platforms supported by Judy is based on that derivation, but ANY +// exception means to stop using the external macro completely and derive from +// JU_<PLATFORM> instead. + +// Other miscellaneous stuff: + +#ifndef _BOOL_T +#define _BOOL_T +typedef int bool_t; +#endif + +#define FUNCTION // null; easy to find functions. + +#ifndef TRUE +#define TRUE 1 +#endif + +#ifndef FALSE +#define FALSE 0 +#endif + +#ifdef TRACE // turn on all other tracing in the code: +#define TRACEJP 1 // JP traversals in JudyIns.c and JudyDel.c. +#define TRACEJPR 1 // JP traversals in retrieval code, JudyGet.c. +#define TRACECF 1 // cache fills in JudyGet.c. +#define TRACEMI 1 // malloc calls in JudyMallocIF.c. +#define TRACEMF 1 // malloc calls at a lower level in JudyMalloc.c. +#endif + + +// SUPPORT FOR DEBUG-ONLY CODE: +// +// By convention, use -DDEBUG to enable both debug-only code AND assertions in +// the Judy sources. +// +// Invert the sense of assertions, so they are off unless explicitly requested, +// in a uniform way. +// +// Note: It is NOT appropriate to put this in Judy.h; it would mess up +// application code. + +#ifndef DEBUG +#define NDEBUG 1 // must be 1 for "#if". +#endif + +// Shorthand notations to avoid #ifdefs for single-line conditional statements: +// +// Warning: These cannot be used around compiler directives, such as +// "#include", nor in the case where Code contains a comma other than nested +// within parentheses or quotes. + +#ifndef DEBUG +#define DBGCODE(Code) // null. +#else +#define DBGCODE(Code) Code +#endif + +#ifdef JUDY1 +#define JUDY1CODE(Code) Code +#define JUDYLCODE(Code) // null. 
+#endif
+
+#ifdef JUDYL
+#define JUDYLCODE(Code) Code
+#define JUDY1CODE(Code) // null.
+#endif
+
+#include <assert.h>
+
+// ****************************************************************************
+// FUNDAMENTAL CONSTANTS FOR MACHINE
+// ****************************************************************************
+
+// Machine (CPU) cache line size:
+//
+// NOTE:  A leaf size of 2 cache lines maximum is the target (optimal) for
+// Judy.  It's hard to obtain a machine's cache line size at compile time, but
+// if the machine has an unexpected cache line size, it's not devastating if
+// the following constants end up causing leaves that are 1 cache line in size,
+// or even 4 cache lines in size.  The assumed 32-bit system has 16-word =
+// 64-byte cache lines, and the assumed 64-bit system has 16-word = 128-byte
+// cache lines.
+
+#ifdef JU_64BIT
+#define cJU_BYTESPERCL 128              // cache line size in bytes.
+#else
+#define cJU_BYTESPERCL  64              // cache line size in bytes.
+#endif
+
+// Bits Per Byte:
+
+#define cJU_BITSPERBYTE 0x8
+
+// Bytes Per Word and Bits Per Word, the latter assuming sizeof(byte) is 8
+// bits:
+//
+// Expect 32 [64] bits per word.
+
+#define cJU_BYTESPERWORD (sizeof(Word_t))
+#define cJU_BITSPERWORD  (sizeof(Word_t) * cJU_BITSPERBYTE)
+
+#define JU_BYTESTOWORDS(BYTES) \
+        (((BYTES) + cJU_BYTESPERWORD - 1) / cJU_BYTESPERWORD)
+
+// A word that is all-ones, normally equal to -1UL, but safer with ~0:
+
+#define cJU_ALLONES  (~0UL)
+
+// Note:  These are forward references, but that's OK:
+
+#define cJU_FULLBITMAPB ((BITMAPB_t) cJU_ALLONES)
+#define cJU_FULLBITMAPL ((BITMAPL_t) cJU_ALLONES)
+
+
+// ****************************************************************************
+// MISCELLANEOUS JUDY-SPECIFIC DECLARATIONS
+// ****************************************************************************
+
+// ROOT STATE:
+//
+// State at the start of the Judy SM, based on 1 byte decoded per state; equal
+// to the number of bytes per Index to decode.
+
+#define cJU_ROOTSTATE (sizeof(Word_t))
+
+
+// SUBEXPANSES PER STATE:
+//
+// Number of subexpanses per state traversed, which is the number of JPs in a
+// branch (actual or theoretical) and the number of bits in a bitmap.
+
+#define cJU_SUBEXPPERSTATE 256
+
+
+// LEAF AND VALUE POINTERS:
+//
+// Some other basic object types are declared in JudyPrivateBranch.h
+// (Pjbl_t, Pjbb_t, Pjbu_t, Pjp_t) or are Judy1/L-specific (Pjlb_t).  The
+// few remaining types are declared below.
+//
+// Note:  Leaf pointers are cast to different-sized objects depending on the
+// leaf's level, but are at least addresses (not just numbers), so use void *
+// (Pvoid_t), not PWord_t or Word_t for them, except use Pjlw_t for whole-word
+// (top-level, root-level) leaves.  Value areas, however, are always whole
+// words.
+//
+// Furthermore, use Pjll_t only for generic leaf pointers (for various size
+// LeafLs).  Use Pjlw_t for LeafWs.  Use Pleaf (with type uint8_t *, uint16_t
+// *, etc) when the leaf index size is known.
+
+typedef PWord_t Pjlw_t;  // pointer to root-level leaf (whole-word indexes).
+typedef Pvoid_t Pjll_t;  // pointer to lower-level linear leaf.
+
+#ifdef JUDYL
+typedef PWord_t Pjv_t;   // pointer to JudyL value area.
+#endif
+
+
+// POINTER PREPARATION MACROS:
+//
+// These macros are used to strip malloc-namespace-type bits from a pointer +
+// malloc-type word (which references any Judy malloc'd object that might be
+// obtained from other than a direct call of malloc()), prior to dereferencing
+// the pointer as an address.
The malloc-type bits allow Judy mallocd objects +// to come from different "malloc() namespaces". +// +// (root pointer) (JRP, see above) +// jp.jp_Addr generic pointer to next-level node, except when used +// as a JudyL Immed01 value area +// JU_JBB_PJP macro hides jbbs_Pjp (pointer to JP subarray) +// JL_JLB_PVALUE macro hides jLlbs_PValue (pointer to value subarray) +// +// When setting one of these fields or passing an address to j__udyFree*(), the +// "raw" memory address is used; otherwise the memory address must be passed +// through one of the macros below before its dereferenced. +// +// Note: After much study, the typecasts below appear in the macros rather +// than at the point of use, which is both simpler and allows the compiler to +// do type-checking. + + +#define P_JLW( ADDR) ((Pjlw_t) (ADDR)) // root leaf. +#define P_JPM( ADDR) ((Pjpm_t) (ADDR)) // root JPM. +#define P_JBL( ADDR) ((Pjbl_t) (ADDR)) // BranchL. +#define P_JBB( ADDR) ((Pjbb_t) (ADDR)) // BranchB. +#define P_JBU( ADDR) ((Pjbu_t) (ADDR)) // BranchU. +#define P_JLL( ADDR) ((Pjll_t) (ADDR)) // LeafL. +#define P_JLB( ADDR) ((Pjlb_t) (ADDR)) // LeafB1. +#define P_JP( ADDR) ((Pjp_t) (ADDR)) // JP. + +#ifdef JUDYL +#define P_JV( ADDR) ((Pjv_t) (ADDR)) // &value. +#endif + + +// LEAST BYTES: +// +// Mask for least bytes of a word, and a macro to perform this mask on an +// Index. +// +// Note: This macro has been problematic in the past to get right and to make +// portable. Its not OK on all systems to shift by the full word size. This +// macro should allow shifting by 1..N bytes, where N is the word size, but +// should produce a compiler warning if the macro is called with Bytes == 0. +// +// Warning: JU_LEASTBYTESMASK() is not a constant macro unless Bytes is a +// constant; otherwise it is a variable shift, which is expensive on some +// processors. + +#define JU_LEASTBYTESMASK(BYTES) \ + ((0x100UL << (cJU_BITSPERBYTE * ((BYTES) - 1))) - 1) + +#define JU_LEASTBYTES(INDEX,BYTES) ((INDEX) & JU_LEASTBYTESMASK(BYTES)) + + +// BITS IN EACH BITMAP SUBEXPANSE FOR BITMAP BRANCH AND LEAF: +// +// The bits per bitmap subexpanse times the number of subexpanses equals a +// constant (cJU_SUBEXPPERSTATE). You can also think of this as a compile-time +// choice of "aspect ratio" for bitmap branches and leaves (which can be set +// independently for each). +// +// A default aspect ratio is hardwired here if not overridden at compile time, +// such as by "EXTCCOPTS=-DBITMAP_BRANCH16x16 make". + +#if (! (defined(BITMAP_BRANCH8x32) || defined(BITMAP_BRANCH16x16) || defined(BITMAP_BRANCH32x8))) +#define BITMAP_BRANCH32x8 1 // 32 bits per subexpanse, 8 subexpanses. +#endif + +#ifdef BITMAP_BRANCH8x32 +#define BITMAPB_t uint8_t +#endif + +#ifdef BITMAP_BRANCH16x16 +#define BITMAPB_t uint16_t +#endif + +#ifdef BITMAP_BRANCH32x8 +#define BITMAPB_t uint32_t +#endif + +// Note: For bitmap leaves, BITMAP_LEAF64x4 is only valid for 64 bit: +// +// Note: Choice of aspect ratio mostly matters for JudyL bitmap leaves. For +// Judy1 the choice doesnt matter much -- the code generated for different +// BITMAP_LEAF* values choices varies, but correctness and performance are the +// same. + +#ifndef JU_64BIT + +#if (! (defined(BITMAP_LEAF8x32) || defined(BITMAP_LEAF16x16) || defined(BITMAP_LEAF32x8))) +#define BITMAP_LEAF32x8 // 32 bits per subexpanse, 8 subexpanses. +#endif + +#else // 32BIT + +#if (! 
(defined(BITMAP_LEAF8x32) || defined(BITMAP_LEAF16x16) || defined(BITMAP_LEAF32x8) || defined(BITMAP_LEAF64x4)))
+#define BITMAP_LEAF64x4         // 64 bits per subexpanse, 4 subexpanses.
+
+#endif
+#endif // JU_64BIT
+
+#ifdef BITMAP_LEAF8x32
+#define BITMAPL_t uint8_t
+#endif
+
+#ifdef BITMAP_LEAF16x16
+#define BITMAPL_t uint16_t
+#endif
+
+#ifdef BITMAP_LEAF32x8
+#define BITMAPL_t uint32_t
+#endif
+
+#ifdef BITMAP_LEAF64x4
+#define BITMAPL_t uint64_t
+#endif
+
+
+// EXPORTED DATA AND FUNCTIONS:
+
+#ifdef JUDY1
+extern const uint8_t j__1_BranchBJPPopToWords[];
+#endif
+
+#ifdef JUDYL
+extern const uint8_t j__L_BranchBJPPopToWords[];
+#endif
+
+// Fast LeafL search routine used for inlined code:
+
+#if (! defined(SEARCH_BINARY)) && (! defined(SEARCH_LINEAR))
+// default to a binary search leaf method
+#define SEARCH_BINARY 1
+//#define SEARCH_LINEAR 1
+#endif
+
+#ifdef SEARCH_LINEAR
+
+#define SEARCHLEAFNATIVE(LEAFTYPE,ADDR,POP1,INDEX)              \
+    LEAFTYPE *P_leaf = (LEAFTYPE *)(ADDR);                      \
+    LEAFTYPE I_ndex = (INDEX); /* with masking */               \
+    if (I_ndex > P_leaf[(POP1) - 1]) return(~(POP1));           \
+    while(I_ndex > *P_leaf) P_leaf++;                           \
+    if (I_ndex == *P_leaf) return(P_leaf - (LEAFTYPE *)(ADDR)); \
+    return(~(P_leaf - (LEAFTYPE *)(ADDR)));
+
+
+#define SEARCHLEAFNONNAT(ADDR,POP1,INDEX,LFBTS,COPYINDEX)       \
+{                                                               \
+    uint8_t *P_leaf, *P_leafEnd;                                \
+    Word_t   i_ndex;                                            \
+    Word_t   I_ndex = JU_LEASTBYTES((INDEX), (LFBTS));          \
+    Word_t   p_op1;                                             \
+                                                                \
+    P_leaf    = (uint8_t *)(ADDR);                              \
+    P_leafEnd = P_leaf + ((POP1) * (LFBTS));                    \
+                                                                \
+    do {                                                        \
+        COPYINDEX(i_ndex, P_leaf);                              \
+        if (I_ndex <= i_ndex) break;                            \
+        P_leaf += (LFBTS);                                      \
+    } while (P_leaf < P_leafEnd);                               \
+                                                                \
+    p_op1 = (P_leaf - (uint8_t *) (ADDR)) / (LFBTS);            \
+    if (I_ndex == i_ndex) return(p_op1);                        \
+    return(~p_op1);                                             \
+}
+#endif // SEARCH_LINEAR
+
+#ifdef SEARCH_BINARY
+
+#define SEARCHLEAFNATIVE(LEAFTYPE,ADDR,POP1,INDEX)              \
+    LEAFTYPE *P_leaf = (LEAFTYPE *)(ADDR);                      \
+    LEAFTYPE I_ndex = (LEAFTYPE)INDEX; /* truncate hi bits */   \
+    Word_t   l_ow  = cJU_ALLONES;                               \
+    Word_t   m_id;                                              \
+    Word_t   h_igh = POP1;                                      \
+                                                                \
+    while ((h_igh - l_ow) > 1UL)                                \
+    {                                                           \
+        m_id = (h_igh + l_ow) / 2;                              \
+        if (P_leaf[m_id] > I_ndex)                              \
+            h_igh = m_id;                                       \
+        else                                                    \
+            l_ow = m_id;                                        \
+    }                                                           \
+    if (l_ow == cJU_ALLONES || P_leaf[l_ow] != I_ndex)          \
+        return(~h_igh);                                         \
+    return(l_ow)
+
+
+#define SEARCHLEAFNONNAT(ADDR,POP1,INDEX,LFBTS,COPYINDEX)       \
+    uint8_t *P_leaf = (uint8_t *)(ADDR);                        \
+    Word_t   l_ow  = cJU_ALLONES;                               \
+    Word_t   m_id;                                              \
+    Word_t   h_igh = POP1;                                      \
+    Word_t   I_ndex = JU_LEASTBYTES((INDEX), (LFBTS));          \
+    Word_t   i_ndex;                                            \
+                                                                \
+    while ((h_igh - l_ow) > 1UL)                                \
+    {                                                           \
+        m_id = (h_igh + l_ow) / 2;                              \
+        COPYINDEX(i_ndex, &P_leaf[m_id * (LFBTS)]);             \
+        if (i_ndex > I_ndex)                                    \
+            h_igh = m_id;                                       \
+        else                                                    \
+            l_ow = m_id;                                        \
+    }                                                           \
+    if (l_ow == cJU_ALLONES) return(~h_igh);                    \
+                                                                \
+    COPYINDEX(i_ndex, &P_leaf[l_ow * (LFBTS)]);                 \
+    if (i_ndex != I_ndex) return(~h_igh);                       \
+    return(l_ow)
+
+#endif // SEARCH_BINARY
+
+// Fast way to count bits set in 8..32[64]-bit int:
+//
+// For performance, j__udyCountBits*() are written to take advantage of
+// platform-specific features where available.
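The portable fallback below counts set bits with cascaded masked adds (a SWAR scheme): adjacent 1-bit fields are summed into 2-bit fields, then 4-, 8-, 16- and 32-bit fields. A self-contained 32-bit sketch of the same technique, with illustrative names that are not part of the library:

    #include <assert.h>
    #include <stdint.h>

    /* Sum adjacent bit fields of doubling width, the same masked-add
     * scheme the portable j__udyCountBitsB()/j__udyCountBitsL() use. */
    static uint32_t popcount32(uint32_t w)
    {
        w = (w & 0x55555555) + ((w & 0xAAAAAAAA) >> 1);  /* 2-bit sums  */
        w = (w & 0x33333333) + ((w & 0xCCCCCCCC) >> 2);  /* 4-bit sums  */
        w = (w & 0x0F0F0F0F) + ((w & 0xF0F0F0F0) >> 4);  /* 8-bit sums  */
        w = (w & 0x00FF00FF) + ((w & 0xFF00FF00) >> 8);  /* 16-bit sums */
        w = (w & 0x0000FFFF) + ((w & 0xFFFF0000) >> 16); /* final count */
        return w;
    }

    int main(void)
    {
        assert(popcount32(0x00000000) ==  0);
        assert(popcount32(0x80000001) ==  2);
        assert(popcount32(0xFFFFFFFF) == 32);
        return 0;
    }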
+// + +#ifdef JU_NOINLINE + +extern BITMAPB_t j__udyCountBitsB(BITMAPB_t word); +extern BITMAPL_t j__udyCountBitsL(BITMAPL_t word); + +// Compiler supports inline + +#elif defined(JU_HPUX_IPF) + +#define j__udyCountBitsB(WORD) _Asm_popcnt(WORD) +#define j__udyCountBitsL(WORD) _Asm_popcnt(WORD) + +#elif defined(JU_LINUX_IPF) + +static inline BITMAPB_t j__udyCountBitsB(BITMAPB_t word) +{ + BITMAPB_t result; + __asm__ ("popcnt %0=%1" : "=r" (result) : "r" (word)); + return(result); +} + +static inline BITMAPL_t j__udyCountBitsL(BITMAPL_t word) +{ + BITMAPL_t result; + __asm__ ("popcnt %0=%1" : "=r" (result) : "r" (word)); + return(result); +} + + +#else // No instructions available, use inline code + +// **************************************************************************** +// __ J U D Y C O U N T B I T S B +// +// Return the number of bits set in "Word", for a bitmap branch. +// +// Note: Bitmap branches have maximum bitmap size = 32 bits. + +#ifdef JU_WIN +static __inline BITMAPB_t j__udyCountBitsB(BITMAPB_t word) +#else +static inline BITMAPB_t j__udyCountBitsB(BITMAPB_t word) +#endif +{ + word = (word & 0x55555555) + ((word & 0xAAAAAAAA) >> 1); + word = (word & 0x33333333) + ((word & 0xCCCCCCCC) >> 2); + word = (word & 0x0F0F0F0F) + ((word & 0xF0F0F0F0) >> 4); // >= 8 bits. +#if defined(BITMAP_BRANCH16x16) || defined(BITMAP_BRANCH32x8) + word = (word & 0x00FF00FF) + ((word & 0xFF00FF00) >> 8); // >= 16 bits. +#endif + +#ifdef BITMAP_BRANCH32x8 + word = (word & 0x0000FFFF) + ((word & 0xFFFF0000) >> 16); // >= 32 bits. +#endif + return(word); + +} // j__udyCountBitsB() + + +// **************************************************************************** +// __ J U D Y C O U N T B I T S L +// +// Return the number of bits set in "Word", for a bitmap leaf. +// +// Note: Bitmap branches have maximum bitmap size = 32 bits. + +// Note: Need both 32-bit and 64-bit versions of j__udyCountBitsL() because +// bitmap leaves can have 64-bit bitmaps. + +#ifdef JU_WIN +static __inline BITMAPL_t j__udyCountBitsL(BITMAPL_t word) +#else +static inline BITMAPL_t j__udyCountBitsL(BITMAPL_t word) +#endif +{ +#ifndef JU_64BIT + + word = (word & 0x55555555) + ((word & 0xAAAAAAAA) >> 1); + word = (word & 0x33333333) + ((word & 0xCCCCCCCC) >> 2); + word = (word & 0x0F0F0F0F) + ((word & 0xF0F0F0F0) >> 4); // >= 8 bits. +#if defined(BITMAP_LEAF16x16) || defined(BITMAP_LEAF32x8) + word = (word & 0x00FF00FF) + ((word & 0xFF00FF00) >> 8); // >= 16 bits. +#endif +#ifdef BITMAP_LEAF32x8 + word = (word & 0x0000FFFF) + ((word & 0xFFFF0000) >> 16); // >= 32 bits. +#endif + +#else // JU_64BIT + + word = (word & 0x5555555555555555) + ((word & 0xAAAAAAAAAAAAAAAA) >> 1); + word = (word & 0x3333333333333333) + ((word & 0xCCCCCCCCCCCCCCCC) >> 2); + word = (word & 0x0F0F0F0F0F0F0F0F) + ((word & 0xF0F0F0F0F0F0F0F0) >> 4); +#if defined(BITMAP_LEAF16x16) || defined(BITMAP_LEAF32x8) || defined(BITMAP_LEAF64x4) + word = (word & 0x00FF00FF00FF00FF) + ((word & 0xFF00FF00FF00FF00) >> 8); +#endif +#if defined(BITMAP_LEAF32x8) || defined(BITMAP_LEAF64x4) + word = (word & 0x0000FFFF0000FFFF) + ((word & 0xFFFF0000FFFF0000) >>16); +#endif +#ifdef BITMAP_LEAF64x4 + word = (word & 0x00000000FFFFFFFF) + ((word & 0xFFFFFFFF00000000) >>32); +#endif +#endif // JU_64BIT + + return(word); + +} // j__udyCountBitsL() + +#endif // Compiler supports inline + +// GET POP0: +// +// Get from jp_DcdPopO the Pop0 for various JP Types. +// +// Notes: +// +// - Different macros require different parameters... 
+// +// - There are no simple macros for cJU_BRANCH* Types because their +// populations must be added up and dont reside in an already-calculated +// place. (TBD: This is no longer true, now its in the JPM.) +// +// - cJU_JPIMM_POP0() is not defined because it would be redundant because the +// Pop1 is already encoded in each enum name. +// +// - A linear or bitmap leaf Pop0 cannot exceed cJU_SUBEXPPERSTATE - 1 (Pop0 = +// 0..255), so use a simpler, faster macro for it than for other JP Types. +// +// - Avoid any complex calculations that would slow down the compiled code. +// Assume these macros are only called for the appropriate JP Types. +// Unfortunately theres no way to trigger an assertion here if the JP type +// is incorrect for the macro, because these are merely expressions, not +// statements. + +#define JU_LEAFW_POP0(JRP) (*P_JLW(JRP)) +#define cJU_JPFULLPOPU1_POP0 (cJU_SUBEXPPERSTATE - 1) + +// GET JP Type: +// Since bit fields greater than 32 bits are not supported in some compilers +// the jp_DcdPopO field is expanded to include the jp_Type in the high 8 bits +// of the Word_t. +// First the read macro: + +#define JU_JPTYPE(PJP) ((PJP)->jp_Type) + +#define JU_JPLEAF_POP0(PJP) ((PJP)->jp_DcdP0[sizeof(Word_t) - 2]) + +#ifdef JU_64BIT + +#define JU_JPDCDPOP0(PJP) \ + ((Word_t)(PJP)->jp_DcdP0[0] << 48 | \ + (Word_t)(PJP)->jp_DcdP0[1] << 40 | \ + (Word_t)(PJP)->jp_DcdP0[2] << 32 | \ + (Word_t)(PJP)->jp_DcdP0[3] << 24 | \ + (Word_t)(PJP)->jp_DcdP0[4] << 16 | \ + (Word_t)(PJP)->jp_DcdP0[5] << 8 | \ + (Word_t)(PJP)->jp_DcdP0[6]) + + +#define JU_JPSETADT(PJP,ADDR,DCDPOP0,TYPE) \ +{ \ + (PJP)->jp_Addr = (ADDR); \ + (PJP)->jp_DcdP0[0] = (uint8_t)((Word_t)(DCDPOP0) >> 48); \ + (PJP)->jp_DcdP0[1] = (uint8_t)((Word_t)(DCDPOP0) >> 40); \ + (PJP)->jp_DcdP0[2] = (uint8_t)((Word_t)(DCDPOP0) >> 32); \ + (PJP)->jp_DcdP0[3] = (uint8_t)((Word_t)(DCDPOP0) >> 24); \ + (PJP)->jp_DcdP0[4] = (uint8_t)((Word_t)(DCDPOP0) >> 16); \ + (PJP)->jp_DcdP0[5] = (uint8_t)((Word_t)(DCDPOP0) >> 8); \ + (PJP)->jp_DcdP0[6] = (uint8_t)((Word_t)(DCDPOP0)); \ + (PJP)->jp_Type = (TYPE); \ +} + +#else // 32 Bit + +#define JU_JPDCDPOP0(PJP) \ + ((Word_t)(PJP)->jp_DcdP0[0] << 16 | \ + (Word_t)(PJP)->jp_DcdP0[1] << 8 | \ + (Word_t)(PJP)->jp_DcdP0[2]) + + +#define JU_JPSETADT(PJP,ADDR,DCDPOP0,TYPE) \ +{ \ + (PJP)->jp_Addr = (ADDR); \ + (PJP)->jp_DcdP0[0] = (uint8_t)((Word_t)(DCDPOP0) >> 16); \ + (PJP)->jp_DcdP0[1] = (uint8_t)((Word_t)(DCDPOP0) >> 8); \ + (PJP)->jp_DcdP0[2] = (uint8_t)((Word_t)(DCDPOP0)); \ + (PJP)->jp_Type = (TYPE); \ +} + +#endif // 32 Bit + +// NUMBER OF BITS IN A BRANCH OR LEAF BITMAP AND SUBEXPANSE: +// +// Note: cJU_BITSPERBITMAP must be the same as the number of JPs in a branch. + +#define cJU_BITSPERBITMAP cJU_SUBEXPPERSTATE + +// Bitmaps are accessed in units of "subexpanses": + +#define cJU_BITSPERSUBEXPB (sizeof(BITMAPB_t) * cJU_BITSPERBYTE) +#define cJU_NUMSUBEXPB (cJU_BITSPERBITMAP / cJU_BITSPERSUBEXPB) + +#define cJU_BITSPERSUBEXPL (sizeof(BITMAPL_t) * cJU_BITSPERBYTE) +#define cJU_NUMSUBEXPL (cJU_BITSPERBITMAP / cJU_BITSPERSUBEXPL) + + +// MASK FOR A SPECIFIED BIT IN A BITMAP: +// +// Warning: If BitNum is a variable, this results in a variable shift that is +// expensive, at least on some processors. Use with caution. +// +// Warning: BitNum must be less than cJU_BITSPERWORD, that is, 0 .. +// cJU_BITSPERWORD - 1, to avoid a truncated shift on some machines. +// +// TBD: Perhaps use an array[32] of masks instead of calculating them. 
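To make the mask arithmetic concrete before the definitions that follow, here is a standalone check of what JU_LEASTBYTESMASK() and a bit-position mask evaluate to for small constant arguments; the macros are reimplemented locally with illustrative names, without the Judy headers:

    #include <assert.h>

    #define BITSPERBYTE 0x8

    /* Mirrors JU_LEASTBYTESMASK(): mask covering the least BYTES bytes. */
    #define LEASTBYTESMASK(BYTES) \
        ((0x100UL << (BITSPERBYTE * ((BYTES) - 1))) - 1)

    /* Mirrors JU_BITPOSMASKB() for a 32-bit subexpanse. */
    #define BITPOSMASK32(BITNUM) (1UL << ((BITNUM) % 32))

    int main(void)
    {
        assert(LEASTBYTESMASK(1) == 0xffUL);
        assert(LEASTBYTESMASK(2) == 0xffffUL);
        assert(LEASTBYTESMASK(3) == 0xffffffUL);
        assert(BITPOSMASK32(9)  == 0x200UL);   /* bit 9 of the subexpanse */
        assert(BITPOSMASK32(41) == 0x200UL);   /* 41 % 32 == 9            */
        return 0;
    }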
+ +#define JU_BITPOSMASKB(BITNUM) (1L << ((BITNUM) % cJU_BITSPERSUBEXPB)) +#define JU_BITPOSMASKL(BITNUM) (1L << ((BITNUM) % cJU_BITSPERSUBEXPL)) + + +// TEST/SET/CLEAR A BIT IN A BITMAP LEAF: +// +// Test if a byte-sized Digit (portion of Index) has a corresponding bit set in +// a bitmap, or set a byte-sized Digits bit into a bitmap, by looking up the +// correct subexpanse and then checking/setting the correct bit. +// +// Note: Mask higher bits, if any, for the convenience of the user of this +// macro, in case they pass a full Index, not just a digit. If the caller has +// a true 8-bit digit, make it of type uint8_t and the compiler should skip the +// unnecessary mask step. + +#define JU_SUBEXPL(DIGIT) (((DIGIT) / cJU_BITSPERSUBEXPL) & (cJU_NUMSUBEXPL-1)) + +#define JU_BITMAPTESTL(PJLB, INDEX) \ + (JU_JLB_BITMAP(PJLB, JU_SUBEXPL(INDEX)) & JU_BITPOSMASKL(INDEX)) + +#define JU_BITMAPSETL(PJLB, INDEX) \ + (JU_JLB_BITMAP(PJLB, JU_SUBEXPL(INDEX)) |= JU_BITPOSMASKL(INDEX)) + +#define JU_BITMAPCLEARL(PJLB, INDEX) \ + (JU_JLB_BITMAP(PJLB, JU_SUBEXPL(INDEX)) ^= JU_BITPOSMASKL(INDEX)) + + +// MAP BITMAP BIT OFFSET TO DIGIT: +// +// Given a digit variable to set, a bitmap branch or leaf subexpanse (base 0), +// the bitmap (BITMAP*_t) for that subexpanse, and an offset (Nth set bit in +// the bitmap, base 0), compute the digit (also base 0) corresponding to the +// subexpanse and offset by counting all bits in the bitmap until offset+1 set +// bits are seen. Avoid expensive variable shifts. Offset should be less than +// the number of set bits in the bitmap; assert this. +// +// If theres a better way to do this, I dont know what it is. + +#define JU_BITMAPDIGITB(DIGIT,SUBEXP,BITMAP,OFFSET) \ + { \ + BITMAPB_t bitmap = (BITMAP); int remain = (OFFSET); \ + (DIGIT) = (SUBEXP) * cJU_BITSPERSUBEXPB; \ + \ + while ((remain -= (bitmap & 1)) >= 0) \ + { \ + bitmap >>= 1; ++(DIGIT); \ + assert((DIGIT) < ((SUBEXP) + 1) * cJU_BITSPERSUBEXPB); \ + } \ + } + +#define JU_BITMAPDIGITL(DIGIT,SUBEXP,BITMAP,OFFSET) \ + { \ + BITMAPL_t bitmap = (BITMAP); int remain = (OFFSET); \ + (DIGIT) = (SUBEXP) * cJU_BITSPERSUBEXPL; \ + \ + while ((remain -= (bitmap & 1)) >= 0) \ + { \ + bitmap >>= 1; ++(DIGIT); \ + assert((DIGIT) < ((SUBEXP) + 1) * cJU_BITSPERSUBEXPL); \ + } \ + } + + +// MASKS FOR PORTIONS OF 32-BIT WORDS: +// +// These are useful for bitmap subexpanses. +// +// "LOWER"/"HIGHER" means bits representing lower/higher-valued Indexes. The +// exact order of bits in the word is explicit here but is hidden from the +// caller. +// +// "EXC" means exclusive of the specified bit; "INC" means inclusive. +// +// In each case, BitPos is either "JU_BITPOSMASK*(BitNum)", or a variable saved +// from an earlier call of that macro; either way, it must be a 32-bit word +// with a single bit set. In the first case, assume the compiler is smart +// enough to optimize out common subexpressions. +// +// The expressions depend on unsigned decimal math that should be universal. + +#define JU_MASKLOWEREXC( BITPOS) ((BITPOS) - 1) +#define JU_MASKLOWERINC( BITPOS) (JU_MASKLOWEREXC(BITPOS) | (BITPOS)) +#define JU_MASKHIGHERINC(BITPOS) (-(BITPOS)) +#define JU_MASKHIGHEREXC(BITPOS) (JU_MASKHIGHERINC(BITPOS) ^ (BITPOS)) + + +// **************************************************************************** +// SUPPORT FOR NATIVE INDEX SIZES +// **************************************************************************** +// +// Copy a series of generic objects (uint8_t, uint16_t, uint32_t, Word_t) from +// one place to another. 
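The JU_COPYMEM() macro that follows uses a do/while loop, so it always copies at least one element; its assert documents that POP1 must be positive. A standalone rendering of the same loop for one element type, with illustrative names:

    #include <assert.h>
    #include <stdint.h>

    /* Copy pop1 elements; like JU_COPYMEM() below, the do/while form
     * requires pop1 > 0. */
    static void copymem16(uint16_t *dst, const uint16_t *src,
                          unsigned long pop1)
    {
        unsigned long i = 0;
        assert(pop1 > 0);
        do { dst[i] = src[i]; } while (++i < pop1);
    }

    int main(void)
    {
        uint16_t src[3] = { 10, 20, 30 }, dst[3] = { 0, 0, 0 };
        copymem16(dst, src, 3);
        assert(dst[0] == 10 && dst[1] == 20 && dst[2] == 30);
        return 0;
    }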
+ +#define JU_COPYMEM(PDST,PSRC,POP1) \ + { \ + Word_t i_ndex = 0; \ + assert((POP1) > 0); \ + do { (PDST)[i_ndex] = (PSRC)[i_ndex]; } \ + while (++i_ndex < (POP1)); \ + } + + +// **************************************************************************** +// SUPPORT FOR NON-NATIVE INDEX SIZES +// **************************************************************************** +// +// Copy a 3-byte Index pointed by a uint8_t * to a Word_t: +// +#define JU_COPY3_PINDEX_TO_LONG(DESTLONG,PINDEX) \ + DESTLONG = (Word_t)(PINDEX)[0] << 16; \ + DESTLONG += (Word_t)(PINDEX)[1] << 8; \ + DESTLONG += (Word_t)(PINDEX)[2] + +// Copy a Word_t to a 3-byte Index pointed at by a uint8_t *: + +#define JU_COPY3_LONG_TO_PINDEX(PINDEX,SOURCELONG) \ + (PINDEX)[0] = (uint8_t)((SOURCELONG) >> 16); \ + (PINDEX)[1] = (uint8_t)((SOURCELONG) >> 8); \ + (PINDEX)[2] = (uint8_t)((SOURCELONG)) + +#ifdef JU_64BIT + +// Copy a 5-byte Index pointed by a uint8_t * to a Word_t: +// +#define JU_COPY5_PINDEX_TO_LONG(DESTLONG,PINDEX) \ + DESTLONG = (Word_t)(PINDEX)[0] << 32; \ + DESTLONG += (Word_t)(PINDEX)[1] << 24; \ + DESTLONG += (Word_t)(PINDEX)[2] << 16; \ + DESTLONG += (Word_t)(PINDEX)[3] << 8; \ + DESTLONG += (Word_t)(PINDEX)[4] + +// Copy a Word_t to a 5-byte Index pointed at by a uint8_t *: + +#define JU_COPY5_LONG_TO_PINDEX(PINDEX,SOURCELONG) \ + (PINDEX)[0] = (uint8_t)((SOURCELONG) >> 32); \ + (PINDEX)[1] = (uint8_t)((SOURCELONG) >> 24); \ + (PINDEX)[2] = (uint8_t)((SOURCELONG) >> 16); \ + (PINDEX)[3] = (uint8_t)((SOURCELONG) >> 8); \ + (PINDEX)[4] = (uint8_t)((SOURCELONG)) + +// Copy a 6-byte Index pointed by a uint8_t * to a Word_t: +// +#define JU_COPY6_PINDEX_TO_LONG(DESTLONG,PINDEX) \ + DESTLONG = (Word_t)(PINDEX)[0] << 40; \ + DESTLONG += (Word_t)(PINDEX)[1] << 32; \ + DESTLONG += (Word_t)(PINDEX)[2] << 24; \ + DESTLONG += (Word_t)(PINDEX)[3] << 16; \ + DESTLONG += (Word_t)(PINDEX)[4] << 8; \ + DESTLONG += (Word_t)(PINDEX)[5] + +// Copy a Word_t to a 6-byte Index pointed at by a uint8_t *: + +#define JU_COPY6_LONG_TO_PINDEX(PINDEX,SOURCELONG) \ + (PINDEX)[0] = (uint8_t)((SOURCELONG) >> 40); \ + (PINDEX)[1] = (uint8_t)((SOURCELONG) >> 32); \ + (PINDEX)[2] = (uint8_t)((SOURCELONG) >> 24); \ + (PINDEX)[3] = (uint8_t)((SOURCELONG) >> 16); \ + (PINDEX)[4] = (uint8_t)((SOURCELONG) >> 8); \ + (PINDEX)[5] = (uint8_t)((SOURCELONG)) + +// Copy a 7-byte Index pointed by a uint8_t * to a Word_t: +// +#define JU_COPY7_PINDEX_TO_LONG(DESTLONG,PINDEX) \ + DESTLONG = (Word_t)(PINDEX)[0] << 48; \ + DESTLONG += (Word_t)(PINDEX)[1] << 40; \ + DESTLONG += (Word_t)(PINDEX)[2] << 32; \ + DESTLONG += (Word_t)(PINDEX)[3] << 24; \ + DESTLONG += (Word_t)(PINDEX)[4] << 16; \ + DESTLONG += (Word_t)(PINDEX)[5] << 8; \ + DESTLONG += (Word_t)(PINDEX)[6] + +// Copy a Word_t to a 7-byte Index pointed at by a uint8_t *: + +#define JU_COPY7_LONG_TO_PINDEX(PINDEX,SOURCELONG) \ + (PINDEX)[0] = (uint8_t)((SOURCELONG) >> 48); \ + (PINDEX)[1] = (uint8_t)((SOURCELONG) >> 40); \ + (PINDEX)[2] = (uint8_t)((SOURCELONG) >> 32); \ + (PINDEX)[3] = (uint8_t)((SOURCELONG) >> 24); \ + (PINDEX)[4] = (uint8_t)((SOURCELONG) >> 16); \ + (PINDEX)[5] = (uint8_t)((SOURCELONG) >> 8); \ + (PINDEX)[6] = (uint8_t)((SOURCELONG)) + +#endif // JU_64BIT + +// **************************************************************************** +// COMMON CODE FRAGMENTS (MACROS) +// **************************************************************************** +// +// These code chunks are shared between various source files. 
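As a round-trip check of the 3-byte (non-native) Index layout defined above, where the most significant byte is stored first regardless of host endianness, here is a standalone sketch reimplementing the two JU_COPY3_* directions (the function names are illustrative):

    #include <assert.h>
    #include <stdint.h>

    /* Pack the least 3 bytes of w, MSB first (as JU_COPY3_LONG_TO_PINDEX). */
    static void pack3(uint8_t *p, unsigned long w)
    {
        p[0] = (uint8_t)(w >> 16);
        p[1] = (uint8_t)(w >> 8);
        p[2] = (uint8_t)(w);
    }

    /* Unpack 3 bytes back into a word (as JU_COPY3_PINDEX_TO_LONG). */
    static unsigned long unpack3(const uint8_t *p)
    {
        return ((unsigned long)p[0] << 16)
             + ((unsigned long)p[1] << 8)
             +  (unsigned long)p[2];
    }

    int main(void)
    {
        uint8_t buf[3];
        pack3(buf, 0x02040103UL);            /* only 0x040103 fits in 3 bytes */
        assert(buf[0] == 0x04 && buf[1] == 0x01 && buf[2] == 0x03);
        assert(unpack3(buf) == 0x040103UL);
        return 0;
    }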
+
+
+// SET (REPLACE) ONE DIGIT IN AN INDEX:
+//
+// To avoid endian issues, use masking and ORing, which operates in a
+// big-endian register, rather than treating the Index as an array of bytes,
+// though that would be simpler, but would operate in endian-specific memory.
+//
+// TBD:  This contains two variable shifts; is that bad?
+
+#define JU_SETDIGIT(INDEX,DIGIT,STATE)                  \
+        (INDEX) = ((INDEX) & (~cJU_MASKATSTATE(STATE))) \
+                | (((Word_t) (DIGIT))                   \
+                    << (((STATE) - 1) * cJU_BITSPERBYTE))
+
+// Fast version for single LSB:
+
+#define JU_SETDIGIT1(INDEX,DIGIT) (INDEX) = ((INDEX) & ~0xff) | (DIGIT)
+
+
+// SET (REPLACE) "N" LEAST DIGITS IN AN INDEX:
+
+#define JU_SETDIGITS(INDEX,INDEX2,cSTATE)                   \
+        (INDEX) = ((INDEX ) & (~JU_LEASTBYTESMASK(cSTATE))) \
+                | ((INDEX2) & ( JU_LEASTBYTESMASK(cSTATE)))
+
+// COPY DECODE BYTES FROM JP TO INDEX:
+//
+// Modify Index digit(s) to match the bytes in jp_DcdPopO in case one or more
+// branches are skipped and the digits are significant.  It's probably faster
+// to just do this unconditionally than to check if it's necessary.
+//
+// To avoid endian issues, use masking and ORing, which operates in a
+// big-endian register, rather than treating the Index as an array of bytes,
+// though that would be simpler, but would operate in endian-specific memory.
+//
+// WARNING:  Must not call JU_LEASTBYTESMASK (via cJU_DCDMASK) with Bytes =
+// cJU_ROOTSTATE or a bad mask is generated, but there are no Dcd bytes to copy
+// in this case anyway.  In fact there are no Dcd bytes unless State <
+// cJU_ROOTSTATE - 1, so don't call this macro except in those cases.
+//
+// TBD:  It would be nice to validate jp_DcdPopO against known digits to ensure
+// no corruption, but this is non-trivial.
+
+#define JU_SETDCD(INDEX,PJP,cSTATE)                     \
+    (INDEX) = ((INDEX) & ~cJU_DCDMASK(cSTATE))          \
+                | (JU_JPDCDPOP0(PJP) & cJU_DCDMASK(cSTATE))
+
+// INSERT/DELETE AN INDEX IN-PLACE IN MEMORY:
+//
+// Given a pointer to an array of "even" (native), same-sized objects
+// (indexes), the current population of the array, an offset in the array, and
+// a new Index to insert, "shift up" the array elements (Indexes) above the
+// insertion point and insert the new Index.  Assume there is sufficient memory
+// to do this.
+//
+// In these macros, "i_offset" is an index offset, and "b_off" is a byte
+// offset for odd Index sizes.
+//
+// Note:  Endian issues only arise for insertion, not deletion, and even for
+// insertion, they are transparent when native (even) objects are used, and
+// handled explicitly for odd (non-native) Index sizes.
+//
+// Note:  The following macros are tricky enough that there is some test code
+// for them appended to this file.
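Before the macros themselves, a minimal standalone version of the shift-up-and-insert step for native (even) Indexes, mirroring what JU_INSERTINPLACE() below does; the names are illustrative, and the caller is assumed to guarantee capacity for one more element:

    #include <assert.h>
    #include <stdint.h>

    /* Insert value at position offset in a sorted array of pop1 elements,
     * shifting the tail up one slot (capacity must allow pop1 + 1). */
    static void insert_inplace(uint16_t *a, unsigned long pop1,
                               unsigned long offset, uint16_t value)
    {
        unsigned long i = pop1;
        assert(offset <= pop1);
        while (i-- > offset)
            a[i + 1] = a[i];
        a[offset] = value;
    }

    int main(void)
    {
        uint16_t a[4] = { 10, 30, 40, 0 };
        insert_inplace(a, 3, 1, 20);   /* a becomes { 10, 20, 30, 40 } */
        assert(a[0] == 10 && a[1] == 20 && a[2] == 30 && a[3] == 40);
        return 0;
    }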
+ +#define JU_INSERTINPLACE(PARRAY,POP1,OFFSET,INDEX) \ + assert((long) (POP1) > 0); \ + assert((Word_t) (OFFSET) <= (Word_t) (POP1)); \ + { \ + Word_t i_offset = (POP1); \ + \ + while (i_offset-- > (OFFSET)) \ + (PARRAY)[i_offset + 1] = (PARRAY)[i_offset]; \ + \ + (PARRAY)[OFFSET] = (INDEX); \ + } + + +// Variation for non-native Indexes, where cIS = Index Size +// and PByte must point to a uint8_t (byte); shift byte-by-byte: +// + +#define JU_INSERTINPLACE3(PBYTE,POP1,OFFSET,INDEX) \ +{ \ + Word_t i_off = POP1; \ + \ + while (i_off-- > (OFFSET)) \ + { \ + Word_t i_dx = i_off * 3; \ + (PBYTE)[i_dx + 0 + 3] = (PBYTE)[i_dx + 0]; \ + (PBYTE)[i_dx + 1 + 3] = (PBYTE)[i_dx + 1]; \ + (PBYTE)[i_dx + 2 + 3] = (PBYTE)[i_dx + 2]; \ + } \ + JU_COPY3_LONG_TO_PINDEX(&((PBYTE)[(OFFSET) * 3]), INDEX); \ +} + +#ifdef JU_64BIT + +#define JU_INSERTINPLACE5(PBYTE,POP1,OFFSET,INDEX) \ +{ \ + Word_t i_off = POP1; \ + \ + while (i_off-- > (OFFSET)) \ + { \ + Word_t i_dx = i_off * 5; \ + (PBYTE)[i_dx + 0 + 5] = (PBYTE)[i_dx + 0]; \ + (PBYTE)[i_dx + 1 + 5] = (PBYTE)[i_dx + 1]; \ + (PBYTE)[i_dx + 2 + 5] = (PBYTE)[i_dx + 2]; \ + (PBYTE)[i_dx + 3 + 5] = (PBYTE)[i_dx + 3]; \ + (PBYTE)[i_dx + 4 + 5] = (PBYTE)[i_dx + 4]; \ + } \ + JU_COPY5_LONG_TO_PINDEX(&((PBYTE)[(OFFSET) * 5]), INDEX); \ +} + +#define JU_INSERTINPLACE6(PBYTE,POP1,OFFSET,INDEX) \ +{ \ + Word_t i_off = POP1; \ + \ + while (i_off-- > (OFFSET)) \ + { \ + Word_t i_dx = i_off * 6; \ + (PBYTE)[i_dx + 0 + 6] = (PBYTE)[i_dx + 0]; \ + (PBYTE)[i_dx + 1 + 6] = (PBYTE)[i_dx + 1]; \ + (PBYTE)[i_dx + 2 + 6] = (PBYTE)[i_dx + 2]; \ + (PBYTE)[i_dx + 3 + 6] = (PBYTE)[i_dx + 3]; \ + (PBYTE)[i_dx + 4 + 6] = (PBYTE)[i_dx + 4]; \ + (PBYTE)[i_dx + 5 + 6] = (PBYTE)[i_dx + 5]; \ + } \ + JU_COPY6_LONG_TO_PINDEX(&((PBYTE)[(OFFSET) * 6]), INDEX); \ +} + +#define JU_INSERTINPLACE7(PBYTE,POP1,OFFSET,INDEX) \ +{ \ + Word_t i_off = POP1; \ + \ + while (i_off-- > (OFFSET)) \ + { \ + Word_t i_dx = i_off * 7; \ + (PBYTE)[i_dx + 0 + 7] = (PBYTE)[i_dx + 0]; \ + (PBYTE)[i_dx + 1 + 7] = (PBYTE)[i_dx + 1]; \ + (PBYTE)[i_dx + 2 + 7] = (PBYTE)[i_dx + 2]; \ + (PBYTE)[i_dx + 3 + 7] = (PBYTE)[i_dx + 3]; \ + (PBYTE)[i_dx + 4 + 7] = (PBYTE)[i_dx + 4]; \ + (PBYTE)[i_dx + 5 + 7] = (PBYTE)[i_dx + 5]; \ + (PBYTE)[i_dx + 6 + 7] = (PBYTE)[i_dx + 6]; \ + } \ + JU_COPY7_LONG_TO_PINDEX(&((PBYTE)[(OFFSET) * 7]), INDEX); \ +} +#endif // JU_64BIT + +// Counterparts to the above for deleting an Index: +// +// "Shift down" the array elements starting at the Index to be deleted. + +#define JU_DELETEINPLACE(PARRAY,POP1,OFFSET,IGNORE) \ + assert((long) (POP1) > 0); \ + assert((Word_t) (OFFSET) < (Word_t) (POP1)); \ + { \ + Word_t i_offset = (OFFSET); \ + \ + while (++i_offset < (POP1)) \ + (PARRAY)[i_offset - 1] = (PARRAY)[i_offset]; \ + } + +// Variation for odd-byte-sized (non-native) Indexes, where cIS = Index Size +// and PByte must point to a uint8_t (byte); copy byte-by-byte: +// +// Note: If cIS == 1, JU_DELETEINPLACE_ODD == JU_DELETEINPLACE. +// +// Note: There are no endian issues here because bytes are just shifted as-is, +// not converted to/from an Index. + +#define JU_DELETEINPLACE_ODD(PBYTE,POP1,OFFSET,cIS) \ + assert((long) (POP1) > 0); \ + assert((Word_t) (OFFSET) < (Word_t) (POP1)); \ + { \ + Word_t b_off = (((OFFSET) + 1) * (cIS)) - 1; \ + \ + while (++b_off < ((POP1) * (cIS))) \ + (PBYTE)[b_off - (cIS)] = (PBYTE)[b_off]; \ + } + + +// INSERT/DELETE AN INDEX WHILE COPYING OTHERS: +// +// Copy PSource[] to PDest[], where PSource[] has Pop1 elements (Indexes), +// inserting Index at PDest[Offset]. 
Unlike JU_*INPLACE*() above, these macros +// are used when moving Indexes from one memory object to another. + +#define JU_INSERTCOPY(PDEST,PSOURCE,POP1,OFFSET,INDEX) \ + assert((long) (POP1) > 0); \ + assert((Word_t) (OFFSET) <= (Word_t) (POP1)); \ + { \ + Word_t i_offset; \ + \ + for (i_offset = 0; i_offset < (OFFSET); ++i_offset) \ + (PDEST)[i_offset] = (PSOURCE)[i_offset]; \ + \ + (PDEST)[i_offset] = (INDEX); \ + \ + for (/* null */; i_offset < (POP1); ++i_offset) \ + (PDEST)[i_offset + 1] = (PSOURCE)[i_offset]; \ + } + +#define JU_INSERTCOPY3(PDEST,PSOURCE,POP1,OFFSET,INDEX) \ +assert((long) (POP1) > 0); \ +assert((Word_t) (OFFSET) <= (Word_t) (POP1)); \ +{ \ + Word_t o_ff; \ + \ + for (o_ff = 0; o_ff < (OFFSET); o_ff++) \ + { \ + Word_t i_dx = o_ff * 3; \ + (PDEST)[i_dx + 0] = (PSOURCE)[i_dx + 0]; \ + (PDEST)[i_dx + 1] = (PSOURCE)[i_dx + 1]; \ + (PDEST)[i_dx + 2] = (PSOURCE)[i_dx + 2]; \ + } \ + JU_COPY3_LONG_TO_PINDEX(&((PDEST)[(OFFSET) * 3]), INDEX); \ + \ + for (/* null */; o_ff < (POP1); o_ff++) \ + { \ + Word_t i_dx = o_ff * 3; \ + (PDEST)[i_dx + 0 + 3] = (PSOURCE)[i_dx + 0]; \ + (PDEST)[i_dx + 1 + 3] = (PSOURCE)[i_dx + 1]; \ + (PDEST)[i_dx + 2 + 3] = (PSOURCE)[i_dx + 2]; \ + } \ +} + +#ifdef JU_64BIT + +#define JU_INSERTCOPY5(PDEST,PSOURCE,POP1,OFFSET,INDEX) \ +assert((long) (POP1) > 0); \ +assert((Word_t) (OFFSET) <= (Word_t) (POP1)); \ +{ \ + Word_t o_ff; \ + \ + for (o_ff = 0; o_ff < (OFFSET); o_ff++) \ + { \ + Word_t i_dx = o_ff * 5; \ + (PDEST)[i_dx + 0] = (PSOURCE)[i_dx + 0]; \ + (PDEST)[i_dx + 1] = (PSOURCE)[i_dx + 1]; \ + (PDEST)[i_dx + 2] = (PSOURCE)[i_dx + 2]; \ + (PDEST)[i_dx + 3] = (PSOURCE)[i_dx + 3]; \ + (PDEST)[i_dx + 4] = (PSOURCE)[i_dx + 4]; \ + } \ + JU_COPY5_LONG_TO_PINDEX(&((PDEST)[(OFFSET) * 5]), INDEX); \ + \ + for (/* null */; o_ff < (POP1); o_ff++) \ + { \ + Word_t i_dx = o_ff * 5; \ + (PDEST)[i_dx + 0 + 5] = (PSOURCE)[i_dx + 0]; \ + (PDEST)[i_dx + 1 + 5] = (PSOURCE)[i_dx + 1]; \ + (PDEST)[i_dx + 2 + 5] = (PSOURCE)[i_dx + 2]; \ + (PDEST)[i_dx + 3 + 5] = (PSOURCE)[i_dx + 3]; \ + (PDEST)[i_dx + 4 + 5] = (PSOURCE)[i_dx + 4]; \ + } \ +} + +#define JU_INSERTCOPY6(PDEST,PSOURCE,POP1,OFFSET,INDEX) \ +assert((long) (POP1) > 0); \ +assert((Word_t) (OFFSET) <= (Word_t) (POP1)); \ +{ \ + Word_t o_ff; \ + \ + for (o_ff = 0; o_ff < (OFFSET); o_ff++) \ + { \ + Word_t i_dx = o_ff * 6; \ + (PDEST)[i_dx + 0] = (PSOURCE)[i_dx + 0]; \ + (PDEST)[i_dx + 1] = (PSOURCE)[i_dx + 1]; \ + (PDEST)[i_dx + 2] = (PSOURCE)[i_dx + 2]; \ + (PDEST)[i_dx + 3] = (PSOURCE)[i_dx + 3]; \ + (PDEST)[i_dx + 4] = (PSOURCE)[i_dx + 4]; \ + (PDEST)[i_dx + 5] = (PSOURCE)[i_dx + 5]; \ + } \ + JU_COPY6_LONG_TO_PINDEX(&((PDEST)[(OFFSET) * 6]), INDEX); \ + \ + for (/* null */; o_ff < (POP1); o_ff++) \ + { \ + Word_t i_dx = o_ff * 6; \ + (PDEST)[i_dx + 0 + 6] = (PSOURCE)[i_dx + 0]; \ + (PDEST)[i_dx + 1 + 6] = (PSOURCE)[i_dx + 1]; \ + (PDEST)[i_dx + 2 + 6] = (PSOURCE)[i_dx + 2]; \ + (PDEST)[i_dx + 3 + 6] = (PSOURCE)[i_dx + 3]; \ + (PDEST)[i_dx + 4 + 6] = (PSOURCE)[i_dx + 4]; \ + (PDEST)[i_dx + 5 + 6] = (PSOURCE)[i_dx + 5]; \ + } \ +} + +#define JU_INSERTCOPY7(PDEST,PSOURCE,POP1,OFFSET,INDEX) \ +assert((long) (POP1) > 0); \ +assert((Word_t) (OFFSET) <= (Word_t) (POP1)); \ +{ \ + Word_t o_ff; \ + \ + for (o_ff = 0; o_ff < (OFFSET); o_ff++) \ + { \ + Word_t i_dx = o_ff * 7; \ + (PDEST)[i_dx + 0] = (PSOURCE)[i_dx + 0]; \ + (PDEST)[i_dx + 1] = (PSOURCE)[i_dx + 1]; \ + (PDEST)[i_dx + 2] = (PSOURCE)[i_dx + 2]; \ + (PDEST)[i_dx + 3] = (PSOURCE)[i_dx + 3]; \ + (PDEST)[i_dx + 4] = (PSOURCE)[i_dx + 4]; \ + 
(PDEST)[i_dx + 5] = (PSOURCE)[i_dx + 5]; \ + (PDEST)[i_dx + 6] = (PSOURCE)[i_dx + 6]; \ + } \ + JU_COPY7_LONG_TO_PINDEX(&((PDEST)[(OFFSET) * 7]), INDEX); \ + \ + for (/* null */; o_ff < (POP1); o_ff++) \ + { \ + Word_t i_dx = o_ff * 7; \ + (PDEST)[i_dx + 0 + 7] = (PSOURCE)[i_dx + 0]; \ + (PDEST)[i_dx + 1 + 7] = (PSOURCE)[i_dx + 1]; \ + (PDEST)[i_dx + 2 + 7] = (PSOURCE)[i_dx + 2]; \ + (PDEST)[i_dx + 3 + 7] = (PSOURCE)[i_dx + 3]; \ + (PDEST)[i_dx + 4 + 7] = (PSOURCE)[i_dx + 4]; \ + (PDEST)[i_dx + 5 + 7] = (PSOURCE)[i_dx + 5]; \ + (PDEST)[i_dx + 6 + 7] = (PSOURCE)[i_dx + 6]; \ + } \ +} + +#endif // JU_64BIT + +// Counterparts to the above for deleting an Index: + +#define JU_DELETECOPY(PDEST,PSOURCE,POP1,OFFSET,IGNORE) \ + assert((long) (POP1) > 0); \ + assert((Word_t) (OFFSET) < (Word_t) (POP1)); \ + { \ + Word_t i_offset; \ + \ + for (i_offset = 0; i_offset < (OFFSET); ++i_offset) \ + (PDEST)[i_offset] = (PSOURCE)[i_offset]; \ + \ + for (++i_offset; i_offset < (POP1); ++i_offset) \ + (PDEST)[i_offset - 1] = (PSOURCE)[i_offset]; \ + } + +// Variation for odd-byte-sized (non-native) Indexes, where cIS = Index Size; +// copy byte-by-byte: +// +// Note: There are no endian issues here because bytes are just shifted as-is, +// not converted to/from an Index. +// +// Note: If cIS == 1, JU_DELETECOPY_ODD == JU_DELETECOPY, at least in concept. + +#define JU_DELETECOPY_ODD(PDEST,PSOURCE,POP1,OFFSET,cIS) \ + assert((long) (POP1) > 0); \ + assert((Word_t) (OFFSET) < (Word_t) (POP1)); \ + { \ + uint8_t *_Pdest = (uint8_t *) (PDEST); \ + uint8_t *_Psource = (uint8_t *) (PSOURCE); \ + Word_t b_off; \ + \ + for (b_off = 0; b_off < ((OFFSET) * (cIS)); ++b_off) \ + *_Pdest++ = *_Psource++; \ + \ + _Psource += (cIS); \ + \ + for (b_off += (cIS); b_off < ((POP1) * (cIS)); ++b_off) \ + *_Pdest++ = *_Psource++; \ + } + + +// GENERIC RETURN CODE HANDLING FOR JUDY1 (NO VALUE AREAS) AND JUDYL (VALUE +// AREAS): +// +// This common code hides Judy1 versus JudyL details of how to return various +// conditions, including a pointer to a value area for JudyL. +// +// First, define an internal variation of JERR called JERRI (I = int) to make +// lint happy. We accidentally shipped to 11.11 OEUR with all functions that +// return int or Word_t using JERR, which is type Word_t, for errors. Lint +// complains about this for functions that return int. So, internally use +// JERRI for error returns from the int functions. Experiments show that +// callers which compare int Foo() to (Word_t) JERR (~0UL) are OK, since JERRI +// sign-extends to match JERR. + +#define JERRI ((int) ~0) // see above. 
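The sign-extension claim above, that an int JERRI compares equal to the Word_t JERR after widening, can be verified with a two-line standalone check (assuming the usual unsigned long Word_t):

    #include <assert.h>

    typedef unsigned long Word_t;   /* as in Judy.h, for this sketch only */

    int main(void)
    {
        int    jerri = (int) ~0;    /* JERRI: int -1        */
        Word_t jerr  = ~0UL;        /* JERR:  all-ones word */

        /* Widening the int -1 to Word_t sign-extends to all-ones: */
        assert((Word_t) jerri == jerr);
        return 0;
    }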
+ +#ifdef JUDY1 + +#define JU_RET_FOUND return(1) +#define JU_RET_NOTFOUND return(0) + +// For Judy1, these all "fall through" to simply JU_RET_FOUND, since there is no +// value area pointer to return: + +#define JU_RET_FOUND_LEAFW(PJLW,POP1,OFFSET) JU_RET_FOUND + +#define JU_RET_FOUND_JPM(Pjpm) JU_RET_FOUND +#define JU_RET_FOUND_PVALUE(Pjv,OFFSET) JU_RET_FOUND +#ifndef JU_64BIT +#define JU_RET_FOUND_LEAF1(Pjll,POP1,OFFSET) JU_RET_FOUND +#endif +#define JU_RET_FOUND_LEAF2(Pjll,POP1,OFFSET) JU_RET_FOUND +#define JU_RET_FOUND_LEAF3(Pjll,POP1,OFFSET) JU_RET_FOUND +#ifdef JU_64BIT +#define JU_RET_FOUND_LEAF4(Pjll,POP1,OFFSET) JU_RET_FOUND +#define JU_RET_FOUND_LEAF5(Pjll,POP1,OFFSET) JU_RET_FOUND +#define JU_RET_FOUND_LEAF6(Pjll,POP1,OFFSET) JU_RET_FOUND +#define JU_RET_FOUND_LEAF7(Pjll,POP1,OFFSET) JU_RET_FOUND +#endif +#define JU_RET_FOUND_IMM_01(Pjp) JU_RET_FOUND +#define JU_RET_FOUND_IMM(Pjp,OFFSET) JU_RET_FOUND + +// Note: No JudyL equivalent: + +#define JU_RET_FOUND_FULLPOPU1 JU_RET_FOUND +#define JU_RET_FOUND_LEAF_B1(PJLB,SUBEXP,OFFSET) JU_RET_FOUND + +#else // JUDYL + +// JU_RET_FOUND // see below; must NOT be defined for JudyL. +#define JU_RET_NOTFOUND return((PPvoid_t) NULL) + +// For JudyL, the location of the value area depends on the JP type and other +// factors: +// +// TBD: The value areas should be accessed via data structures, here and in +// Dougs code, not by hard-coded address calculations. +// +// This is useful in insert/delete code when the value area is returned from +// lower levels in the JPM: + +#define JU_RET_FOUND_JPM(Pjpm) return((PPvoid_t) ((Pjpm)->jpm_PValue)) + +// This is useful in insert/delete code when the value area location is already +// computed: + +#define JU_RET_FOUND_PVALUE(Pjv,OFFSET) return((PPvoid_t) ((Pjv) + OFFSET)) + +#define JU_RET_FOUND_LEAFW(PJLW,POP1,OFFSET) \ + return((PPvoid_t) (JL_LEAFWVALUEAREA(PJLW, POP1) + (OFFSET))) + +#define JU_RET_FOUND_LEAF1(Pjll,POP1,OFFSET) \ + return((PPvoid_t) (JL_LEAF1VALUEAREA(Pjll, POP1) + (OFFSET))) +#define JU_RET_FOUND_LEAF2(Pjll,POP1,OFFSET) \ + return((PPvoid_t) (JL_LEAF2VALUEAREA(Pjll, POP1) + (OFFSET))) +#define JU_RET_FOUND_LEAF3(Pjll,POP1,OFFSET) \ + return((PPvoid_t) (JL_LEAF3VALUEAREA(Pjll, POP1) + (OFFSET))) +#ifdef JU_64BIT +#define JU_RET_FOUND_LEAF4(Pjll,POP1,OFFSET) \ + return((PPvoid_t) (JL_LEAF4VALUEAREA(Pjll, POP1) + (OFFSET))) +#define JU_RET_FOUND_LEAF5(Pjll,POP1,OFFSET) \ + return((PPvoid_t) (JL_LEAF5VALUEAREA(Pjll, POP1) + (OFFSET))) +#define JU_RET_FOUND_LEAF6(Pjll,POP1,OFFSET) \ + return((PPvoid_t) (JL_LEAF6VALUEAREA(Pjll, POP1) + (OFFSET))) +#define JU_RET_FOUND_LEAF7(Pjll,POP1,OFFSET) \ + return((PPvoid_t) (JL_LEAF7VALUEAREA(Pjll, POP1) + (OFFSET))) +#endif + +// Note: Here jp_Addr is a value area itself and not an address, so P_JV() is +// not needed: + +#define JU_RET_FOUND_IMM_01(PJP) return((PPvoid_t) (&((PJP)->jp_Addr))) + +// Note: Here jp_Addr is a pointer to a separately-mallocd value area, so +// P_JV() is required; likewise for JL_JLB_PVALUE: + +#define JU_RET_FOUND_IMM(PJP,OFFSET) \ + return((PPvoid_t) (P_JV((PJP)->jp_Addr) + (OFFSET))) + +#define JU_RET_FOUND_LEAF_B1(PJLB,SUBEXP,OFFSET) \ + return((PPvoid_t) (P_JV(JL_JLB_PVALUE(PJLB, SUBEXP)) + (OFFSET))) + +#endif // JUDYL + + +// GENERIC ERROR HANDLING: +// +// This is complicated by variations in the needs of the callers of these +// macros. 
Only use JU_SET_ERRNO() for PJError, because it can be null; use +// JU_SET_ERRNO_NONNULL() for Pjpm, which is never null, and also in other +// cases where the pointer is known not to be null (to save dead branches). +// +// Note: Most cases of JU_ERRNO_OVERRUN or JU_ERRNO_CORRUPT should result in +// an assertion failure in debug code, so they are more likely to be caught, so +// do that here in each macro. + +#define JU_SET_ERRNO(PJError, JErrno) \ + { \ + assert((JErrno) != JU_ERRNO_OVERRUN); \ + assert((JErrno) != JU_ERRNO_CORRUPT); \ + \ + if (PJError != (PJError_t) NULL) \ + { \ + JU_ERRNO(PJError) = (JErrno); \ + JU_ERRID(PJError) = __LINE__; \ + } \ + } + +// Variation for callers who know already that PJError is non-null; and, it can +// also be Pjpm (both PJError_t and Pjpm_t have je_* fields), so only assert it +// for null, not cast to any specific pointer type: + +#define JU_SET_ERRNO_NONNULL(PJError, JErrno) \ + { \ + assert((JErrno) != JU_ERRNO_OVERRUN); \ + assert((JErrno) != JU_ERRNO_CORRUPT); \ + assert(PJError); \ + \ + JU_ERRNO(PJError) = (JErrno); \ + JU_ERRID(PJError) = __LINE__; \ + } + +// Variation to copy error info from a (required) JPM to an (optional) +// PJError_t: +// +// Note: The assertions above about JU_ERRNO_OVERRUN and JU_ERRNO_CORRUPT +// should have already popped, so they are not needed here. + +#define JU_COPY_ERRNO(PJError, Pjpm) \ + { \ + if (PJError) \ + { \ + JU_ERRNO(PJError) = (uint8_t)JU_ERRNO(Pjpm); \ + JU_ERRID(PJError) = JU_ERRID(Pjpm); \ + } \ + } + +// For JErrno parameter to previous macros upon return from Judy*Alloc*(): +// +// The memory allocator returns an address of 0 for out of memory, +// 1..sizeof(Word_t)-1 for corruption (an invalid pointer), otherwise a valid +// pointer. + +#define JU_ALLOC_ERRNO(ADDR) \ + (((void *) (ADDR) != (void *) NULL) ? 
JU_ERRNO_OVERRUN : JU_ERRNO_NOMEM)
+
+#define JU_CHECKALLOC(Type,Ptr,Retval)                  \
+        if ((Ptr) < (Type) sizeof(Word_t))              \
+        {                                               \
+            JU_SET_ERRNO(PJError, JU_ALLOC_ERRNO(Ptr)); \
+            return(Retval);                             \
+        }
+
+// Leaf search routines
+
+#ifdef JU_NOINLINE
+
+int j__udySearchLeaf1(Pjll_t Pjll, Word_t LeafPop1, Word_t Index);
+int j__udySearchLeaf2(Pjll_t Pjll, Word_t LeafPop1, Word_t Index);
+int j__udySearchLeaf3(Pjll_t Pjll, Word_t LeafPop1, Word_t Index);
+
+#ifdef JU_64BIT
+
+int j__udySearchLeaf4(Pjll_t Pjll, Word_t LeafPop1, Word_t Index);
+int j__udySearchLeaf5(Pjll_t Pjll, Word_t LeafPop1, Word_t Index);
+int j__udySearchLeaf6(Pjll_t Pjll, Word_t LeafPop1, Word_t Index);
+int j__udySearchLeaf7(Pjll_t Pjll, Word_t LeafPop1, Word_t Index);
+
+#endif // JU_64BIT
+
+int j__udySearchLeafW(Pjlw_t Pjlw, Word_t LeafPop1, Word_t Index);
+
+#else // compiler support for inline
+
+#ifdef JU_WIN
+static __inline int j__udySearchLeaf1(Pjll_t Pjll, Word_t LeafPop1, Word_t Index)
+#else
+static inline int j__udySearchLeaf1(Pjll_t Pjll, Word_t LeafPop1, Word_t Index)
+#endif
+{ SEARCHLEAFNATIVE(uint8_t, Pjll, LeafPop1, Index); }
+
+#ifdef JU_WIN
+static __inline int j__udySearchLeaf2(Pjll_t Pjll, Word_t LeafPop1, Word_t Index)
+#else
+static inline int j__udySearchLeaf2(Pjll_t Pjll, Word_t LeafPop1, Word_t Index)
+#endif
+{ SEARCHLEAFNATIVE(uint16_t, Pjll, LeafPop1, Index); }
+
+#ifdef JU_WIN
+static __inline int j__udySearchLeaf3(Pjll_t Pjll, Word_t LeafPop1, Word_t Index)
+#else
+static inline int j__udySearchLeaf3(Pjll_t Pjll, Word_t LeafPop1, Word_t Index)
+#endif
+{ SEARCHLEAFNONNAT(Pjll, LeafPop1, Index, 3, JU_COPY3_PINDEX_TO_LONG); }
+
+#ifdef JU_64BIT
+
+#ifdef JU_WIN
+static __inline int j__udySearchLeaf4(Pjll_t Pjll, Word_t LeafPop1, Word_t Index)
+#else
+static inline int j__udySearchLeaf4(Pjll_t Pjll, Word_t LeafPop1, Word_t Index)
+#endif
+{ SEARCHLEAFNATIVE(uint32_t, Pjll, LeafPop1, Index); }
+
+#ifdef JU_WIN
+static __inline int j__udySearchLeaf5(Pjll_t Pjll, Word_t LeafPop1, Word_t Index)
+#else
+static inline int j__udySearchLeaf5(Pjll_t Pjll, Word_t LeafPop1, Word_t Index)
+#endif
+{ SEARCHLEAFNONNAT(Pjll, LeafPop1, Index, 5, JU_COPY5_PINDEX_TO_LONG); }
+
+#ifdef JU_WIN
+static __inline int j__udySearchLeaf6(Pjll_t Pjll, Word_t LeafPop1, Word_t Index)
+#else
+static inline int j__udySearchLeaf6(Pjll_t Pjll, Word_t LeafPop1, Word_t Index)
+#endif
+{ SEARCHLEAFNONNAT(Pjll, LeafPop1, Index, 6, JU_COPY6_PINDEX_TO_LONG); }
+
+#ifdef JU_WIN
+static __inline int j__udySearchLeaf7(Pjll_t Pjll, Word_t LeafPop1, Word_t Index)
+#else
+static inline int j__udySearchLeaf7(Pjll_t Pjll, Word_t LeafPop1, Word_t Index)
+#endif
+{ SEARCHLEAFNONNAT(Pjll, LeafPop1, Index, 7, JU_COPY7_PINDEX_TO_LONG); }
+
+#endif // JU_64BIT
+
+#ifdef JU_WIN
+static __inline int j__udySearchLeafW(Pjlw_t Pjlw, Word_t LeafPop1, Word_t Index)
+#else
+static inline int j__udySearchLeafW(Pjlw_t Pjlw, Word_t LeafPop1, Word_t Index)
+#endif
+{ SEARCHLEAFNATIVE(Word_t, Pjlw, LeafPop1, Index); }
+
+#endif // compiler support for inline
+
+#endif // !
_JUDYPRIVATE_INCLUDED diff --git a/src/libnetdata/libjudy/src/JudyCommon/JudyPrivate1L.h b/src/libnetdata/libjudy/src/JudyCommon/JudyPrivate1L.h new file mode 100644 index 00000000..5b470489 --- /dev/null +++ b/src/libnetdata/libjudy/src/JudyCommon/JudyPrivate1L.h @@ -0,0 +1,485 @@ +#ifndef _JUDYPRIVATE1L_INCLUDED +#define _JUDYPRIVATE1L_INCLUDED +// _________________ +// +// Copyright (C) 2000 - 2002 Hewlett-Packard Company +// +// This program is free software; you can redistribute it and/or modify it +// under the term of the GNU Lesser General Public License as published by the +// Free Software Foundation; either version 2 of the License, or (at your +// option) any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License +// for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this program; if not, write to the Free Software Foundation, +// Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// _________________ + +// @(#) $Revision: 4.31 $ $Source: /judy/src/JudyCommon/JudyPrivate1L.h $ + +// **************************************************************************** +// Declare common cJU_* names for JP Types that occur in both Judy1 and JudyL, +// for use by code that ifdefs JUDY1 and JUDYL. Only JP Types common to both +// Judy1 and JudyL are #defined here with equivalent cJU_* names. JP Types +// unique to only Judy1 or JudyL are listed in comments, so the type lists +// match the Judy1.h and JudyL.h files. +// +// This file also defines cJU_* for other JP-related constants and functions +// that some shared JUDY1/JUDYL code finds handy. +// +// At least in principle this file should be included AFTER Judy1.h or JudyL.h. +// +// WARNING: This file must be kept consistent with the enums in Judy1.h and +// JudyL.h. +// +// TBD: You might think, why not define common cJU_* enums in, say, +// JudyPrivate.h, and then inherit them into superset enums in Judy1.h and +// JudyL.h? The problem is that the enum lists for each class (cJ1_* and +// cJL_*) must be numerically "packed" into the correct order, for two reasons: +// (1) allow the compiler to generate "tight" switch statements with no wasted +// slots (although this is not very big), and (2) allow calculations using the +// enum values, although this is also not an issue if the calculations are only +// within each cJ*_JPIMMED_*_* class and the members are packed within the +// class. 
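The renaming scheme this header implements reduces to the following pattern: shared code mentions only the common cJU_* name, and each compile of a class maps that name onto the class's own member. A toy sketch, with made-up enum values rather than the real type lists:

    /* Compile once with -DJUDY1 and once without (i.e., as JUDYL); the
     * "shared" code below never names the class directly. */
    #include <stdio.h>

    #ifdef JUDY1
    enum { cJ1_JPNULL1 = 1 };
    #define cJU_JPNULL1 cJ1_JPNULL1
    #else /* JUDYL */
    enum { cJL_JPNULL1 = 1 };
    #define cJU_JPNULL1 cJL_JPNULL1
    #endif

    int main(void)
    {
        printf("cJU_JPNULL1 = %d\n", (int) cJU_JPNULL1);
        return 0;
    }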
+ +#ifdef JUDY1 + +#define cJU_JRPNULL cJ1_JRPNULL +#define cJU_JPNULL1 cJ1_JPNULL1 +#define cJU_JPNULL2 cJ1_JPNULL2 +#define cJU_JPNULL3 cJ1_JPNULL3 +#ifdef JU_64BIT +#define cJU_JPNULL4 cJ1_JPNULL4 +#define cJU_JPNULL5 cJ1_JPNULL5 +#define cJU_JPNULL6 cJ1_JPNULL6 +#define cJU_JPNULL7 cJ1_JPNULL7 +#endif +#define cJU_JPNULLMAX cJ1_JPNULLMAX +#define cJU_JPBRANCH_L2 cJ1_JPBRANCH_L2 +#define cJU_JPBRANCH_L3 cJ1_JPBRANCH_L3 +#ifdef JU_64BIT +#define cJU_JPBRANCH_L4 cJ1_JPBRANCH_L4 +#define cJU_JPBRANCH_L5 cJ1_JPBRANCH_L5 +#define cJU_JPBRANCH_L6 cJ1_JPBRANCH_L6 +#define cJU_JPBRANCH_L7 cJ1_JPBRANCH_L7 +#endif +#define cJU_JPBRANCH_L cJ1_JPBRANCH_L +#define j__U_BranchBJPPopToWords j__1_BranchBJPPopToWords +#define cJU_JPBRANCH_B2 cJ1_JPBRANCH_B2 +#define cJU_JPBRANCH_B3 cJ1_JPBRANCH_B3 +#ifdef JU_64BIT +#define cJU_JPBRANCH_B4 cJ1_JPBRANCH_B4 +#define cJU_JPBRANCH_B5 cJ1_JPBRANCH_B5 +#define cJU_JPBRANCH_B6 cJ1_JPBRANCH_B6 +#define cJU_JPBRANCH_B7 cJ1_JPBRANCH_B7 +#endif +#define cJU_JPBRANCH_B cJ1_JPBRANCH_B +#define cJU_JPBRANCH_U2 cJ1_JPBRANCH_U2 +#define cJU_JPBRANCH_U3 cJ1_JPBRANCH_U3 +#ifdef JU_64BIT +#define cJU_JPBRANCH_U4 cJ1_JPBRANCH_U4 +#define cJU_JPBRANCH_U5 cJ1_JPBRANCH_U5 +#define cJU_JPBRANCH_U6 cJ1_JPBRANCH_U6 +#define cJU_JPBRANCH_U7 cJ1_JPBRANCH_U7 +#endif +#define cJU_JPBRANCH_U cJ1_JPBRANCH_U +#ifndef JU_64BIT +#define cJU_JPLEAF1 cJ1_JPLEAF1 +#endif +#define cJU_JPLEAF2 cJ1_JPLEAF2 +#define cJU_JPLEAF3 cJ1_JPLEAF3 +#ifdef JU_64BIT +#define cJU_JPLEAF4 cJ1_JPLEAF4 +#define cJU_JPLEAF5 cJ1_JPLEAF5 +#define cJU_JPLEAF6 cJ1_JPLEAF6 +#define cJU_JPLEAF7 cJ1_JPLEAF7 +#endif +#define cJU_JPLEAF_B1 cJ1_JPLEAF_B1 +// cJ1_JPFULLPOPU1 +#define cJU_JPIMMED_1_01 cJ1_JPIMMED_1_01 +#define cJU_JPIMMED_2_01 cJ1_JPIMMED_2_01 +#define cJU_JPIMMED_3_01 cJ1_JPIMMED_3_01 +#ifdef JU_64BIT +#define cJU_JPIMMED_4_01 cJ1_JPIMMED_4_01 +#define cJU_JPIMMED_5_01 cJ1_JPIMMED_5_01 +#define cJU_JPIMMED_6_01 cJ1_JPIMMED_6_01 +#define cJU_JPIMMED_7_01 cJ1_JPIMMED_7_01 +#endif +#define cJU_JPIMMED_1_02 cJ1_JPIMMED_1_02 +#define cJU_JPIMMED_1_03 cJ1_JPIMMED_1_03 +#define cJU_JPIMMED_1_04 cJ1_JPIMMED_1_04 +#define cJU_JPIMMED_1_05 cJ1_JPIMMED_1_05 +#define cJU_JPIMMED_1_06 cJ1_JPIMMED_1_06 +#define cJU_JPIMMED_1_07 cJ1_JPIMMED_1_07 +#ifdef JU_64BIT +// cJ1_JPIMMED_1_08 +// cJ1_JPIMMED_1_09 +// cJ1_JPIMMED_1_10 +// cJ1_JPIMMED_1_11 +// cJ1_JPIMMED_1_12 +// cJ1_JPIMMED_1_13 +// cJ1_JPIMMED_1_14 +// cJ1_JPIMMED_1_15 +#endif +#define cJU_JPIMMED_2_02 cJ1_JPIMMED_2_02 +#define cJU_JPIMMED_2_03 cJ1_JPIMMED_2_03 +#ifdef JU_64BIT +// cJ1_JPIMMED_2_04 +// cJ1_JPIMMED_2_05 +// cJ1_JPIMMED_2_06 +// cJ1_JPIMMED_2_07 +#endif +#define cJU_JPIMMED_3_02 cJ1_JPIMMED_3_02 +#ifdef JU_64BIT +// cJ1_JPIMMED_3_03 +// cJ1_JPIMMED_3_04 +// cJ1_JPIMMED_3_05 +// cJ1_JPIMMED_4_02 +// cJ1_JPIMMED_4_03 +// cJ1_JPIMMED_5_02 +// cJ1_JPIMMED_5_03 +// cJ1_JPIMMED_6_02 +// cJ1_JPIMMED_7_02 +#endif +#define cJU_JPIMMED_CAP cJ1_JPIMMED_CAP + +#else // JUDYL **************************************************************** + +#define cJU_JRPNULL cJL_JRPNULL +#define cJU_JPNULL1 cJL_JPNULL1 +#define cJU_JPNULL2 cJL_JPNULL2 +#define cJU_JPNULL3 cJL_JPNULL3 +#ifdef JU_64BIT +#define cJU_JPNULL4 cJL_JPNULL4 +#define cJU_JPNULL5 cJL_JPNULL5 +#define cJU_JPNULL6 cJL_JPNULL6 +#define cJU_JPNULL7 cJL_JPNULL7 +#endif +#define cJU_JPNULLMAX cJL_JPNULLMAX +#define cJU_JPBRANCH_L2 cJL_JPBRANCH_L2 +#define cJU_JPBRANCH_L3 cJL_JPBRANCH_L3 +#ifdef JU_64BIT +#define cJU_JPBRANCH_L4 cJL_JPBRANCH_L4 +#define cJU_JPBRANCH_L5 cJL_JPBRANCH_L5 +#define 
cJU_JPBRANCH_L6 cJL_JPBRANCH_L6 +#define cJU_JPBRANCH_L7 cJL_JPBRANCH_L7 +#endif +#define cJU_JPBRANCH_L cJL_JPBRANCH_L +#define j__U_BranchBJPPopToWords j__L_BranchBJPPopToWords +#define cJU_JPBRANCH_B2 cJL_JPBRANCH_B2 +#define cJU_JPBRANCH_B3 cJL_JPBRANCH_B3 +#ifdef JU_64BIT +#define cJU_JPBRANCH_B4 cJL_JPBRANCH_B4 +#define cJU_JPBRANCH_B5 cJL_JPBRANCH_B5 +#define cJU_JPBRANCH_B6 cJL_JPBRANCH_B6 +#define cJU_JPBRANCH_B7 cJL_JPBRANCH_B7 +#endif +#define cJU_JPBRANCH_B cJL_JPBRANCH_B +#define cJU_JPBRANCH_U2 cJL_JPBRANCH_U2 +#define cJU_JPBRANCH_U3 cJL_JPBRANCH_U3 +#ifdef JU_64BIT +#define cJU_JPBRANCH_U4 cJL_JPBRANCH_U4 +#define cJU_JPBRANCH_U5 cJL_JPBRANCH_U5 +#define cJU_JPBRANCH_U6 cJL_JPBRANCH_U6 +#define cJU_JPBRANCH_U7 cJL_JPBRANCH_U7 +#endif +#define cJU_JPBRANCH_U cJL_JPBRANCH_U +#define cJU_JPLEAF1 cJL_JPLEAF1 +#define cJU_JPLEAF2 cJL_JPLEAF2 +#define cJU_JPLEAF3 cJL_JPLEAF3 +#ifdef JU_64BIT +#define cJU_JPLEAF4 cJL_JPLEAF4 +#define cJU_JPLEAF5 cJL_JPLEAF5 +#define cJU_JPLEAF6 cJL_JPLEAF6 +#define cJU_JPLEAF7 cJL_JPLEAF7 +#endif +#define cJU_JPLEAF_B1 cJL_JPLEAF_B1 +#define cJU_JPIMMED_1_01 cJL_JPIMMED_1_01 +#define cJU_JPIMMED_2_01 cJL_JPIMMED_2_01 +#define cJU_JPIMMED_3_01 cJL_JPIMMED_3_01 +#ifdef JU_64BIT +#define cJU_JPIMMED_4_01 cJL_JPIMMED_4_01 +#define cJU_JPIMMED_5_01 cJL_JPIMMED_5_01 +#define cJU_JPIMMED_6_01 cJL_JPIMMED_6_01 +#define cJU_JPIMMED_7_01 cJL_JPIMMED_7_01 +#endif +#define cJU_JPIMMED_1_02 cJL_JPIMMED_1_02 +#define cJU_JPIMMED_1_03 cJL_JPIMMED_1_03 +#ifdef JU_64BIT +#define cJU_JPIMMED_1_04 cJL_JPIMMED_1_04 +#define cJU_JPIMMED_1_05 cJL_JPIMMED_1_05 +#define cJU_JPIMMED_1_06 cJL_JPIMMED_1_06 +#define cJU_JPIMMED_1_07 cJL_JPIMMED_1_07 +#define cJU_JPIMMED_2_02 cJL_JPIMMED_2_02 +#define cJU_JPIMMED_2_03 cJL_JPIMMED_2_03 +#define cJU_JPIMMED_3_02 cJL_JPIMMED_3_02 +#endif +#define cJU_JPIMMED_CAP cJL_JPIMMED_CAP + +#endif // JUDYL + + +// **************************************************************************** +// cJU*_ other than JP types: + +#ifdef JUDY1 + +#define cJU_LEAFW_MAXPOP1 cJ1_LEAFW_MAXPOP1 +#ifndef JU_64BIT +#define cJU_LEAF1_MAXPOP1 cJ1_LEAF1_MAXPOP1 +#endif +#define cJU_LEAF2_MAXPOP1 cJ1_LEAF2_MAXPOP1 +#define cJU_LEAF3_MAXPOP1 cJ1_LEAF3_MAXPOP1 +#ifdef JU_64BIT +#define cJU_LEAF4_MAXPOP1 cJ1_LEAF4_MAXPOP1 +#define cJU_LEAF5_MAXPOP1 cJ1_LEAF5_MAXPOP1 +#define cJU_LEAF6_MAXPOP1 cJ1_LEAF6_MAXPOP1 +#define cJU_LEAF7_MAXPOP1 cJ1_LEAF7_MAXPOP1 +#endif +#define cJU_IMMED1_MAXPOP1 cJ1_IMMED1_MAXPOP1 +#define cJU_IMMED2_MAXPOP1 cJ1_IMMED2_MAXPOP1 +#define cJU_IMMED3_MAXPOP1 cJ1_IMMED3_MAXPOP1 +#ifdef JU_64BIT +#define cJU_IMMED4_MAXPOP1 cJ1_IMMED4_MAXPOP1 +#define cJU_IMMED5_MAXPOP1 cJ1_IMMED5_MAXPOP1 +#define cJU_IMMED6_MAXPOP1 cJ1_IMMED6_MAXPOP1 +#define cJU_IMMED7_MAXPOP1 cJ1_IMMED7_MAXPOP1 +#endif + +#define JU_LEAF1POPTOWORDS(Pop1) J1_LEAF1POPTOWORDS(Pop1) +#define JU_LEAF2POPTOWORDS(Pop1) J1_LEAF2POPTOWORDS(Pop1) +#define JU_LEAF3POPTOWORDS(Pop1) J1_LEAF3POPTOWORDS(Pop1) +#ifdef JU_64BIT +#define JU_LEAF4POPTOWORDS(Pop1) J1_LEAF4POPTOWORDS(Pop1) +#define JU_LEAF5POPTOWORDS(Pop1) J1_LEAF5POPTOWORDS(Pop1) +#define JU_LEAF6POPTOWORDS(Pop1) J1_LEAF6POPTOWORDS(Pop1) +#define JU_LEAF7POPTOWORDS(Pop1) J1_LEAF7POPTOWORDS(Pop1) +#endif +#define JU_LEAFWPOPTOWORDS(Pop1) J1_LEAFWPOPTOWORDS(Pop1) + +#ifndef JU_64BIT +#define JU_LEAF1GROWINPLACE(Pop1) J1_LEAF1GROWINPLACE(Pop1) +#endif +#define JU_LEAF2GROWINPLACE(Pop1) J1_LEAF2GROWINPLACE(Pop1) +#define JU_LEAF3GROWINPLACE(Pop1) J1_LEAF3GROWINPLACE(Pop1) +#ifdef JU_64BIT +#define JU_LEAF4GROWINPLACE(Pop1) 
J1_LEAF4GROWINPLACE(Pop1) +#define JU_LEAF5GROWINPLACE(Pop1) J1_LEAF5GROWINPLACE(Pop1) +#define JU_LEAF6GROWINPLACE(Pop1) J1_LEAF6GROWINPLACE(Pop1) +#define JU_LEAF7GROWINPLACE(Pop1) J1_LEAF7GROWINPLACE(Pop1) +#endif +#define JU_LEAFWGROWINPLACE(Pop1) J1_LEAFWGROWINPLACE(Pop1) + +#define j__udyCreateBranchL j__udy1CreateBranchL +#define j__udyCreateBranchB j__udy1CreateBranchB +#define j__udyCreateBranchU j__udy1CreateBranchU +#define j__udyCascade1 j__udy1Cascade1 +#define j__udyCascade2 j__udy1Cascade2 +#define j__udyCascade3 j__udy1Cascade3 +#ifdef JU_64BIT +#define j__udyCascade4 j__udy1Cascade4 +#define j__udyCascade5 j__udy1Cascade5 +#define j__udyCascade6 j__udy1Cascade6 +#define j__udyCascade7 j__udy1Cascade7 +#endif +#define j__udyCascadeL j__udy1CascadeL +#define j__udyInsertBranch j__udy1InsertBranch + +#define j__udyBranchBToBranchL j__udy1BranchBToBranchL +#ifndef JU_64BIT +#define j__udyLeafB1ToLeaf1 j__udy1LeafB1ToLeaf1 +#endif +#define j__udyLeaf1ToLeaf2 j__udy1Leaf1ToLeaf2 +#define j__udyLeaf2ToLeaf3 j__udy1Leaf2ToLeaf3 +#ifndef JU_64BIT +#define j__udyLeaf3ToLeafW j__udy1Leaf3ToLeafW +#else +#define j__udyLeaf3ToLeaf4 j__udy1Leaf3ToLeaf4 +#define j__udyLeaf4ToLeaf5 j__udy1Leaf4ToLeaf5 +#define j__udyLeaf5ToLeaf6 j__udy1Leaf5ToLeaf6 +#define j__udyLeaf6ToLeaf7 j__udy1Leaf6ToLeaf7 +#define j__udyLeaf7ToLeafW j__udy1Leaf7ToLeafW +#endif + +#define jpm_t j1pm_t +#define Pjpm_t Pj1pm_t + +#define jlb_t j1lb_t +#define Pjlb_t Pj1lb_t + +#define JU_JLB_BITMAP J1_JLB_BITMAP + +#define j__udyAllocJPM j__udy1AllocJ1PM +#define j__udyAllocJBL j__udy1AllocJBL +#define j__udyAllocJBB j__udy1AllocJBB +#define j__udyAllocJBBJP j__udy1AllocJBBJP +#define j__udyAllocJBU j__udy1AllocJBU +#ifndef JU_64BIT +#define j__udyAllocJLL1 j__udy1AllocJLL1 +#endif +#define j__udyAllocJLL2 j__udy1AllocJLL2 +#define j__udyAllocJLL3 j__udy1AllocJLL3 +#ifdef JU_64BIT +#define j__udyAllocJLL4 j__udy1AllocJLL4 +#define j__udyAllocJLL5 j__udy1AllocJLL5 +#define j__udyAllocJLL6 j__udy1AllocJLL6 +#define j__udyAllocJLL7 j__udy1AllocJLL7 +#endif +#define j__udyAllocJLW j__udy1AllocJLW +#define j__udyAllocJLB1 j__udy1AllocJLB1 +#define j__udyFreeJPM j__udy1FreeJ1PM +#define j__udyFreeJBL j__udy1FreeJBL +#define j__udyFreeJBB j__udy1FreeJBB +#define j__udyFreeJBBJP j__udy1FreeJBBJP +#define j__udyFreeJBU j__udy1FreeJBU +#ifndef JU_64BIT +#define j__udyFreeJLL1 j__udy1FreeJLL1 +#endif +#define j__udyFreeJLL2 j__udy1FreeJLL2 +#define j__udyFreeJLL3 j__udy1FreeJLL3 +#ifdef JU_64BIT +#define j__udyFreeJLL4 j__udy1FreeJLL4 +#define j__udyFreeJLL5 j__udy1FreeJLL5 +#define j__udyFreeJLL6 j__udy1FreeJLL6 +#define j__udyFreeJLL7 j__udy1FreeJLL7 +#endif +#define j__udyFreeJLW j__udy1FreeJLW +#define j__udyFreeJLB1 j__udy1FreeJLB1 +#define j__udyFreeSM j__udy1FreeSM + +#define j__uMaxWords j__u1MaxWords + +#ifdef DEBUG +#define JudyCheckPop Judy1CheckPop +#endif + +#else // JUDYL **************************************************************** + +#define cJU_LEAFW_MAXPOP1 cJL_LEAFW_MAXPOP1 +#define cJU_LEAF1_MAXPOP1 cJL_LEAF1_MAXPOP1 +#define cJU_LEAF2_MAXPOP1 cJL_LEAF2_MAXPOP1 +#define cJU_LEAF3_MAXPOP1 cJL_LEAF3_MAXPOP1 +#ifdef JU_64BIT +#define cJU_LEAF4_MAXPOP1 cJL_LEAF4_MAXPOP1 +#define cJU_LEAF5_MAXPOP1 cJL_LEAF5_MAXPOP1 +#define cJU_LEAF6_MAXPOP1 cJL_LEAF6_MAXPOP1 +#define cJU_LEAF7_MAXPOP1 cJL_LEAF7_MAXPOP1 +#endif +#define cJU_IMMED1_MAXPOP1 cJL_IMMED1_MAXPOP1 +#define cJU_IMMED2_MAXPOP1 cJL_IMMED2_MAXPOP1 +#define cJU_IMMED3_MAXPOP1 cJL_IMMED3_MAXPOP1 +#ifdef JU_64BIT +#define cJU_IMMED4_MAXPOP1 
cJL_IMMED4_MAXPOP1 +#define cJU_IMMED5_MAXPOP1 cJL_IMMED5_MAXPOP1 +#define cJU_IMMED6_MAXPOP1 cJL_IMMED6_MAXPOP1 +#define cJU_IMMED7_MAXPOP1 cJL_IMMED7_MAXPOP1 +#endif + +#define JU_LEAF1POPTOWORDS(Pop1) JL_LEAF1POPTOWORDS(Pop1) +#define JU_LEAF2POPTOWORDS(Pop1) JL_LEAF2POPTOWORDS(Pop1) +#define JU_LEAF3POPTOWORDS(Pop1) JL_LEAF3POPTOWORDS(Pop1) +#ifdef JU_64BIT +#define JU_LEAF4POPTOWORDS(Pop1) JL_LEAF4POPTOWORDS(Pop1) +#define JU_LEAF5POPTOWORDS(Pop1) JL_LEAF5POPTOWORDS(Pop1) +#define JU_LEAF6POPTOWORDS(Pop1) JL_LEAF6POPTOWORDS(Pop1) +#define JU_LEAF7POPTOWORDS(Pop1) JL_LEAF7POPTOWORDS(Pop1) +#endif +#define JU_LEAFWPOPTOWORDS(Pop1) JL_LEAFWPOPTOWORDS(Pop1) + +#define JU_LEAF1GROWINPLACE(Pop1) JL_LEAF1GROWINPLACE(Pop1) +#define JU_LEAF2GROWINPLACE(Pop1) JL_LEAF2GROWINPLACE(Pop1) +#define JU_LEAF3GROWINPLACE(Pop1) JL_LEAF3GROWINPLACE(Pop1) +#ifdef JU_64BIT +#define JU_LEAF4GROWINPLACE(Pop1) JL_LEAF4GROWINPLACE(Pop1) +#define JU_LEAF5GROWINPLACE(Pop1) JL_LEAF5GROWINPLACE(Pop1) +#define JU_LEAF6GROWINPLACE(Pop1) JL_LEAF6GROWINPLACE(Pop1) +#define JU_LEAF7GROWINPLACE(Pop1) JL_LEAF7GROWINPLACE(Pop1) +#endif +#define JU_LEAFWGROWINPLACE(Pop1) JL_LEAFWGROWINPLACE(Pop1) + +#define j__udyCreateBranchL j__udyLCreateBranchL +#define j__udyCreateBranchB j__udyLCreateBranchB +#define j__udyCreateBranchU j__udyLCreateBranchU +#define j__udyCascade1 j__udyLCascade1 +#define j__udyCascade2 j__udyLCascade2 +#define j__udyCascade3 j__udyLCascade3 +#ifdef JU_64BIT +#define j__udyCascade4 j__udyLCascade4 +#define j__udyCascade5 j__udyLCascade5 +#define j__udyCascade6 j__udyLCascade6 +#define j__udyCascade7 j__udyLCascade7 +#endif +#define j__udyCascadeL j__udyLCascadeL +#define j__udyInsertBranch j__udyLInsertBranch + +#define j__udyBranchBToBranchL j__udyLBranchBToBranchL +#define j__udyLeafB1ToLeaf1 j__udyLLeafB1ToLeaf1 +#define j__udyLeaf1ToLeaf2 j__udyLLeaf1ToLeaf2 +#define j__udyLeaf2ToLeaf3 j__udyLLeaf2ToLeaf3 +#ifndef JU_64BIT +#define j__udyLeaf3ToLeafW j__udyLLeaf3ToLeafW +#else +#define j__udyLeaf3ToLeaf4 j__udyLLeaf3ToLeaf4 +#define j__udyLeaf4ToLeaf5 j__udyLLeaf4ToLeaf5 +#define j__udyLeaf5ToLeaf6 j__udyLLeaf5ToLeaf6 +#define j__udyLeaf6ToLeaf7 j__udyLLeaf6ToLeaf7 +#define j__udyLeaf7ToLeafW j__udyLLeaf7ToLeafW +#endif + +#define jpm_t jLpm_t +#define Pjpm_t PjLpm_t + +#define jlb_t jLlb_t +#define Pjlb_t PjLlb_t + +#define JU_JLB_BITMAP JL_JLB_BITMAP + +#define j__udyAllocJPM j__udyLAllocJLPM +#define j__udyAllocJBL j__udyLAllocJBL +#define j__udyAllocJBB j__udyLAllocJBB +#define j__udyAllocJBBJP j__udyLAllocJBBJP +#define j__udyAllocJBU j__udyLAllocJBU +#define j__udyAllocJLL1 j__udyLAllocJLL1 +#define j__udyAllocJLL2 j__udyLAllocJLL2 +#define j__udyAllocJLL3 j__udyLAllocJLL3 +#ifdef JU_64BIT +#define j__udyAllocJLL4 j__udyLAllocJLL4 +#define j__udyAllocJLL5 j__udyLAllocJLL5 +#define j__udyAllocJLL6 j__udyLAllocJLL6 +#define j__udyAllocJLL7 j__udyLAllocJLL7 +#endif +#define j__udyAllocJLW j__udyLAllocJLW +#define j__udyAllocJLB1 j__udyLAllocJLB1 +// j__udyLAllocJV +#define j__udyFreeJPM j__udyLFreeJLPM +#define j__udyFreeJBL j__udyLFreeJBL +#define j__udyFreeJBB j__udyLFreeJBB +#define j__udyFreeJBBJP j__udyLFreeJBBJP +#define j__udyFreeJBU j__udyLFreeJBU +#define j__udyFreeJLL1 j__udyLFreeJLL1 +#define j__udyFreeJLL2 j__udyLFreeJLL2 +#define j__udyFreeJLL3 j__udyLFreeJLL3 +#ifdef JU_64BIT +#define j__udyFreeJLL4 j__udyLFreeJLL4 +#define j__udyFreeJLL5 j__udyLFreeJLL5 +#define j__udyFreeJLL6 j__udyLFreeJLL6 +#define j__udyFreeJLL7 j__udyLFreeJLL7 +#endif +#define j__udyFreeJLW j__udyLFreeJLW 
+#define j__udyFreeJLB1 j__udyLFreeJLB1 +#define j__udyFreeSM j__udyLFreeSM +// j__udyLFreeJV + +#define j__uMaxWords j__uLMaxWords + +#ifdef DEBUG +#define JudyCheckPop JudyLCheckPop +#endif + +#endif // JUDYL + +#endif // _JUDYPRIVATE1L_INCLUDED diff --git a/src/libnetdata/libjudy/src/JudyCommon/JudyPrivateBranch.h b/src/libnetdata/libjudy/src/JudyCommon/JudyPrivateBranch.h new file mode 100644 index 00000000..10295ba9 --- /dev/null +++ b/src/libnetdata/libjudy/src/JudyCommon/JudyPrivateBranch.h @@ -0,0 +1,788 @@ +#ifndef _JUDY_PRIVATE_BRANCH_INCLUDED +#define _JUDY_PRIVATE_BRANCH_INCLUDED +// _________________ +// +// Copyright (C) 2000 - 2002 Hewlett-Packard Company +// +// This program is free software; you can redistribute it and/or modify it +// under the term of the GNU Lesser General Public License as published by the +// Free Software Foundation; either version 2 of the License, or (at your +// option) any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License +// for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this program; if not, write to the Free Software Foundation, +// Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// _________________ + +// @(#) $Revision: 1.2 $ $Source: /home/doug/judy-1.0.5_min/test/../src/JudyCommon/RCS/JudyPrivateBranch.h,v $ +// +// Header file for all Judy sources, for global but private (non-exported) +// declarations specific to branch support. +// +// See also the "Judy Shop Manual" (try judy/doc/int/JudyShopManual.*). + + +// **************************************************************************** +// JUDY POINTER (JP) SUPPORT +// **************************************************************************** +// +// This "rich pointer" object is pivotal to Judy execution. +// +// JP CONTAINING OTHER THAN IMMEDIATE INDEXES: +// +// If the JP points to a linear or bitmap leaf, jp_DcdPopO contains the +// Population-1 in LSbs and Decode (Dcd) bytes in the MSBs. (In practice the +// Decode bits are masked off while accessing the Pop0 bits.) +// +// The Decode Size, the number of Dcd bytes available, is encoded in jpo_Type. +// It can also be thought of as the number of states "skipped" in the SM, where +// each state decodes 8 bits = 1 byte. +// +// TBD: Dont need two structures, except possibly to force jp_Type to highest +// address! +// +// Note: The jpo_u union is not required by HP-UX or Linux but Win32 because +// the cl.exe compiler otherwise refuses to pack a bitfield (DcdPopO) with +// anything else, even with the -Zp option. This is pretty ugly, but +// fortunately portable, and its all hide-able by macros (see below). + +typedef struct J_UDY_POINTER_OTHERS // JPO. + { + Word_t j_po_Addr; // first word: Pjp_t, Word_t, etc. + union { + Word_t j_po_Addr1; + uint8_t j_po_DcdP0[sizeof(Word_t) - 1]; + uint8_t j_po_Bytes[sizeof(Word_t)]; // last byte = jp_Type. + } jpo_u; + } jpo_t; + + +// JP CONTAINING IMMEDIATE INDEXES: +// +// j_pi_1Index[] plus j_pi_LIndex[] together hold as many N-byte (1..3-byte +// [1..7-byte]) Indexes as will fit in sizeof(jpi_t) less 1 byte for j_pi_Type +// (that is, 7..1 [15..1] Indexes). +// +// For Judy1, j_pi_1Index[] is used and j_pi_LIndex[] is not used. +// For JudyL, j_pi_LIndex[] is used and j_pi_1Index[] is not used. 
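+//
+// Editor's note -- capacity arithmetic behind the figures above, for a
+// 64-bit build (sizeof(Word_t) == 8), using the structs defined just below:
+// j_p1_1Index[] spans (2 * 8) - 1 = 15 bytes, so Judy1 packs up to fifteen
+// 1-byte immediate Indexes (down to two 7-byte ones); JudyL reserves the
+// first word for j_pL_Addr, leaving 8 - 1 = 7 bytes, i.e. seven 1-byte
+// immediates down to one 7-byte immediate.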
+//
+// Note: Actually when Pop1 = 1, jpi_t is not used, and the least bytes of the
+// single Index are stored in j_po_DcdPopO, for both Judy1 and JudyL, so for
+// JudyL the j_po_Addr field can hold the target value.
+//
+// TBD: Revise this structure to not overload j_po_DcdPopO this way? The
+// current arrangement works, its just confusing.
+
+typedef struct _JUDY_POINTER_IMMEDL
+ {
+ Word_t j_pL_Addr;
+ uint8_t j_pL_LIndex[sizeof(Word_t) - 1]; // see above.
+ uint8_t j_pL_Type;
+ } jpL_t;
+
+typedef struct _JUDY_POINTER_IMMED1
+ {
+ uint8_t j_p1_1Index[(2 * sizeof(Word_t)) - 1];
+ uint8_t j_p1_Type;
+ } jp1_t;
+
+// UNION OF JP TYPES:
+//
+// A branch is an array of cJU_BRANCHUNUMJPS (256) of this object, or an
+// alternate data type such as: A linear branch which is a list of 2..7 JPs,
+// or a bitmap branch which contains 8 lists of 0..32 JPs. JPs reside only in
+// branches of a Judy SM.
+
+typedef union J_UDY_POINTER // JP.
+ {
+ jpo_t j_po; // other than immediate indexes.
+ jpL_t j_pL; // immediate indexes.
+ jp1_t j_p1; // immediate indexes.
+ } jp_t, *Pjp_t;
+
+// For coding convenience:
+//
+// Note, jp_Type has the same bits in jpo_t, jpL_t, and jp1_t.
+
+#define jp_1Index j_p1.j_p1_1Index // for storing Indexes in first word.
+#define jp_LIndex j_pL.j_pL_LIndex // for storing Indexes in second word.
+#define jp_Addr j_po.j_po_Addr
+#define jp_Addr1 j_po.jpo_u.j_po_Addr1
+//#define jp_DcdPop0 j_po.jpo_u.j_po_DcdPop0
+//#define jp_Type j_po.jpo_u.j_po_Bytes[sizeof(Word_t) - 1]
+#define jp_Type j_p1.j_p1_Type
+#define jp_DcdP0 j_po.jpo_u.j_po_DcdP0
+
+
+// ****************************************************************************
+// JUDY POINTER (JP) -- RELATED MACROS AND CONSTANTS
+// ****************************************************************************
+
+// EXTRACT VALUES FROM JP:
+//
+// Masks for the bytes in the Dcd and Pop0 parts of jp_DcdPopO:
+//
+// cJU_DCDMASK() consists of a mask that excludes the (LSb) Pop0 bytes and
+// also, just to be safe, the top byte of the word, since jp_DcdPopO is 1 byte
+// less than a full word.
+//
+// Note: These are constant macros (cJU) because cPopBytes should be a
+// constant. Also note cPopBytes == state in the SM.
+
+#define cJU_POP0MASK(cPopBytes) JU_LEASTBYTESMASK(cPopBytes)
+
+#define cJU_DCDMASK(cPopBytes) \
+ ((cJU_ALLONES >> cJU_BITSPERBYTE) & (~cJU_POP0MASK(cPopBytes)))
+
+// Mask off the high byte from INDEX so it can be compared to DcdPopO:
+
+#define JU_TRIMTODCDSIZE(INDEX) ((cJU_ALLONES >> cJU_BITSPERBYTE) & (INDEX))
+
+// Get from jp_DcdPopO the Pop0 for various branch JP Types:
+//
+// Note: There are no simple macros for cJU_BRANCH* Types because their
+// populations must be added up and dont reside in an already-calculated
+// place.
+
+#define JU_JPBRANCH_POP0(PJP,cPopBytes) \
+ (JU_JPDCDPOP0(PJP) & cJU_POP0MASK(cPopBytes))
+
+// METHOD FOR DETERMINING IF OBJECTS HAVE ROOM TO GROW:
+//
+// J__U_GROWCK() is a generic method to determine if an object can grow in
+// place, based on whether the next population size (one more) would use the
+// same space.
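+//
+// Editor's note -- a worked example for the J__U_GROWCK() macro defined just
+// below (the word counts here are hypothetical, for illustration only): if a
+// leaf's POPTOWORDS table maps both Pop1 == 5 and Pop1 == 6 to 3 words, then
+// J__U_GROWCK(5, MAXPOP1, table) is true and a 6th Index can be inserted in
+// place; if the table maps Pop1 == 7 to 4 words, growing from 6 to 7 means
+// allocating a larger object and copying. The JU_LEAF*GROWINPLACE() wrappers
+// aliased in JudyPrivate1L.h are built on this check.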
+ +#define J__U_GROWCK(POP1,MAXPOP1,POPTOWORDS) \ + (((POP1) != (MAXPOP1)) && (POPTOWORDS[POP1] == POPTOWORDS[(POP1) + 1])) + +#define JU_BRANCHBJPGROWINPLACE(NumJPs) \ + J__U_GROWCK(NumJPs, cJU_BITSPERSUBEXPB, j__U_BranchBJPPopToWords) + + +// DETERMINE IF AN INDEX IS (NOT) IN A JPS EXPANSE: + +#define JU_DCDNOTMATCHINDEX(INDEX,PJP,POP0BYTES) \ + (((INDEX) ^ JU_JPDCDPOP0(PJP)) & cJU_DCDMASK(POP0BYTES)) + + +// NUMBER OF JPs IN AN UNCOMPRESSED BRANCH: +// +// An uncompressed branch is simply an array of 256 Judy Pointers (JPs). It is +// a minimum cacheline fill object. Define it here before its first needed. + +#define cJU_BRANCHUNUMJPS cJU_SUBEXPPERSTATE + + +// **************************************************************************** +// JUDY BRANCH LINEAR (JBL) SUPPORT +// **************************************************************************** +// +// A linear branch is a way of compressing empty expanses (null JPs) out of an +// uncompressed 256-way branch, when the number of populated expanses is so +// small that even a bitmap branch is excessive. +// +// The maximum number of JPs in a Judy linear branch: +// +// Note: This number results in a 1-cacheline sized structure. Previous +// versions had a larger struct so a linear branch didnt become a bitmap +// branch until the memory consumed was even, but for speed, its better to +// switch "sooner" and keep a linear branch fast. + +#define cJU_BRANCHLMAXJPS 7 + + +// LINEAR BRANCH STRUCT: +// +// 1-byte count, followed by array of byte-sized expanses, followed by JPs. + +typedef struct J__UDY_BRANCH_LINEAR + { + uint8_t jbl_NumJPs; // num of JPs (Pjp_t), 1..N. + uint8_t jbl_Expanse[cJU_BRANCHLMAXJPS]; // 1..7 MSbs of pop exps. + jp_t jbl_jp [cJU_BRANCHLMAXJPS]; // JPs for populated exps. + } jbl_t, * Pjbl_t; + + +// **************************************************************************** +// JUDY BRANCH BITMAP (JBB) SUPPORT +// **************************************************************************** +// +// A bitmap branch is a way of compressing empty expanses (null JPs) out of +// uncompressed 256-way branch. This costs 1 additional cache line fill, but +// can save a lot of memory when it matters most, near the leaves, and +// typically there will be only one at most in the path to any Index (leaf). +// +// The bitmap indicates which of the cJU_BRANCHUNUMJPS (256) JPs in the branch +// are NOT null, that is, their expanses are populated. The jbb_t also +// contains N pointers to "mini" Judy branches ("subexpanses") of up to M JPs +// each (see BITMAP_BRANCHMxN, for example, BITMAP_BRANCH32x8), where M x N = +// cJU_BRANCHUNUMJPS. These are dynamically allocated and never contain +// cJ*_JPNULL* jp_Types. An empty subexpanse is represented by no bit sets in +// the corresponding subexpanse bitmap, in which case the corresponding +// jbbs_Pjp pointers value is unused. +// +// Note that the number of valid JPs in each 1-of-N subexpanses is determined +// by POPULATION rather than by EXPANSE -- the desired outcome to save memory +// when near the leaves. Note that the memory required for 185 JPs is about as +// much as an uncompressed 256-way branch, therefore 184 is set as the maximum. 
+// However, it is expected that a conversion to an uncompressed 256-way branch
+// will normally take place before this limit is reached for other reasons,
+// such as improving performance when the "wasted" memory is well amortized by
+// the population under the branch, preserving an acceptable overall
+// bytes/Index in the Judy array.
+//
+// The number of pointers to arrays of JPs in the Judy bitmap branch:
+//
+// Note: The numbers below are the same in both 32 and 64 bit systems.
+
+#define cJU_BRANCHBMAXJPS 184 // maximum JPs for bitmap branches.
+
+// Convenience wrappers for referencing BranchB bitmaps or JP subarray
+// pointers:
+//
+// Note: JU_JBB_PJP produces a "raw" memory address that must pass through
+// P_JP before use, except when freeing memory:
+
+#define JU_JBB_BITMAP(Pjbb, SubExp) ((Pjbb)->jbb_jbbs[SubExp].jbbs_Bitmap)
+#define JU_JBB_PJP( Pjbb, SubExp) ((Pjbb)->jbb_jbbs[SubExp].jbbs_Pjp)
+
+#define JU_SUBEXPB(Digit) (((Digit) / cJU_BITSPERSUBEXPB) & (cJU_NUMSUBEXPB-1))
+
+#define JU_BITMAPTESTB(Pjbb, Index) \
+ (JU_JBB_BITMAP(Pjbb, JU_SUBEXPB(Index)) & JU_BITPOSMASKB(Index))
+
+#define JU_BITMAPSETB(Pjbb, Index) \
+ (JU_JBB_BITMAP(Pjbb, JU_SUBEXPB(Index)) |= JU_BITPOSMASKB(Index))
+
+// Note: JU_BITMAPCLEARB is not defined because the code does it a faster way.
+
+typedef struct J__UDY_BRANCH_BITMAP_SUBEXPANSE
+ {
+ BITMAPB_t jbbs_Bitmap;
+ Pjp_t jbbs_Pjp;
+
+ } jbbs_t;
+
+typedef struct J__UDY_BRANCH_BITMAP
+ {
+ jbbs_t jbb_jbbs [cJU_NUMSUBEXPB];
+#ifdef SUBEXPCOUNTS
+ Word_t jbb_subPop1[cJU_NUMSUBEXPB];
+#endif
+ } jbb_t, * Pjbb_t;
+
+#define JU_BRANCHJP_NUMJPSTOWORDS(NumJPs) (j__U_BranchBJPPopToWords[NumJPs])
+
+#ifdef SUBEXPCOUNTS
+#define cJU_NUMSUBEXPU 16 // number of subexpanse counts.
+#endif
+
+
+// ****************************************************************************
+// JUDY BRANCH UNCOMPRESSED (JBU) SUPPORT
+// ****************************************************************************
+
+// Convenience wrapper for referencing BranchU JPs:
+//
+// Note: This produces a non-"raw" address already passed through P_JBU().
+
+#define JU_JBU_PJP(Pjp,Index,Level) \
+ (&((P_JBU((Pjp)->jp_Addr))->jbu_jp[JU_DIGITATSTATE(Index, Level)]))
+#define JU_JBU_PJP0(Pjp) \
+ (&((P_JBU((Pjp)->jp_Addr))->jbu_jp[0]))
+
+typedef struct J__UDY_BRANCH_UNCOMPRESSED
+ {
+ jp_t jbu_jp [cJU_BRANCHUNUMJPS]; // JPs for populated exp.
+#ifdef SUBEXPCOUNTS
+ Word_t jbu_subPop1[cJU_NUMSUBEXPU];
+#endif
+ } jbu_t, * Pjbu_t;
+
+
+// ****************************************************************************
+// OTHER SUPPORT FOR JUDY STATE MACHINES (SMs)
+// ****************************************************************************
+
+// OBJECT SIZES IN WORDS:
+//
+// Word_ts per various JudyL structures that have constant sizes.
+// cJU_WORDSPERJP should always be 2; this is fundamental to the Judy
+// structures.
+
+#define cJU_WORDSPERJP (sizeof(jp_t) / cJU_BYTESPERWORD)
+#define cJU_WORDSPERCL (cJU_BYTESPERCL / cJU_BYTESPERWORD)
+
+
+// OPPORTUNISTIC UNCOMPRESSION:
+//
+// Define populations at which a BranchL or BranchB must convert to BranchU.
+// Earlier conversion is possible with good memory efficiency -- see below.
+
+#ifndef NO_BRANCHU
+
+// Max population below BranchL, then convert to BranchU:
+
+#define JU_BRANCHL_MAX_POP 1000
+
+// Minimum global population increment before next conversion of a BranchB to a
+// BranchU:
+//
+// This was done to allow malloc() to coalesce memory before the next big
+// (~512 words) allocation.
+ +#define JU_BTOU_POP_INCREMENT 300 + +// Min/max population below BranchB, then convert to BranchU: + +#define JU_BRANCHB_MIN_POP 135 +#define JU_BRANCHB_MAX_POP 750 + +#else // NO_BRANCHU + +// These are set up to have conservative conversion schedules to BranchU: + +#define JU_BRANCHL_MAX_POP (-1UL) +#define JU_BTOU_POP_INCREMENT 300 +#define JU_BRANCHB_MIN_POP 1000 +#define JU_BRANCHB_MAX_POP (-1UL) + +#endif // NO_BRANCHU + + +// MISCELLANEOUS MACROS: + +// Get N most significant bits from the shifted Index word: +// +// As Index words are decoded, they are shifted left so only relevant, +// undecoded Index bits remain. + +#define JU_BITSFROMSFTIDX(SFTIDX, N) ((SFTIDX) >> (cJU_BITSPERWORD - (N))) + +// TBD: I have my doubts about the necessity of these macros (dlb): + +// Produce 1-digit mask at specified state: + +#define cJU_MASKATSTATE(State) (0xffL << (((State) - 1) * cJU_BITSPERBYTE)) + +// Get byte (digit) from Index at the specified state, right justified: +// +// Note: State must be 1..cJU_ROOTSTATE, and Digits must be 1..(cJU_ROOTSTATE +// - 1), but theres no way to assert these within an expression. + +#define JU_DIGITATSTATE(Index,cState) \ + ((uint8_t)((Index) >> (((cState) - 1) * cJU_BITSPERBYTE))) + +// Similarly, place byte (digit) at correct position for the specified state: +// +// Note: Cast digit to a Word_t first so there are no complaints or problems +// about shifting it more than 32 bits on a 64-bit system, say, when it is a +// uint8_t from jbl_Expanse[]. (Believe it or not, the C standard says to +// promote an unsigned char to a signed int; -Ac does not do this, but -Ae +// does.) +// +// Also, to make lint happy, cast the whole result again because apparently +// shifting a Word_t does not result in a Word_t! + +#define JU_DIGITTOSTATE(Digit,cState) \ + ((Word_t) (((Word_t) (Digit)) << (((cState) - 1) * cJU_BITSPERBYTE))) + +#endif // ! _JUDY_PRIVATE_BRANCH_INCLUDED + + +#ifdef TEST_INSDEL + +// **************************************************************************** +// TEST CODE FOR INSERT/DELETE MACROS +// **************************************************************************** +// +// To use this, compile a temporary *.c file containing: +// +// #define DEBUG +// #define JUDY_ASSERT +// #define TEST_INSDEL +// #include "JudyPrivate.h" +// #include "JudyPrivateBranch.h" +// +// Use a command like this: cc -Ae +DD64 -I. -I JudyCommon -o t t.c +// For best results, include +DD64 on a 64-bit system. +// +// This test code exercises some tricky macros, but the output must be studied +// manually to verify it. Assume that for even-index testing, whole words +// (Word_t) suffices. + +#include <stdio.h> + +#define INDEXES 3 // in each array. + + +// **************************************************************************** +// I N I T +// +// Set up variables for next test. See usage. + +FUNCTION void Init ( + int base, + PWord_t PeIndex, + PWord_t PoIndex, + PWord_t Peleaf, // always whole words. 
+#ifndef JU_64BIT + uint8_t * Poleaf3) +#else + uint8_t * Poleaf3, + uint8_t * Poleaf5, + uint8_t * Poleaf6, + uint8_t * Poleaf7) +#endif +{ + int offset; + + *PeIndex = 99; + + for (offset = 0; offset <= INDEXES; ++offset) + Peleaf[offset] = base + offset; + + for (offset = 0; offset < (INDEXES + 1) * 3; ++offset) + Poleaf3[offset] = base + offset; + +#ifndef JU_64BIT + *PoIndex = (91 << 24) | (92 << 16) | (93 << 8) | 94; +#else + + *PoIndex = (91L << 56) | (92L << 48) | (93L << 40) | (94L << 32) + | (95L << 24) | (96L << 16) | (97L << 8) | 98L; + + for (offset = 0; offset < (INDEXES + 1) * 5; ++offset) + Poleaf5[offset] = base + offset; + + for (offset = 0; offset < (INDEXES + 1) * 6; ++offset) + Poleaf6[offset] = base + offset; + + for (offset = 0; offset < (INDEXES + 1) * 7; ++offset) + Poleaf7[offset] = base + offset; +#endif + +} // Init() + + +// **************************************************************************** +// P R I N T L E A F +// +// Print the byte values in a leaf. + +FUNCTION void PrintLeaf ( + char * Label, // for output. + int IOffset, // insertion offset in array. + int Indsize, // index size in bytes. + uint8_t * PLeaf) // array of Index bytes. +{ + int offset; // in PLeaf. + int byte; // in one word. + + (void) printf("%s %u: ", Label, IOffset); + + for (offset = 0; offset <= INDEXES; ++offset) + { + for (byte = 0; byte < Indsize; ++byte) + (void) printf("%2d", PLeaf[(offset * Indsize) + byte]); + + (void) printf(" "); + } + + (void) printf("\n"); + +} // PrintLeaf() + + +// **************************************************************************** +// M A I N +// +// Test program. + +FUNCTION main() +{ + Word_t eIndex; // even, to insert. + Word_t oIndex; // odd, to insert. + Word_t eleaf [ INDEXES + 1]; // even leaf, index size 4. + uint8_t oleaf3[(INDEXES + 1) * 3]; // odd leaf, index size 3. +#ifdef JU_64BIT + uint8_t oleaf5[(INDEXES + 1) * 5]; // odd leaf, index size 5. + uint8_t oleaf6[(INDEXES + 1) * 6]; // odd leaf, index size 6. + uint8_t oleaf7[(INDEXES + 1) * 7]; // odd leaf, index size 7. +#endif + Word_t eleaf_2 [ INDEXES + 1]; // same, but second arrays: + uint8_t oleaf3_2[(INDEXES + 1) * 3]; +#ifdef JU_64BIT + uint8_t oleaf5_2[(INDEXES + 1) * 5]; + uint8_t oleaf6_2[(INDEXES + 1) * 6]; + uint8_t oleaf7_2[(INDEXES + 1) * 7]; +#endif + int ioffset; // index insertion offset. + +#ifndef JU_64BIT +#define INIT Init( 0, & eIndex, & oIndex, eleaf, oleaf3) +#define INIT2 INIT; Init(50, & eIndex, & oIndex, eleaf_2, oleaf3_2) +#else +#define INIT Init( 0, & eIndex, & oIndex, eleaf, oleaf3, \ + oleaf5, oleaf6, oleaf7) +#define INIT2 INIT; Init(50, & eIndex, & oIndex, eleaf_2, oleaf3_2, \ + oleaf5_2, oleaf6_2, oleaf7_2) +#endif + +#define WSIZE sizeof (Word_t) // shorthand. + +#ifdef PRINTALL // to turn on "noisy" printouts. +#define PRINTLEAF(Label,IOffset,Indsize,PLeaf) \ + PrintLeaf(Label,IOffset,Indsize,PLeaf) +#else +#define PRINTLEAF(Label,IOffset,Indsize,PLeaf) \ + if (ioffset == 0) \ + PrintLeaf(Label,IOffset,Indsize,PLeaf) +#endif + + (void) printf( +"In each case, tests operate on an initial array of %d indexes. Even-index\n" +"tests set index values to 0,1,2...; odd-index tests set byte values to\n" +"0,1,2... 
Inserted indexes have a value of 99 or else byte values 91,92,...\n", + INDEXES); + + (void) puts("\nJU_INSERTINPLACE():"); + + for (ioffset = 0; ioffset <= INDEXES; ++ioffset) + { + INIT; + PRINTLEAF("Before", ioffset, WSIZE, (uint8_t *) eleaf); + JU_INSERTINPLACE(eleaf, INDEXES, ioffset, eIndex); + PrintLeaf("After ", ioffset, WSIZE, (uint8_t *) eleaf); + } + + (void) puts("\nJU_INSERTINPLACE3():"); + + for (ioffset = 0; ioffset <= INDEXES; ++ioffset) + { + INIT; + PRINTLEAF("Before", ioffset, 3, oleaf3); + JU_INSERTINPLACE3(oleaf3, INDEXES, ioffset, oIndex); + PrintLeaf("After ", ioffset, 3, oleaf3); + } + +#ifdef JU_64BIT + (void) puts("\nJU_INSERTINPLACE5():"); + + for (ioffset = 0; ioffset <= INDEXES; ++ioffset) + { + INIT; + PRINTLEAF("Before", ioffset, 5, oleaf5); + JU_INSERTINPLACE5(oleaf5, INDEXES, ioffset, oIndex); + PrintLeaf("After ", ioffset, 5, oleaf5); + } + + (void) puts("\nJU_INSERTINPLACE6():"); + + for (ioffset = 0; ioffset <= INDEXES; ++ioffset) + { + INIT; + PRINTLEAF("Before", ioffset, 6, oleaf6); + JU_INSERTINPLACE6(oleaf6, INDEXES, ioffset, oIndex); + PrintLeaf("After ", ioffset, 6, oleaf6); + } + + (void) puts("\nJU_INSERTINPLACE7():"); + + for (ioffset = 0; ioffset <= INDEXES; ++ioffset) + { + INIT; + PRINTLEAF("Before", ioffset, 7, oleaf7); + JU_INSERTINPLACE7(oleaf7, INDEXES, ioffset, oIndex); + PrintLeaf("After ", ioffset, 7, oleaf7); + } +#endif // JU_64BIT + + (void) puts("\nJU_DELETEINPLACE():"); + + for (ioffset = 0; ioffset < INDEXES; ++ioffset) + { + INIT; + PRINTLEAF("Before", ioffset, WSIZE, (uint8_t *) eleaf); + JU_DELETEINPLACE(eleaf, INDEXES, ioffset); + PrintLeaf("After ", ioffset, WSIZE, (uint8_t *) eleaf); + } + + (void) puts("\nJU_DELETEINPLACE_ODD(3):"); + + for (ioffset = 0; ioffset < INDEXES; ++ioffset) + { + INIT; + PRINTLEAF("Before", ioffset, 3, oleaf3); + JU_DELETEINPLACE_ODD(oleaf3, INDEXES, ioffset, 3); + PrintLeaf("After ", ioffset, 3, oleaf3); + } + +#ifdef JU_64BIT + (void) puts("\nJU_DELETEINPLACE_ODD(5):"); + + for (ioffset = 0; ioffset < INDEXES; ++ioffset) + { + INIT; + PRINTLEAF("Before", ioffset, 5, oleaf5); + JU_DELETEINPLACE_ODD(oleaf5, INDEXES, ioffset, 5); + PrintLeaf("After ", ioffset, 5, oleaf5); + } + + (void) puts("\nJU_DELETEINPLACE_ODD(6):"); + + for (ioffset = 0; ioffset < INDEXES; ++ioffset) + { + INIT; + PRINTLEAF("Before", ioffset, 6, oleaf6); + JU_DELETEINPLACE_ODD(oleaf6, INDEXES, ioffset, 6); + PrintLeaf("After ", ioffset, 6, oleaf6); + } + + (void) puts("\nJU_DELETEINPLACE_ODD(7):"); + + for (ioffset = 0; ioffset < INDEXES; ++ioffset) + { + INIT; + PRINTLEAF("Before", ioffset, 7, oleaf7); + JU_DELETEINPLACE_ODD(oleaf7, INDEXES, ioffset, 7); + PrintLeaf("After ", ioffset, 7, oleaf7); + } +#endif // JU_64BIT + + (void) puts("\nJU_INSERTCOPY():"); + + for (ioffset = 0; ioffset <= INDEXES; ++ioffset) + { + INIT2; + PRINTLEAF("Before, src ", ioffset, WSIZE, (uint8_t *) eleaf); + PRINTLEAF("Before, dest", ioffset, WSIZE, (uint8_t *) eleaf_2); + JU_INSERTCOPY(eleaf_2, eleaf, INDEXES, ioffset, eIndex); + PRINTLEAF("After, src ", ioffset, WSIZE, (uint8_t *) eleaf); + PrintLeaf("After, dest", ioffset, WSIZE, (uint8_t *) eleaf_2); + } + + (void) puts("\nJU_INSERTCOPY3():"); + + for (ioffset = 0; ioffset <= INDEXES; ++ioffset) + { + INIT2; + PRINTLEAF("Before, src ", ioffset, 3, oleaf3); + PRINTLEAF("Before, dest", ioffset, 3, oleaf3_2); + JU_INSERTCOPY3(oleaf3_2, oleaf3, INDEXES, ioffset, oIndex); + PRINTLEAF("After, src ", ioffset, 3, oleaf3); + PrintLeaf("After, dest", ioffset, 3, oleaf3_2); + } + +#ifdef JU_64BIT + 
(void) puts("\nJU_INSERTCOPY5():"); + + for (ioffset = 0; ioffset <= INDEXES; ++ioffset) + { + INIT2; + PRINTLEAF("Before, src ", ioffset, 5, oleaf5); + PRINTLEAF("Before, dest", ioffset, 5, oleaf5_2); + JU_INSERTCOPY5(oleaf5_2, oleaf5, INDEXES, ioffset, oIndex); + PRINTLEAF("After, src ", ioffset, 5, oleaf5); + PrintLeaf("After, dest", ioffset, 5, oleaf5_2); + } + + (void) puts("\nJU_INSERTCOPY6():"); + + for (ioffset = 0; ioffset <= INDEXES; ++ioffset) + { + INIT2; + PRINTLEAF("Before, src ", ioffset, 6, oleaf6); + PRINTLEAF("Before, dest", ioffset, 6, oleaf6_2); + JU_INSERTCOPY6(oleaf6_2, oleaf6, INDEXES, ioffset, oIndex); + PRINTLEAF("After, src ", ioffset, 6, oleaf6); + PrintLeaf("After, dest", ioffset, 6, oleaf6_2); + } + + (void) puts("\nJU_INSERTCOPY7():"); + + for (ioffset = 0; ioffset <= INDEXES; ++ioffset) + { + INIT2; + PRINTLEAF("Before, src ", ioffset, 7, oleaf7); + PRINTLEAF("Before, dest", ioffset, 7, oleaf7_2); + JU_INSERTCOPY7(oleaf7_2, oleaf7, INDEXES, ioffset, oIndex); + PRINTLEAF("After, src ", ioffset, 7, oleaf7); + PrintLeaf("After, dest", ioffset, 7, oleaf7_2); + } +#endif // JU_64BIT + + (void) puts("\nJU_DELETECOPY():"); + + for (ioffset = 0; ioffset < INDEXES; ++ioffset) + { + INIT2; + PRINTLEAF("Before, src ", ioffset, WSIZE, (uint8_t *) eleaf); + PRINTLEAF("Before, dest", ioffset, WSIZE, (uint8_t *) eleaf_2); + JU_DELETECOPY(eleaf_2, eleaf, INDEXES, ioffset, ignore); + PRINTLEAF("After, src ", ioffset, WSIZE, (uint8_t *) eleaf); + PrintLeaf("After, dest", ioffset, WSIZE, (uint8_t *) eleaf_2); + } + + (void) puts("\nJU_DELETECOPY_ODD(3):"); + + for (ioffset = 0; ioffset < INDEXES; ++ioffset) + { + INIT2; + PRINTLEAF("Before, src ", ioffset, 3, oleaf3); + PRINTLEAF("Before, dest", ioffset, 3, oleaf3_2); + JU_DELETECOPY_ODD(oleaf3_2, oleaf3, INDEXES, ioffset, 3); + PRINTLEAF("After, src ", ioffset, 3, oleaf3); + PrintLeaf("After, dest", ioffset, 3, oleaf3_2); + } + +#ifdef JU_64BIT + (void) puts("\nJU_DELETECOPY_ODD(5):"); + + for (ioffset = 0; ioffset < INDEXES; ++ioffset) + { + INIT2; + PRINTLEAF("Before, src ", ioffset, 5, oleaf5); + PRINTLEAF("Before, dest", ioffset, 5, oleaf5_2); + JU_DELETECOPY_ODD(oleaf5_2, oleaf5, INDEXES, ioffset, 5); + PRINTLEAF("After, src ", ioffset, 5, oleaf5); + PrintLeaf("After, dest", ioffset, 5, oleaf5_2); + } + + (void) puts("\nJU_DELETECOPY_ODD(6):"); + + for (ioffset = 0; ioffset < INDEXES; ++ioffset) + { + INIT2; + PRINTLEAF("Before, src ", ioffset, 6, oleaf6); + PRINTLEAF("Before, dest", ioffset, 6, oleaf6_2); + JU_DELETECOPY_ODD(oleaf6_2, oleaf6, INDEXES, ioffset, 6); + PRINTLEAF("After, src ", ioffset, 6, oleaf6); + PrintLeaf("After, dest", ioffset, 6, oleaf6_2); + } + + (void) puts("\nJU_DELETECOPY_ODD(7):"); + + for (ioffset = 0; ioffset < INDEXES; ++ioffset) + { + INIT2; + PRINTLEAF("Before, src ", ioffset, 7, oleaf7); + PRINTLEAF("Before, dest", ioffset, 7, oleaf7_2); + JU_DELETECOPY_ODD(oleaf7_2, oleaf7, INDEXES, ioffset, 7); + PRINTLEAF("After, src ", ioffset, 7, oleaf7); + PrintLeaf("After, dest", ioffset, 7, oleaf7_2); + } +#endif // JU_64BIT + + return(0); + +} // main() + +#endif // TEST_INSDEL diff --git a/src/libnetdata/libjudy/src/JudyHS/JudyHS.c b/src/libnetdata/libjudy/src/JudyHS/JudyHS.c new file mode 100644 index 00000000..21191bab --- /dev/null +++ b/src/libnetdata/libjudy/src/JudyHS/JudyHS.c @@ -0,0 +1,771 @@ +// @(#) $Revision: 4.1 $ $Source: /judy/src/JudyHS/JudyHS.c +//======================================================================= +// Author Douglas L. Baskins, Dec 2003. 
+// Permission to use this code is freely granted, provided that this
+// statement is retained.
+// email - doug@sourcejudy.com -or- dougbaskins@yahoo.com
+//=======================================================================
+
+#include <string.h> // for memcmp(), memcpy()
+
+#include <Judy.h> // for JudyL* routines/macros
+
+/*
+ This routine is a very fast "string" version of an ADT that stores
+ (JudyHSIns()), retrieves (JudyHSGet()), deletes (JudyHSDel()) and
+ frees the entire ADT (JudyHSFreeArray()) strings. It uses the "Judy
+ arrays" JudyL() API as the main workhorse. The length of the string
+ is included in the calling parameters so that strings with embedded
+ \0s can be used. The string lengths can be from 0 bytes to whatever
+ malloc() can handle (~2GB).
+
+ Compile:
+
+ cc -O JudyHS.c -c needs to link with -lJudy (libJudy.a)
+
+ Note: in gcc version 3.3.1, -O2 generates faster code than -O
+ Note: in gcc version 3.3.2, -O3 generates faster code than -O2
+
+ NOTES:
+
+1) There may be some performance issues with 64 bit machines, because I
+ have not characterized it yet.
+
+2) It appears that on a modern CPU (>2Ghz) the instruction times are
+ much faster than a RAM access, so building up a word from bytes takes
+ no longer than a whole word access. I am taking advantage of this to
+ make this code endian neutral. A side effect of this is strings do
+ not need to be aligned, nor tested to be on a word boundary. In
+ older and slower (RISC) machines, this may be a performance issue.
+ I have given up trying to optimize for machines that have very slow
+ mpy, mod, variable shifts and call returns.
+
+3) JudyHS is very scalable from 1 string to billions (with enough RAM).
+ The memory usage also scales with population. I have attempted to
+ combine the best characteristics of JudyL arrays with Hashing methods
+ and well designed modern processors (such as the 1.3Ghz Intel
+ Centrino this is being written on).
+
+ HOW JudyHS WORKS: ( 4[8] means 4 bytes in 32 bit machine and 8 in 64)
+
+ A) A JudyL array is used to separate strings of equal lengths into
+ their own structures (a different hash table is used for each length
+ of string). The additional time overhead is very near zero because
+ of the CPU cache. The space efficiency is improved because the
+ length need not be stored with the string (ls_t). The "JLHash" ADT
+ in the test program "StringCompare" is verification of both these
+ assumptions.
+
+ B) A 32 bit hash value is produced from the string. Many thanks to
+ the Internet and the author (Bob Jenkins) for coming up with a very
+ good and fast universal string hash. Next the 32 bit hash number is
+ used as an Index to another JudyL array. Notice that one (1) JudyL
+ array is used as a hash table per each string length. If there are
+ no hash collisions (normally) then the string is copied to a
+ structure (ls_t) along with room for storing a Value. A flag is
+ added to the pointer to note it is pointing to a ls_t structure.
+ Since the lengths of the strings are the same, there is no need to
+ store the length of the string in the ls_t structure. This saves
+ about a word per string of memory.
+
+ C) When there is a hashing collision (very rare), a JudyL array is
+ used to decode the next 4[8] bytes of the string. That is, the next
+ 4[8] bytes of the string are used as the Index. This process is
+ repeated until the remaining string is unique. The remaining string
+ (if any) is stored in a (now smaller) ls_t structure. If the
+ remaining string is less than or equal to 4[8] bytes, then the ls_t
+ structure is not needed and the Value area in the JudyL array is
+ used. A compile option -DDONOTUSEHASH is available to test this
+ structure without using hashing (only the JudyL tree is used). This
+ is equivalent to having all strings hashed to the same bucket. The
+ speed is still better than all other tree based ADTs I have tested.
+ An added benefit of this is very fast "hash collision" resolution.
+ It could foil hackers that exploit the slow synonym (linked-list)
+ collision handling property used with most hashing algorithms. If
+ this is not a necessary property, then a simpler ADT "JLHash" that is
+ documented in the test program "StringCompare.c" may be used with a
+ little loss of memory efficiency (because it includes the string
+ length with the ls_t structure). JudyHS was written to be the
+ fastest, very scalable, memory efficient, general purpose string ADT
+ possible. (However, I would like to eat those words someday). (dlb)
+
+*/
+
+#ifdef EXAMPLE_CODE
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+
+#include <Judy.h>
+
+//#include "JudyHS.h" // for Judy.h without JudyHS*()
+
+// By Doug Baskins Apr 2004 - for JudyHS man page
+
+#define MAXLINE 1000000 /* max length of line */
+char Index[MAXLINE]; // string to check
+
+int // Usage: CheckDupLines < file
+main()
+{
+ Pvoid_t PJArray = (PWord_t)NULL; // Judy array.
+ PWord_t PValue; // ^ Judy array element.
+ Word_t Bytes; // size of JudyHS array.
+ Word_t LineNumb = 0; // current line number
+ Word_t Dups = 0; // number of duplicate lines
+
+ while (fgets(Index, MAXLINE, stdin) != (char *)NULL)
+ {
+ LineNumb++; // line number
+
+// store string into array
+ JHSI(PValue, PJArray, Index, strlen(Index));
+ if (*PValue) // check if duplicate
+ {
+ Dups++; // count duplicates
+ printf("Duplicate lines %lu:%lu:%s", *PValue, LineNumb, Index);
+ }
+ else
+ {
+ *PValue = LineNumb; // store Line number
+ }
+ }
+ printf("%lu Duplicates, free JudyHS array of %lu Lines\n",
+ Dups, LineNumb - Dups);
+ JHSFA(Bytes, PJArray); // free array
+ printf("The JudyHS array allocated %lu bytes of memory\n", Bytes);
+ return (0);
+}
+#endif // EXAMPLE_CODE
+
+// Note: Use JLAP_INVALID, which is non-zero, to mark pointers to a ls_t
+// This makes it compatible with previous versions of JudyL()
+
+#define IS_PLS(PLS) (((Word_t) (PLS)) & JLAP_INVALID)
+#define CLEAR_PLS(PLS) (((Word_t) (PLS)) & (~JLAP_INVALID))
+#define SET_PLS(PLS) (((Word_t) (PLS)) | JLAP_INVALID)
+
+#define WORDSIZE (sizeof(Word_t))
+
+// this is the struct used for "leaf" strings. Note that
+// the Value is followed by a "variable" length ls_String array.
+//
+typedef struct L_EAFSTRING
+{
+ Word_t ls_Value; // Value area (cannot change size)
+ uint8_t ls_String[WORDSIZE]; // to fill out to a Word_t size
+} ls_t , *Pls_t;
+
+#define LS_STRUCTOVD (sizeof(ls_t) - WORDSIZE)
+
+// Calculate size of ls_t including the string of length LEN.
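+//
+// Editor's note -- worked example for the LS_WORDLEN() macro defined just
+// below, assuming a 64-bit build (WORDSIZE == 8, so LS_STRUCTOVD == 8):
+// a 13-byte string gives LS_WORDLEN(13) = (13 + 8 + 8 - 1) / 8 = 3 words,
+// i.e. one word for ls_Value plus two words holding the 13 string bytes
+// (with 3 bytes of padding).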
+//
+#define LS_WORDLEN(LEN) (((LEN) + LS_STRUCTOVD + WORDSIZE - 1) / WORDSIZE)
+
+// Copy from 0..4[8] bytes from string to a Word_t
+// NOTE: the copy is in little-endian order to take advantage of improved
+// memory efficiency of JudyLIns() with smaller numbers
+//
+#define COPYSTRING4toWORD(WORD,STR,LEN) \
+{ \
+ WORD = 0; \
+ switch(LEN) \
+ { \
+ default: /* four and greater */ \
+ case 4: \
+ WORD += (Word_t)(((uint8_t *)(STR))[3] << 24); \
+ case 3: \
+ WORD += (Word_t)(((uint8_t *)(STR))[2] << 16); \
+ case 2: \
+ WORD += (Word_t)(((uint8_t *)(STR))[1] << 8); \
+ case 1: \
+ WORD += (Word_t)(((uint8_t *)(STR))[0]); \
+ case 0: break; \
+ } \
+}
+
+#ifdef JU_64BIT
+
+// copy from 0..8 bytes from string to Word_t
+//
+#define COPYSTRING8toWORD(WORD,STR,LEN) \
+{ \
+ WORD = 0UL; \
+ switch(LEN) \
+ { \
+ default: /* eight and greater */ \
+ case 8: \
+ WORD += ((Word_t)((uint8_t *)(STR))[7] << 56); \
+ case 7: \
+ WORD += ((Word_t)((uint8_t *)(STR))[6] << 48); \
+ case 6: \
+ WORD += ((Word_t)((uint8_t *)(STR))[5] << 40); \
+ case 5: \
+ WORD += ((Word_t)((uint8_t *)(STR))[4] << 32); \
+ case 4: \
+ WORD += ((Word_t)((uint8_t *)(STR))[3] << 24); \
+ case 3: \
+ WORD += ((Word_t)((uint8_t *)(STR))[2] << 16); \
+ case 2: \
+ WORD += ((Word_t)((uint8_t *)(STR))[1] << 8); \
+ case 1: \
+ WORD += ((Word_t)((uint8_t *)(STR))[0]); \
+ case 0: break; \
+ } \
+}
+
+#define COPYSTRINGtoWORD COPYSTRING8toWORD
+
+#else // JU_32BIT
+
+#define COPYSTRINGtoWORD COPYSTRING4toWORD
+
+#endif // JU_32BIT
+
+// set JError_t locally
+
+#define JU_SET_ERRNO(PJERROR, JERRNO) \
+{ \
+ if (PJERROR != (PJError_t) NULL) \
+ { \
+ if (JERRNO) \
+ JU_ERRNO(PJError) = (JERRNO); \
+ JU_ERRID(PJERROR) = __LINE__; \
+ } \
+}
+
+//=======================================================================
+// This routine must hash string to 24..32 bits. The "goodness" of
+// the hash is not as important as its speed.
+//=======================================================================
+
+// hash to no more than 32 bits
+
+// extern Word_t gHmask; for hash bits experiments
+
+#define JUDYHASHSTR(HVALUE,STRING,LENGTH) \
+{ \
+ uint8_t *p_ = (uint8_t *)(STRING); \
+ uint8_t *q_ = p_ + (LENGTH); \
+ uint32_t c_ = 0; \
+ for (; p_ != q_; ++p_) \
+ { \
+ c_ = (c_ * 31) + *p_; \
+ } \
+/* c_ &= gHmask; see above */ \
+ (HVALUE) = c_; \
+}
+
+// Find String of Len in JudyHS structure, return pointer to associated Value
+
+PPvoid_t
+JudyHSGet(Pcvoid_t PArray, // pointer (^) to structure
+ void * Str, // pointer to string
+ Word_t Len // length of string
+ )
+{
+ uint8_t *String = (uint8_t *)Str;
+ PPvoid_t PPValue; // pointer to Value
+ Word_t Index; // 4[8] bytes of String
+
+ JLG(PPValue, PArray, Len); // find hash table for strings of Len
+ if (PPValue == (PPvoid_t) NULL)
+ return ((PPvoid_t) NULL); // no strings of this Len
+
+// check for caller error (null pointer)
+//
+ if ((String == (void *) NULL) && (Len != 0))
+ return ((PPvoid_t) NULL); // avoid null-pointer dereference
+
+#ifndef DONOTUSEHASH
+ if (Len > WORDSIZE) // Hash table not necessary with short
+ {
+ uint32_t HValue; // hash of input string
+ JUDYHASHSTR(HValue, String, Len); // hash to no more than 32 bits
+ JLG(PPValue, *PPValue, (Word_t)HValue); // get ^ to hash bucket
+ if (PPValue == (PPvoid_t) NULL)
+ return ((PPvoid_t) NULL); // no entry in Hash table
+ }
+#endif // DONOTUSEHASH
+
+/*
+ Each JudyL array decodes 4[8] bytes of the string. Since the hash
+ collisions occur very infrequently, the performance is not important.
+ However, even if the Hash code is not used this method still is + significantly faster than common tree methods (AVL, Red-Black, Splay, + b-tree, etc..). You can compare it yourself with #define DONOTUSEHASH + 1 or putting -DDONOTUSEHASH in the cc line. Use the "StringCompare.c" + code to compare (9Dec2003 dlb). +*/ + while (Len > WORDSIZE) // traverse tree of JudyL arrays + { + if (IS_PLS(*PPValue)) // ^ to JudyL array or ls_t struct? + { + Pls_t Pls; // ls_t struct, termination of tree + Pls = (Pls_t) CLEAR_PLS(*PPValue); // remove flag from ^ + +// if remaining string matches, return ^ to Value, else NULL + + if (memcmp(String, Pls->ls_String, Len) == 0) + return ((PPvoid_t) (&(Pls->ls_Value))); + else + return ((PPvoid_t) NULL); // string does not match + } + else + { + COPYSTRINGtoWORD(Index, String, WORDSIZE); + + JLG(PPValue, *PPValue, Index); // decode next 4[8] bytes + if (PPValue == (PPvoid_t) NULL) // if NULL array, bail out + return ((PPvoid_t) NULL); // string does not match + + String += WORDSIZE; // advance + Len -= WORDSIZE; + } + } + +// Get remaining 1..4[8] bytes left in string + + COPYSTRINGtoWORD(Index, String, Len); + JLG(PPValue, *PPValue, Index); // decode last 1-4[8] bytes + return (PPValue); +} + +// Add string to a tree of JudyL arrays (all lengths must be same) + +static PPvoid_t +insStrJudyLTree(uint8_t * String, // string to add to tree of JudyL arrays + Word_t Len, // length of string + PPvoid_t PPValue, // pointer to root pointer + PJError_t PJError // for returning error info + ) +{ + Word_t Index; // next 4[8] bytes of String + + while (Len > WORDSIZE) // add to JudyL tree + { +// CASE 1, pointer is to a NULL, make a new ls_t leaf + + if (*PPValue == (Pvoid_t)NULL) + { + Pls_t Pls; // memory for a ls_t + Pls = (Pls_t) JudyMalloc(LS_WORDLEN(Len)); + if (Pls == NULL) + { + JU_SET_ERRNO(PJError, JU_ERRNO_NOMEM); + return (PPJERR); + } + Pls->ls_Value = 0; // clear Value word + memcpy(Pls->ls_String, String, Len); // copy to new struct + *PPValue = (Pvoid_t)SET_PLS(Pls); // mark pointer + return ((PPvoid_t) (&Pls->ls_Value)); // return ^ to Value + } // no exit here +// CASE 2: is a ls_t, free (and shorten), then decode into JudyL tree + + if (IS_PLS(*PPValue)) // pointer to a ls_t? (leaf) + { + Pls_t Pls; // ^ to ls_t + uint8_t *String0; // ^ to string in ls_t + Word_t Index0; // 4[8] bytes in string + Word_t FreeLen; // length of ls_t + PPvoid_t PPsplit; + + FreeLen = LS_WORDLEN(Len); // length of ls_t + + Pls = (Pls_t) CLEAR_PLS(*PPValue); // demangle ^ to ls_t + String0 = Pls->ls_String; + if (memcmp(String, String0, Len) == 0) // check if match? 
+ { + return ((PPvoid_t) (&Pls->ls_Value)); // yes, duplicate + } + + *PPValue = NULL; // clear ^ to ls_t and make JudyL + +// This do loop is technically not required, saves multiple JudyFree() +// when storing already sorted strings into structure + + do // decode next 4[8] bytes of string + { // with a JudyL array +// Note: string0 is always aligned + + COPYSTRINGtoWORD(Index0, String0, WORDSIZE); + String0 += WORDSIZE; + COPYSTRINGtoWORD(Index, String, WORDSIZE); + String += WORDSIZE; + Len -= WORDSIZE; + PPsplit = PPValue; // save for split below + PPValue = JudyLIns(PPValue, Index0, PJError); + if (PPValue == PPJERR) + { + JU_SET_ERRNO(PJError, 0); + return (PPJERR); + } + + } while ((Index0 == Index) && (Len > WORDSIZE)); + +// finish storing remainder of string that was in the ls_t + + PPValue = insStrJudyLTree(String0, Len, PPValue, PJError); + if (PPValue == PPJERR) + { + return (PPJERR); + } +// copy old Value to Value in new struct + + *(PWord_t)PPValue = Pls->ls_Value; + +// free the string buffer (ls_t) + + JudyFree((Pvoid_t)Pls, FreeLen); + PPValue = JudyLIns(PPsplit, Index, PJError); + if (PPValue == PPJERR) + { + JU_SET_ERRNO(PJError, 0); + return (PPValue); + } + +// finish remainder of newly inserted string + + PPValue = insStrJudyLTree(String, Len, PPValue, PJError); + return (PPValue); + } // no exit here +// CASE 3, more JudyL arrays, decode to next tree + + COPYSTRINGtoWORD(Index, String, WORDSIZE); + Len -= WORDSIZE; + String += WORDSIZE; + + PPValue = JudyLIns(PPValue, Index, PJError); // next 4[8] bytes + if (PPValue == PPJERR) + { + JU_SET_ERRNO(PJError, 0); + return (PPValue); + } + } +// this is done outside of loop so "Len" can be an unsigned number + + COPYSTRINGtoWORD(Index, String, Len); + PPValue = JudyLIns(PPValue, Index, PJError); // remaining 4[8] bytes + + return (PPValue); +} + + +// Insert string to JudyHS structure, return pointer to associated Value + +PPvoid_t +JudyHSIns(PPvoid_t PPArray, // ^ to JudyHashArray name + void * Str, // pointer to string + Word_t Len, // length of string + PJError_t PJError // optional, for returning error info + ) +{ + uint8_t * String = (uint8_t *)Str; + PPvoid_t PPValue; + +// string can only be NULL if Len is 0. + + if ((String == (uint8_t *) NULL) && (Len != 0UL)) + { + JU_SET_ERRNO(PJError, JU_ERRNO_NULLPINDEX); + return (PPJERR); + } + JLG(PPValue, *PPArray, Len); // JudyL hash table for strings of Len + if (PPValue == (PPvoid_t) NULL) // make new if missing, (very rare) + { + PPValue = JudyLIns(PPArray, Len, PJError); + if (PPValue == PPJERR) + { + JU_SET_ERRNO(PJError, 0); + return (PPJERR); + } + } +#ifndef DONOTUSEHASH + if (Len > WORDSIZE) + { + uint32_t HValue; // hash of input string + JUDYHASHSTR(HValue, String, Len); // hash to no more than 32 bits + PPValue = JudyLIns(PPValue, (Word_t)HValue, PJError); + if (PPValue == PPJERR) + { + JU_SET_ERRNO(PJError, 0); + return (PPJERR); + } + } +#endif // DONOTUSEHASH + + PPValue = insStrJudyLTree(String, Len, PPValue, PJError); // add string + return (PPValue); // ^ to Value +} + +// Delete string from tree of JudyL arrays (all Lens must be same) + +static int +delStrJudyLTree(uint8_t * String, // delete from tree of JudyL arrays + Word_t Len, // length of string + PPvoid_t PPValue, // ^ to hash bucket + PJError_t PJError // for returning error info + ) +{ + PPvoid_t PPValueN; // next pointer + Word_t Index; + int Ret; // -1=failed, 1=success, 2=quit del + + if (IS_PLS(*PPValue)) // is pointer to ls_t? 
+ { + Pls_t Pls; + Pls = (Pls_t) CLEAR_PLS(*PPValue); // demangle pointer + JudyFree((Pvoid_t)Pls, LS_WORDLEN(Len)); // free the ls_t + + *PPValue = (Pvoid_t)NULL; // clean pointer + return (1); // successfully deleted + } + + if (Len > WORDSIZE) // delete from JudyL tree, not leaf + { + COPYSTRINGtoWORD(Index, String, WORDSIZE); // get Index + JLG(PPValueN, *PPValue, Index); // get pointer to next JudyL array + + String += WORDSIZE; // advance to next 4[8] bytes + Len -= WORDSIZE; + + Ret = delStrJudyLTree(String, Len, PPValueN, PJError); + if (Ret != 1) return(Ret); + + if (*PPValueN == (PPvoid_t) NULL) + { +// delete JudyL element from tree + + Ret = JudyLDel(PPValue, Index, PJError); + } + } + else + { + COPYSTRINGtoWORD(Index, String, Len); // get leaf element + +// delete last 1-4[8] bytes from leaf element + + Ret = JudyLDel(PPValue, Index, PJError); + } + return (Ret); +} + +// Delete string from JHS structure + +int +JudyHSDel(PPvoid_t PPArray, // ^ to JudyHashArray struct + void * Str, // pointer to string + Word_t Len, // length of string + PJError_t PJError // optional, for returning error info + ) +{ + uint8_t * String = (uint8_t *)Str; + PPvoid_t PPBucket, PPHtble; + int Ret; // return bool from Delete routine +#ifndef DONOTUSEHASH + uint32_t HValue = 0; // hash value of input string +#endif // DONOTUSEHASH + + if (PPArray == NULL) + return (0); // no pointer, return not found + +// This is a little slower than optimum method, but not much in new CPU +// Verify that string is in the structure -- simplifies future assumptions + + if (JudyHSGet(*PPArray, String, Len) == (PPvoid_t) NULL) + return (0); // string not found, return + +// string is in structure, so testing for absence is not necessary + + JLG(PPHtble, *PPArray, Len); // JudyL hash table for strings of Len + +#ifdef DONOTUSEHASH + PPBucket = PPHtble; // simulate below code +#else // USEHASH + if (Len > WORDSIZE) + { + JUDYHASHSTR(HValue, String, Len); // hash to no more than 32 bits + +// get pointer to hash bucket + + JLG(PPBucket, *PPHtble, (Word_t)HValue); + } + else + { + PPBucket = PPHtble; // no bucket to JLGet + } +#endif // USEHASH + +// delete from JudyL tree +// + Ret = delStrJudyLTree(String, Len, PPBucket, PJError); + if (Ret != 1) + { + JU_SET_ERRNO(PJError, 0); + return(-1); + } +// handle case of missing JudyL array from hash table and length table + + if (*PPBucket == (Pvoid_t)NULL) // if JudyL tree gone + { +#ifndef DONOTUSEHASH + if (Len > WORDSIZE) + { +// delete entry in Hash table + + Ret = JudyLDel(PPHtble, (Word_t)HValue, PJError); + if (Ret != 1) + { + JU_SET_ERRNO(PJError, 0); + return(-1); + } + } +#endif // USEHASH + if (*PPHtble == (PPvoid_t) NULL) // if Hash table gone + { +// delete entry from the String length table + + Ret = JudyLDel(PPArray, Len, PJError); + if (Ret != 1) + { + JU_SET_ERRNO(PJError, 0); + return(-1); + } + } + } + return (1); // success +} + +static Word_t +delJudyLTree(PPvoid_t PPValue, // ^ to JudyL root pointer + Word_t Len, // length of string + PJError_t PJError) // for returning error info +{ + Word_t bytes_freed = 0; // bytes freed at point + Word_t bytes_total = 0; // accumulated bytes freed + PPvoid_t PPValueN; + +// Pointer is to another tree of JudyL arrays or ls_t struct + + if (Len > WORDSIZE) // more depth to tree + { + Word_t NEntry; + +// Pointer is to a ls_t struct + + if (IS_PLS(*PPValue)) + { + Pls_t Pls; + Word_t freewords; + + freewords = LS_WORDLEN(Len); // calculate length + Pls = (Pls_t)CLEAR_PLS(*PPValue); // demangle pointer + +// *PPValue = 
(Pvoid_t)NULL; // clean pointer
+ JudyFree((Pvoid_t)Pls, freewords); // free the ls_t
+
+ return(freewords * WORDSIZE);
+ }
+// else
+// Walk all the entries in the JudyL array
+
+ NEntry = 0; // start at beginning
+ for (PPValueN = JudyLFirst(*PPValue, &NEntry, PJError);
+ (PPValueN != (PPvoid_t) NULL) && (PPValueN != PPJERR);
+ PPValueN = JudyLNext(*PPValue, &NEntry, PJError))
+ {
+// recurse to the next level in the tree of arrays
+
+ bytes_freed = delJudyLTree(PPValueN, Len - WORDSIZE, PJError);
+ if (bytes_freed == JERR) return(JERR);
+ bytes_total += bytes_freed;
+ }
+ if (PPValueN == PPJERR) return(JERR);
+
+// now free this JudyL array
+
+ bytes_freed = JudyLFreeArray(PPValue, PJError);
+ if (bytes_freed == JERR) return(JERR);
+ bytes_total += bytes_freed;
+
+ return(bytes_total); // return amount freed
+ }
+// else
+
+// Pointer to simple JudyL array
+
+ bytes_freed = JudyLFreeArray(PPValue, PJError);
+
+ return(bytes_freed);
+}
+
+
+Word_t // bytes freed
+JudyHSFreeArray(PPvoid_t PPArray, // ^ to JudyHashArray struct
+ PJError_t PJError // optional, for returning error info
+ )
+{
+ Word_t Len; // start at beginning
+ Word_t bytes_freed; // bytes freed at this level.
+ Word_t bytes_total; // bytes total at all levels.
+ PPvoid_t PPHtble;
+
+ if (PPArray == NULL)
+ return (0); // no pointer, return none
+
+// Walk the string length table for subsidiary hash structs
+// NOTE: This is necessary to determine the depth of the tree
+
+ bytes_freed = 0;
+ bytes_total = 0;
+ Len = 0; // walk to length table
+
+ for (PPHtble = JudyLFirst(*PPArray, &Len, PJError);
+ (PPHtble != (PPvoid_t) NULL) && (PPHtble != PPJERR);
+ PPHtble = JudyLNext(*PPArray, &Len, PJError))
+ {
+ PPvoid_t PPValueH;
+
+#ifndef DONOTUSEHASH
+ if (Len > WORDSIZE)
+ {
+ Word_t HEntry = 0; // walk the hash tables
+
+ for (PPValueH = JudyLFirst(*PPHtble, &HEntry, PJError);
+ (PPValueH != (PPvoid_t) NULL) && (PPValueH != PPJERR);
+ PPValueH = JudyLNext(*PPHtble, &HEntry, PJError))
+ {
+ bytes_freed = delJudyLTree(PPValueH, Len, PJError);
+ if (bytes_freed == JERR) return(JERR);
+ bytes_total += bytes_freed;
+ }
+
+ if (PPValueH == PPJERR) return(JERR);
+
+// free the Hash table for this length of string
+
+ bytes_freed = JudyLFreeArray(PPHtble, PJError);
+ if (bytes_freed == JERR) return(JERR);
+ bytes_total += bytes_freed;
+ }
+ else
+#endif // DONOTUSEHASH
+ {
+ PPValueH = PPHtble; // simulate hash table
+
+ bytes_freed = delJudyLTree(PPValueH, Len, PJError);
+ if (bytes_freed == JERR) return(JERR);
+ bytes_total += bytes_freed;
+ }
+ }
+ if (PPHtble == PPJERR) return(JERR);
+
+// free the length table
+
+ bytes_freed = JudyLFreeArray(PPArray, PJError);
+ if (bytes_freed == JERR) return(JERR);
+
+ bytes_total += bytes_freed;
+
+ return(bytes_total); // return bytes freed
+}
diff --git a/src/libnetdata/libjudy/src/JudyL/JudyL.h b/src/libnetdata/libjudy/src/JudyL/JudyL.h
new file mode 100644
index 00000000..d901969d
--- /dev/null
+++ b/src/libnetdata/libjudy/src/JudyL/JudyL.h
@@ -0,0 +1,505 @@
+#ifndef _JUDYL_INCLUDED
+#define _JUDYL_INCLUDED
+// _________________
+//
+// Copyright (C) 2000 - 2002 Hewlett-Packard Company
+//
+// This program is free software; you can redistribute it and/or modify it
+// under the term of the GNU Lesser General Public License as published by the
+// Free Software Foundation; either version 2 of the License, or (at your
+// option) any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License +// for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this program; if not, write to the Free Software Foundation, +// Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// _________________ + +// @(#) $Revision: 4.41 $ $Source: /judy/src/JudyL/JudyL.h $ + +// **************************************************************************** +// JUDYL -- SMALL/LARGE AND/OR CLUSTERED/SPARSE ARRAYS +// +// -by- +// +// Douglas L. Baskins +// doug@sourcejudy.com +// +// Judy arrays are designed to be used instead of arrays. The performance +// suggests the reason why Judy arrays are thought of as arrays, instead of +// trees. They are remarkably memory efficient at all populations. +// Implemented as a hybrid digital tree (but really a state machine, see +// below), Judy arrays feature fast insert/retrievals, fast near neighbor +// searching, and contain a population tree for extremely fast ordinal related +// retrievals. +// +// CONVENTIONS: +// +// - The comments here refer to 32-bit [64-bit] systems. +// +// - BranchL, LeafL refer to linear branches and leaves (small populations), +// except LeafL does not actually appear as such; rather, Leaf1..3 [Leaf1..7] +// is used to represent leaf Index sizes, and LeafW refers to a Leaf with +// full (Long) word Indexes, which is also a type of linear leaf. Note that +// root-level LeafW (Leaf4 [Leaf8]) leaves are called LEAFW. +// +// - BranchB, LeafB1 refer to bitmap branches and leaves (intermediate +// populations). +// +// - BranchU refers to uncompressed branches. An uncompressed branch has 256 +// JPs, some of which could be null. Note: All leaves are compressed (and +// sorted), or else an expanse is full (FullPopu), so there is no LeafU +// equivalent to BranchU. +// +// - "Popu" is short for "Population". +// - "Pop1" refers to actual population (base 1). +// - "Pop0" refers to Pop1 - 1 (base 0), the way populations are stored in data +// structures. +// +// - Branches and Leaves are both named by the number of bytes in their Pop0 +// field. In the case of Leaves, the same number applies to the Index sizes. +// +// - The representation of many numbers as hex is a relatively safe and +// portable way to get desired bitpatterns as unsigned longs. +// +// - Some preprocessors cant handle single apostrophe characters within +// #ifndef code, so here, delete all instead. + + +#include "JudyPrivate.h" // includes Judy.h in turn. +#include "JudyPrivateBranch.h" // support for branches. + + +// **************************************************************************** +// JUDYL ROOT POINTER (JRP) AND JUDYL POINTER (JP) TYPE FIELDS +// **************************************************************************** + +typedef enum // uint8_t -- but C does not support this type of enum. +{ + +// JP NULL TYPES: +// +// There is a series of cJL_JPNULL* Types because each one pre-records a +// different Index Size for when the first Index is inserted in the previously +// null JP. They must start >= 8 (three bits). +// +// Note: These Types must be in sequential order for doing relative +// calculations between them. + + cJL_JPNULL1 = 1, + // Index Size 1[1] byte when 1 Index inserted. + cJL_JPNULL2, // Index Size 2[2] bytes when 1 Index inserted. 
+ cJL_JPNULL3, // Index Size 3[3] bytes when 1 Index inserted. + +#ifndef JU_64BIT +#define cJL_JPNULLMAX cJL_JPNULL3 +#else + cJL_JPNULL4, // Index Size 4[4] bytes when 1 Index inserted. + cJL_JPNULL5, // Index Size 5[5] bytes when 1 Index inserted. + cJL_JPNULL6, // Index Size 6[6] bytes when 1 Index inserted. + cJL_JPNULL7, // Index Size 7[7] bytes when 1 Index inserted. +#define cJL_JPNULLMAX cJL_JPNULL7 +#endif + + +// JP BRANCH TYPES: +// +// Note: There are no state-1 branches; only leaves reside at state 1. + +// Linear branches: +// +// Note: These Types must be in sequential order for doing relative +// calculations between them. + + cJL_JPBRANCH_L2, // 2[2] bytes Pop0, 1[5] bytes Dcd. + cJL_JPBRANCH_L3, // 3[3] bytes Pop0, 0[4] bytes Dcd. + +#ifdef JU_64BIT + cJL_JPBRANCH_L4, // [4] bytes Pop0, [3] bytes Dcd. + cJL_JPBRANCH_L5, // [5] bytes Pop0, [2] bytes Dcd. + cJL_JPBRANCH_L6, // [6] bytes Pop0, [1] byte Dcd. + cJL_JPBRANCH_L7, // [7] bytes Pop0, [0] bytes Dcd. +#endif + + cJL_JPBRANCH_L, // note: DcdPopO field not used. + +// Bitmap branches: +// +// Note: These Types must be in sequential order for doing relative +// calculations between them. + + cJL_JPBRANCH_B2, // 2[2] bytes Pop0, 1[5] bytes Dcd. + cJL_JPBRANCH_B3, // 3[3] bytes Pop0, 0[4] bytes Dcd. + +#ifdef JU_64BIT + cJL_JPBRANCH_B4, // [4] bytes Pop0, [3] bytes Dcd. + cJL_JPBRANCH_B5, // [5] bytes Pop0, [2] bytes Dcd. + cJL_JPBRANCH_B6, // [6] bytes Pop0, [1] byte Dcd. + cJL_JPBRANCH_B7, // [7] bytes Pop0, [0] bytes Dcd. +#endif + + cJL_JPBRANCH_B, // note: DcdPopO field not used. + +// Uncompressed branches: +// +// Note: These Types must be in sequential order for doing relative +// calculations between them. + + cJL_JPBRANCH_U2, // 2[2] bytes Pop0, 1[5] bytes Dcd. + cJL_JPBRANCH_U3, // 3[3] bytes Pop0, 0[4] bytes Dcd. + +#ifdef JU_64BIT + cJL_JPBRANCH_U4, // [4] bytes Pop0, [3] bytes Dcd. + cJL_JPBRANCH_U5, // [5] bytes Pop0, [2] bytes Dcd. + cJL_JPBRANCH_U6, // [6] bytes Pop0, [1] byte Dcd. + cJL_JPBRANCH_U7, // [7] bytes Pop0, [0] bytes Dcd. +#endif + + cJL_JPBRANCH_U, // note: DcdPopO field not used. + + +// JP LEAF TYPES: + +// Linear leaves: +// +// Note: These Types must be in sequential order for doing relative +// calculations between them. +// +// Note: There is no full-word (4-byte [8-byte]) Index leaf under a JP because +// non-root-state leaves only occur under branches that decode at least one +// byte. Full-word, root-state leaves are under a JRP, not a JP. However, in +// the code a "fake" JP can be created temporarily above a root-state leaf. + + cJL_JPLEAF1, // 1[1] byte Pop0, 2 bytes Dcd. + cJL_JPLEAF2, // 2[2] bytes Pop0, 1[5] bytes Dcd. + cJL_JPLEAF3, // 3[3] bytes Pop0, 0[4] bytes Dcd. + +#ifdef JU_64BIT + cJL_JPLEAF4, // [4] bytes Pop0, [3] bytes Dcd. + cJL_JPLEAF5, // [5] bytes Pop0, [2] bytes Dcd. + cJL_JPLEAF6, // [6] bytes Pop0, [1] byte Dcd. + cJL_JPLEAF7, // [7] bytes Pop0, [0] bytes Dcd. +#endif + +// Bitmap leaf; Index Size == 1: +// +// Note: These are currently only supported at state 1. At other states the +// bitmap would grow from 256 to 256^2, 256^3, ... bits, which would not be +// efficient.. + + cJL_JPLEAF_B1, // 1[1] byte Pop0, 2[6] bytes Dcd. + +// Full population; Index Size == 1 virtual leaf: +// +// Note: JudyL has no cJL_JPFULLPOPU1 equivalent to cJ1_JPFULLPOPU1, because +// in the JudyL case this could result in a values-only leaf of up to 256 words +// (value areas) that would be slow to insert/delete. 
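+
+// Illustrative example (a sketch for this rewrite, not from the original
+// Judy sources): reading the "N[M]" annotations above as 32-bit[64-bit],
+// cJL_JPLEAF3 has "3[3] bytes Pop0, 0[4] bytes Dcd", so a Leaf3 holding
+// 20 Indexes stores Pop0 == 19 in 3 bytes on both word sizes, while only
+// the 64-bit JP also carries 4 decoded (Dcd) Index bytes.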
+ + +// JP IMMEDIATES; leaves (Indexes) stored inside a JP: +// +// The second numeric suffix is the Pop1 for each type. As the Index Size +// increases, the maximum possible population decreases. +// +// Note: These Types must be in sequential order in each group (Index Size), +// and the groups in correct order too, for doing relative calculations between +// them. For example, since these Types enumerate the Pop1 values (unlike +// other JP Types where there is a Pop0 value in the JP), the maximum Pop1 for +// each Index Size is computable. +// +// All enums equal or above this point are cJL_JPIMMEDs. + + cJL_JPIMMED_1_01, // Index Size = 1, Pop1 = 1. + cJL_JPIMMED_2_01, // Index Size = 2, Pop1 = 1. + cJL_JPIMMED_3_01, // Index Size = 3, Pop1 = 1. + +#ifdef JU_64BIT + cJL_JPIMMED_4_01, // Index Size = 4, Pop1 = 1. + cJL_JPIMMED_5_01, // Index Size = 5, Pop1 = 1. + cJL_JPIMMED_6_01, // Index Size = 6, Pop1 = 1. + cJL_JPIMMED_7_01, // Index Size = 7, Pop1 = 1. +#endif + + cJL_JPIMMED_1_02, // Index Size = 1, Pop1 = 2. + cJL_JPIMMED_1_03, // Index Size = 1, Pop1 = 3. + +#ifdef JU_64BIT + cJL_JPIMMED_1_04, // Index Size = 1, Pop1 = 4. + cJL_JPIMMED_1_05, // Index Size = 1, Pop1 = 5. + cJL_JPIMMED_1_06, // Index Size = 1, Pop1 = 6. + cJL_JPIMMED_1_07, // Index Size = 1, Pop1 = 7. + + cJL_JPIMMED_2_02, // Index Size = 2, Pop1 = 2. + cJL_JPIMMED_2_03, // Index Size = 2, Pop1 = 3. + + cJL_JPIMMED_3_02, // Index Size = 3, Pop1 = 2. +#endif + +// This special Type is merely a sentinel for doing relative calculations. +// This value should not be used in switch statements (to avoid allocating code +// for it), which is also why it appears at the end of the enum list. + + cJL_JPIMMED_CAP + +} jpL_Type_t; + + +// RELATED VALUES: + +// Index Size (state) for leaf JP, and JP type based on Index Size (state): + +#define JL_LEAFINDEXSIZE(jpType) ((jpType) - cJL_JPLEAF1 + 1) +#define JL_LEAFTYPE(IndexSize) ((IndexSize) + cJL_JPLEAF1 - 1) + + +// MAXIMUM POPULATIONS OF LINEAR LEAVES: + +#ifndef JU_64BIT // 32-bit + +#define J_L_MAXB (sizeof(Word_t) * 64) +#define ALLOCSIZES { 3, 5, 7, 11, 15, 23, 32, 47, 64, TERMINATOR } // in words. +#define cJL_LEAF1_MAXWORDS (32) // max Leaf1 size in words. + +// Note: cJL_LEAF1_MAXPOP1 is chosen such that the index portion is less than +// 32 bytes -- the number of bytes the index takes in a bitmap leaf. + +#define cJL_LEAF1_MAXPOP1 \ + ((cJL_LEAF1_MAXWORDS * cJU_BYTESPERWORD)/(1 + cJU_BYTESPERWORD)) +#define cJL_LEAF2_MAXPOP1 (J_L_MAXB / (2 + cJU_BYTESPERWORD)) +#define cJL_LEAF3_MAXPOP1 (J_L_MAXB / (3 + cJU_BYTESPERWORD)) +#define cJL_LEAFW_MAXPOP1 \ + ((J_L_MAXB - cJU_BYTESPERWORD) / (2 * cJU_BYTESPERWORD)) + +#else // 64-bit + +#define J_L_MAXB (sizeof(Word_t) * 64) +#define ALLOCSIZES { 3, 5, 7, 11, 15, 23, 32, 47, 64, TERMINATOR } // in words. +#define cJL_LEAF1_MAXWORDS (15) // max Leaf1 size in words. 
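+
+// Worked example (illustrative, not from the original Judy sources): with
+// cJU_BYTESPERWORD == 8 in this 64-bit branch, J_L_MAXB == 8 * 64 == 512
+// bytes, so the macros below give cJL_LEAF2_MAXPOP1 == 512 / (2 + 8) == 51
+// and cJL_LEAF7_MAXPOP1 == 512 / (7 + 8) == 34: each Index in a JudyL
+// linear leaf costs its Index bytes plus one full word for its Value.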
+ +#define cJL_LEAF1_MAXPOP1 \ + ((cJL_LEAF1_MAXWORDS * cJU_BYTESPERWORD)/(1 + cJU_BYTESPERWORD)) +#define cJL_LEAF2_MAXPOP1 (J_L_MAXB / (2 + cJU_BYTESPERWORD)) +#define cJL_LEAF3_MAXPOP1 (J_L_MAXB / (3 + cJU_BYTESPERWORD)) +#define cJL_LEAF4_MAXPOP1 (J_L_MAXB / (4 + cJU_BYTESPERWORD)) +#define cJL_LEAF5_MAXPOP1 (J_L_MAXB / (5 + cJU_BYTESPERWORD)) +#define cJL_LEAF6_MAXPOP1 (J_L_MAXB / (6 + cJU_BYTESPERWORD)) +#define cJL_LEAF7_MAXPOP1 (J_L_MAXB / (7 + cJU_BYTESPERWORD)) +#define cJL_LEAFW_MAXPOP1 \ + ((J_L_MAXB - cJU_BYTESPERWORD) / (2 * cJU_BYTESPERWORD)) + +#endif // 64-bit + + +// MAXIMUM POPULATIONS OF IMMEDIATE JPs: +// +// These specify the maximum Population of immediate JPs with various Index +// Sizes (== sizes of remaining undecoded Index bits). Since the JP Types enum +// already lists all the immediates in order by state and size, calculate these +// values from it to avoid redundancy. + +#define cJL_IMMED1_MAXPOP1 ((cJU_BYTESPERWORD - 1) / 1) // 3 [7]. +#define cJL_IMMED2_MAXPOP1 ((cJU_BYTESPERWORD - 1) / 2) // 1 [3]. +#define cJL_IMMED3_MAXPOP1 ((cJU_BYTESPERWORD - 1) / 3) // 1 [2]. + +#ifdef JU_64BIT +#define cJL_IMMED4_MAXPOP1 ((cJU_BYTESPERWORD - 1) / 4) // [1]. +#define cJL_IMMED5_MAXPOP1 ((cJU_BYTESPERWORD - 1) / 5) // [1]. +#define cJL_IMMED6_MAXPOP1 ((cJU_BYTESPERWORD - 1) / 6) // [1]. +#define cJL_IMMED7_MAXPOP1 ((cJU_BYTESPERWORD - 1) / 7) // [1]. +#endif + + +// **************************************************************************** +// JUDYL LEAF BITMAP (JLLB) SUPPORT +// **************************************************************************** +// +// Assemble bitmap leaves out of smaller units that put bitmap subexpanses +// close to their associated pointers. Why not just use a bitmap followed by a +// series of pointers? (See 4.27.) Turns out this wastes a cache fill on +// systems with smaller cache lines than the assumed value cJU_WORDSPERCL. + +#define JL_JLB_BITMAP(Pjlb, Subexp) ((Pjlb)->jLlb_jLlbs[Subexp].jLlbs_Bitmap) +#define JL_JLB_PVALUE(Pjlb, Subexp) ((Pjlb)->jLlb_jLlbs[Subexp].jLlbs_PValue) + +typedef struct J__UDYL_LEAF_BITMAP_SUBEXPANSE +{ + BITMAPL_t jLlbs_Bitmap; + Pjv_t jLlbs_PValue; + +} jLlbs_t; + +typedef struct J__UDYL_LEAF_BITMAP +{ + jLlbs_t jLlb_jLlbs[cJU_NUMSUBEXPL]; + +} jLlb_t, * PjLlb_t; + +// Words per bitmap leaf: + +#define cJL_WORDSPERLEAFB1 (sizeof(jLlb_t) / cJU_BYTESPERWORD) + + +// **************************************************************************** +// MEMORY ALLOCATION SUPPORT +// **************************************************************************** + +// ARRAY-GLOBAL INFORMATION: +// +// At the cost of an occasional additional cache fill, this object, which is +// pointed at by a JRP and in turn points to a JP_BRANCH*, carries array-global +// information about a JudyL array that has sufficient population to amortize +// the cost. The jpm_Pop0 field prevents having to add up the total population +// for the array in insert, delete, and count code. The jpm_JP field prevents +// having to build a fake JP for entry to a state machine; however, the +// jp_DcdPopO field in jpm_JP, being one byte too small, is not used. +// +// Note: Struct fields are ordered to keep "hot" data in the first 8 words +// (see left-margin comments) for machines with 8-word cache lines, and to keep +// sub-word fields together for efficient packing. + +typedef struct J_UDYL_POPULATION_AND_MEMORY +{ +/* 1 */ Word_t jpm_Pop0; // total population-1 in array. +/* 2 */ jp_t jpm_JP; // JP to first branch; see above. 
+/* 4 */ Word_t jpm_LastUPop0; // last jpm_Pop0 when convert to BranchU +/* 7 */ Pjv_t jpm_PValue; // pointer to value to return. +// Note: Field names match PJError_t for convenience in macros: +/* 8 */ char je_Errno; // one of the enums in Judy.h. +/* 8/9 */ int je_ErrID; // often an internal source line number. +/* 9/10 */ Word_t jpm_TotalMemWords; // words allocated in array. +} jLpm_t, *PjLpm_t; + + +// TABLES FOR DETERMINING IF LEAVES HAVE ROOM TO GROW: +// +// These tables indicate if a given memory chunk can support growth of a given +// object into wasted (rounded-up) memory in the chunk. Note: This violates +// the hiddenness of the JudyMalloc code. + +extern const uint8_t j__L_Leaf1PopToWords[cJL_LEAF1_MAXPOP1 + 1]; +extern const uint8_t j__L_Leaf2PopToWords[cJL_LEAF2_MAXPOP1 + 1]; +extern const uint8_t j__L_Leaf3PopToWords[cJL_LEAF3_MAXPOP1 + 1]; +#ifdef JU_64BIT +extern const uint8_t j__L_Leaf4PopToWords[cJL_LEAF4_MAXPOP1 + 1]; +extern const uint8_t j__L_Leaf5PopToWords[cJL_LEAF5_MAXPOP1 + 1]; +extern const uint8_t j__L_Leaf6PopToWords[cJL_LEAF6_MAXPOP1 + 1]; +extern const uint8_t j__L_Leaf7PopToWords[cJL_LEAF7_MAXPOP1 + 1]; +#endif +extern const uint8_t j__L_LeafWPopToWords[cJL_LEAFW_MAXPOP1 + 1]; +extern const uint8_t j__L_LeafVPopToWords[]; + +// These tables indicate where value areas start: + +extern const uint8_t j__L_Leaf1Offset [cJL_LEAF1_MAXPOP1 + 1]; +extern const uint8_t j__L_Leaf2Offset [cJL_LEAF2_MAXPOP1 + 1]; +extern const uint8_t j__L_Leaf3Offset [cJL_LEAF3_MAXPOP1 + 1]; +#ifdef JU_64BIT +extern const uint8_t j__L_Leaf4Offset [cJL_LEAF4_MAXPOP1 + 1]; +extern const uint8_t j__L_Leaf5Offset [cJL_LEAF5_MAXPOP1 + 1]; +extern const uint8_t j__L_Leaf6Offset [cJL_LEAF6_MAXPOP1 + 1]; +extern const uint8_t j__L_Leaf7Offset [cJL_LEAF7_MAXPOP1 + 1]; +#endif +extern const uint8_t j__L_LeafWOffset [cJL_LEAFW_MAXPOP1 + 1]; + +// Also define macros to hide the details in the code using these tables. 
+ +#define JL_LEAF1GROWINPLACE(Pop1) \ + J__U_GROWCK(Pop1, cJL_LEAF1_MAXPOP1, j__L_Leaf1PopToWords) +#define JL_LEAF2GROWINPLACE(Pop1) \ + J__U_GROWCK(Pop1, cJL_LEAF2_MAXPOP1, j__L_Leaf2PopToWords) +#define JL_LEAF3GROWINPLACE(Pop1) \ + J__U_GROWCK(Pop1, cJL_LEAF3_MAXPOP1, j__L_Leaf3PopToWords) +#ifdef JU_64BIT +#define JL_LEAF4GROWINPLACE(Pop1) \ + J__U_GROWCK(Pop1, cJL_LEAF4_MAXPOP1, j__L_Leaf4PopToWords) +#define JL_LEAF5GROWINPLACE(Pop1) \ + J__U_GROWCK(Pop1, cJL_LEAF5_MAXPOP1, j__L_Leaf5PopToWords) +#define JL_LEAF6GROWINPLACE(Pop1) \ + J__U_GROWCK(Pop1, cJL_LEAF6_MAXPOP1, j__L_Leaf6PopToWords) +#define JL_LEAF7GROWINPLACE(Pop1) \ + J__U_GROWCK(Pop1, cJL_LEAF7_MAXPOP1, j__L_Leaf7PopToWords) +#endif +#define JL_LEAFWGROWINPLACE(Pop1) \ + J__U_GROWCK(Pop1, cJL_LEAFW_MAXPOP1, j__L_LeafWPopToWords) +#define JL_LEAFVGROWINPLACE(Pop1) \ + J__U_GROWCK(Pop1, cJU_BITSPERSUBEXPL, j__L_LeafVPopToWords) + +#define JL_LEAF1VALUEAREA(Pjv,Pop1) (((PWord_t)(Pjv)) + j__L_Leaf1Offset[Pop1]) +#define JL_LEAF2VALUEAREA(Pjv,Pop1) (((PWord_t)(Pjv)) + j__L_Leaf2Offset[Pop1]) +#define JL_LEAF3VALUEAREA(Pjv,Pop1) (((PWord_t)(Pjv)) + j__L_Leaf3Offset[Pop1]) +#ifdef JU_64BIT +#define JL_LEAF4VALUEAREA(Pjv,Pop1) (((PWord_t)(Pjv)) + j__L_Leaf4Offset[Pop1]) +#define JL_LEAF5VALUEAREA(Pjv,Pop1) (((PWord_t)(Pjv)) + j__L_Leaf5Offset[Pop1]) +#define JL_LEAF6VALUEAREA(Pjv,Pop1) (((PWord_t)(Pjv)) + j__L_Leaf6Offset[Pop1]) +#define JL_LEAF7VALUEAREA(Pjv,Pop1) (((PWord_t)(Pjv)) + j__L_Leaf7Offset[Pop1]) +#endif +#define JL_LEAFWVALUEAREA(Pjv,Pop1) (((PWord_t)(Pjv)) + j__L_LeafWOffset[Pop1]) + +#define JL_LEAF1POPTOWORDS(Pop1) (j__L_Leaf1PopToWords[Pop1]) +#define JL_LEAF2POPTOWORDS(Pop1) (j__L_Leaf2PopToWords[Pop1]) +#define JL_LEAF3POPTOWORDS(Pop1) (j__L_Leaf3PopToWords[Pop1]) +#ifdef JU_64BIT +#define JL_LEAF4POPTOWORDS(Pop1) (j__L_Leaf4PopToWords[Pop1]) +#define JL_LEAF5POPTOWORDS(Pop1) (j__L_Leaf5PopToWords[Pop1]) +#define JL_LEAF6POPTOWORDS(Pop1) (j__L_Leaf6PopToWords[Pop1]) +#define JL_LEAF7POPTOWORDS(Pop1) (j__L_Leaf7PopToWords[Pop1]) +#endif +#define JL_LEAFWPOPTOWORDS(Pop1) (j__L_LeafWPopToWords[Pop1]) +#define JL_LEAFVPOPTOWORDS(Pop1) (j__L_LeafVPopToWords[Pop1]) + + +// FUNCTIONS TO ALLOCATE OBJECTS: + +PjLpm_t j__udyLAllocJLPM(void); // constant size. + +Pjbl_t j__udyLAllocJBL( PjLpm_t); // constant size. +Pjbb_t j__udyLAllocJBB( PjLpm_t); // constant size. +Pjp_t j__udyLAllocJBBJP(Word_t, PjLpm_t); +Pjbu_t j__udyLAllocJBU( PjLpm_t); // constant size. + +Pjll_t j__udyLAllocJLL1( Word_t, PjLpm_t); +Pjll_t j__udyLAllocJLL2( Word_t, PjLpm_t); +Pjll_t j__udyLAllocJLL3( Word_t, PjLpm_t); + +#ifdef JU_64BIT +Pjll_t j__udyLAllocJLL4( Word_t, PjLpm_t); +Pjll_t j__udyLAllocJLL5( Word_t, PjLpm_t); +Pjll_t j__udyLAllocJLL6( Word_t, PjLpm_t); +Pjll_t j__udyLAllocJLL7( Word_t, PjLpm_t); +#endif + +Pjlw_t j__udyLAllocJLW( Word_t ); // no PjLpm_t needed. +PjLlb_t j__udyLAllocJLB1( PjLpm_t); // constant size. +Pjv_t j__udyLAllocJV( Word_t, PjLpm_t); + + +// FUNCTIONS TO FREE OBJECTS: + +void j__udyLFreeJLPM( PjLpm_t, PjLpm_t); // constant size. + +void j__udyLFreeJBL( Pjbl_t, PjLpm_t); // constant size. +void j__udyLFreeJBB( Pjbb_t, PjLpm_t); // constant size. +void j__udyLFreeJBBJP(Pjp_t, Word_t, PjLpm_t); +void j__udyLFreeJBU( Pjbu_t, PjLpm_t); // constant size. 
+ +void j__udyLFreeJLL1( Pjll_t, Word_t, PjLpm_t); +void j__udyLFreeJLL2( Pjll_t, Word_t, PjLpm_t); +void j__udyLFreeJLL3( Pjll_t, Word_t, PjLpm_t); + +#ifdef JU_64BIT +void j__udyLFreeJLL4( Pjll_t, Word_t, PjLpm_t); +void j__udyLFreeJLL5( Pjll_t, Word_t, PjLpm_t); +void j__udyLFreeJLL6( Pjll_t, Word_t, PjLpm_t); +void j__udyLFreeJLL7( Pjll_t, Word_t, PjLpm_t); +#endif + +void j__udyLFreeJLW( Pjlw_t, Word_t, PjLpm_t); +void j__udyLFreeJLB1( PjLlb_t, PjLpm_t); // constant size. +void j__udyLFreeJV( Pjv_t, Word_t, PjLpm_t); +void j__udyLFreeSM( Pjp_t, PjLpm_t); // everything below Pjp. + +#endif // ! _JUDYL_INCLUDED diff --git a/src/libnetdata/libjudy/src/JudyL/JudyLByCount.c b/src/libnetdata/libjudy/src/JudyL/JudyLByCount.c new file mode 100644 index 00000000..c5a00479 --- /dev/null +++ b/src/libnetdata/libjudy/src/JudyL/JudyLByCount.c @@ -0,0 +1,954 @@ +// Copyright (C) 2000 - 2002 Hewlett-Packard Company +// +// This program is free software; you can redistribute it and/or modify it +// under the term of the GNU Lesser General Public License as published by the +// Free Software Foundation; either version 2 of the License, or (at your +// option) any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License +// for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this program; if not, write to the Free Software Foundation, +// Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// _________________ + +// @(#) $Revision: 4.28 $ $Source: /judy/src/JudyCommon/JudyByCount.c $ +// +// Judy*ByCount() function for Judy1 and JudyL. +// Compile with one of -DJUDY1 or -DJUDYL. +// +// Compile with -DNOSMARTJBB, -DNOSMARTJBU, and/or -DNOSMARTJLB to build a +// version with cache line optimizations deleted, for testing. +// +// Judy*ByCount() is a conceptual although not literal inverse of Judy*Count(). +// Judy*Count() takes a pair of Indexes, and allows finding the ordinal of a +// given Index (that is, its position in the list of valid indexes from the +// beginning) as a degenerate case, because in general the count between two +// Indexes, inclusive, is not always just the difference in their ordinals. +// However, it suffices for Judy*ByCount() to simply be an ordinal-to-Index +// mapper. +// +// Note: Like Judy*Count(), this code must "count sideways" in branches, which +// can result in a lot of cache line fills. However, unlike Judy*Count(), this +// code does not receive a specific Index, hence digit, where to start in each +// branch, so it cant accurately calculate cache line fills required in each +// direction. The best it can do is an approximation based on the total +// population of the expanse (pop1 from Pjp) and the ordinal of the target +// Index (see SETOFFSET()) within the expanse. +// +// Compile with -DSMARTMETRICS to obtain global variables containing smart +// cache line metrics. Note: Dont turn this on simultaneously for this file +// and JudyCount.c because they export the same globals. +// **************************************************************************** + +#if (! (defined(JUDY1) || defined(JUDYL))) +#error: One of -DJUDY1 or -DJUDYL must be specified. 
+#endif + +#ifdef JUDY1 +#include "Judy1.h" +#else +#include "JudyL.h" +#endif + +#include "JudyPrivate1L.h" + +// These are imported from JudyCount.c: +// +// TBD: Should this be in common code? Exported from a header file? + +#ifdef JUDY1 +extern Word_t j__udy1JPPop1(const Pjp_t Pjp); +#define j__udyJPPop1 j__udy1JPPop1 +#else +extern Word_t j__udyLJPPop1(const Pjp_t Pjp); +#define j__udyJPPop1 j__udyLJPPop1 +#endif + +// Avoid duplicate symbols since this file is multi-compiled: + +#ifdef SMARTMETRICS +#ifdef JUDY1 +Word_t jbb_upward = 0; // counts of directions taken: +Word_t jbb_downward = 0; +Word_t jbu_upward = 0; +Word_t jbu_downward = 0; +Word_t jlb_upward = 0; +Word_t jlb_downward = 0; +#else +extern Word_t jbb_upward; +extern Word_t jbb_downward; +extern Word_t jbu_upward; +extern Word_t jbu_downward; +extern Word_t jlb_upward; +extern Word_t jlb_downward; +#endif +#endif + + +// **************************************************************************** +// J U D Y 1 B Y C O U N T +// J U D Y L B Y C O U N T +// +// See the manual entry. + +#ifdef JUDY1 +FUNCTION int Judy1ByCount +#else +FUNCTION PPvoid_t JudyLByCount +#endif + ( + Pcvoid_t PArray, // root pointer to first branch/leaf in SM. + Word_t Count, // ordinal of Index to find, 1..MAX. + Word_t * PIndex, // to return found Index. + PJError_t PJError // optional, for returning error info. + ) +{ + Word_t Count0; // Count, base-0, to match pop0. + Word_t state; // current state in SM. + Word_t pop1; // of current branch or leaf, or of expanse. + Word_t pop1lower; // pop1 of expanses (JPs) below that for Count. + Word_t digit; // current word in branch. + Word_t jpcount; // JPs in a BranchB subexpanse. + long jpnum; // JP number in a branch (base 0). + long subexp; // for stepping through layer 1 (subexpanses). + int offset; // index ordinal within a leaf, base 0. + + Pjp_t Pjp; // current JP in branch. + Pjll_t Pjll; // current Judy linear leaf. + + +// CHECK FOR EMPTY ARRAY OR NULL PINDEX: + + if (PArray == (Pvoid_t) NULL) JU_RET_NOTFOUND; + + if (PIndex == (PWord_t) NULL) + { + JU_SET_ERRNO(PJError, JU_ERRNO_NULLPINDEX); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + +// Convert Count to Count0; assume special case of Count = 0 maps to ~0, as +// desired, to represent the last index in a full array: +// +// Note: Think of Count0 as a reliable "number of Indexes below the target." + + Count0 = Count - 1; + assert((Count || Count0 == ~0)); // ensure CPU is sane about 0 - 1. + pop1lower = 0; + + if (JU_LEAFW_POP0(PArray) < cJU_LEAFW_MAXPOP1) // must be a LEAFW + { + Pjlw_t Pjlw = P_JLW(PArray); // first word of leaf. + + if (Count0 > Pjlw[0]) JU_RET_NOTFOUND; // too high. + + *PIndex = Pjlw[Count]; // Index, base 1. + + JU_RET_FOUND_LEAFW(Pjlw, Pjlw[0] + 1, Count0); + } + else + { + Pjpm_t Pjpm = P_JPM(PArray); + + if (Count0 > (Pjpm->jpm_Pop0)) JU_RET_NOTFOUND; // too high. + + Pjp = &(Pjpm->jpm_JP); + pop1 = (Pjpm->jpm_Pop0) + 1; + +// goto SMByCount; + } + +// COMMON CODE: +// +// Prepare to handle a root-level or lower-level branch: Save the current +// state, obtain the total population for the branch in a state-dependent way, +// and then branch to common code for multiple cases. +// +// For root-level branches, the state is always cJU_ROOTSTATE, and the array +// population must already be set in pop1; it is not available in jp_DcdPopO. +// +// Note: The total population is only needed in cases where the common code +// "counts down" instead of up to minimize cache line fills. 
However, its +// available cheaply, and its better to do it with a constant shift (constant +// state value) instead of a variable shift later "when needed". + +#define PREPB_ROOT(Next) \ + state = cJU_ROOTSTATE; \ + goto Next + +// Use PREPB_DCD() to first copy the Dcd bytes to *PIndex if there are any +// (only if state < cJU_ROOTSTATE - 1): + +#define PREPB_DCD(Pjp,cState,Next) \ + JU_SETDCD(*PIndex, Pjp, cState); \ + PREPB((Pjp), cState, Next) + +#define PREPB(Pjp,cState,Next) \ + state = (cState); \ + pop1 = JU_JPBRANCH_POP0(Pjp, (cState)) + 1; \ + goto Next + +// Calculate whether the ordinal of an Index within a given expanse falls in +// the lower or upper half of the expanses population, taking care with +// unsigned math and boundary conditions: +// +// Note: Assume the ordinal falls within the expanses population, that is, +// 0 < (Count - Pop1lower) <= Pop1exp (assuming infinite math). +// +// Note: If the ordinal is the middle element, it doesnt matter whether +// LOWERHALF() is TRUE or FALSE. + +#define LOWERHALF(Count0,Pop1lower,Pop1exp) \ + (((Count0) - (Pop1lower)) < ((Pop1exp) / 2)) + +// Calculate the (signed) offset within a leaf to the desired ordinal (Count - +// Pop1lower; offset is one less), and optionally ensure its in range: + +#define SETOFFSET(Offset,Count0,Pop1lower,Pjp) \ + (Offset) = (Count0) - (Pop1lower); \ + assert((Offset) >= 0); \ + assert((Offset) <= JU_JPLEAF_POP0(Pjp)) + +// Variations for immediate indexes, with and without pop1-specific assertions: + +#define SETOFFSET_IMM_CK(Offset,Count0,Pop1lower,cPop1) \ + (Offset) = (Count0) - (Pop1lower); \ + assert((Offset) >= 0); \ + assert((Offset) < (cPop1)) + +#define SETOFFSET_IMM(Offset,Count0,Pop1lower) \ + (Offset) = (Count0) - (Pop1lower) + + +// STATE MACHINE -- TRAVERSE TREE: +// +// In branches, look for the expanse (digit), if any, where the total pop1 +// below or at that expanse would meet or exceed Count, meaning the Index must +// be in this expanse. + +SMByCount: // return here for next branch/leaf. + + switch (JU_JPTYPE(Pjp)) + { + + +// ---------------------------------------------------------------------------- +// LINEAR BRANCH; count populations in JPs in the JBL upwards until finding the +// expanse (digit) containing Count, and "recurse". +// +// Note: There are no null JPs in a JBL; watch out for pop1 == 0. +// +// Note: A JBL should always fit in one cache line => no need to count up +// versus down to save cache line fills. +// +// TBD: The previous is no longer true. Consider enhancing this code to count +// up/down, but it can wait for a later tuning phase. In the meantime, PREPB() +// sets pop1 for the whole array, but that value is not used here. 001215: +// Maybe its true again? 
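+
+// Worked example (illustrative, not from the original Judy sources): with
+// Count0 == 9 and pop1lower == 4, a JP whose expanse holds pop1 == 6
+// satisfies pop1lower + pop1 > Count0 (4 + 6 == 10 > 9), so the target
+// Index lies under that JP and the code below descends into it; otherwise
+// pop1lower grows by 6 and the scan advances to the next JP.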
+ + case cJU_JPBRANCH_L2: PREPB_DCD(Pjp, 2, BranchL); +#ifndef JU_64BIT + case cJU_JPBRANCH_L3: PREPB( Pjp, 3, BranchL); +#else + case cJU_JPBRANCH_L3: PREPB_DCD(Pjp, 3, BranchL); + case cJU_JPBRANCH_L4: PREPB_DCD(Pjp, 4, BranchL); + case cJU_JPBRANCH_L5: PREPB_DCD(Pjp, 5, BranchL); + case cJU_JPBRANCH_L6: PREPB_DCD(Pjp, 6, BranchL); + case cJU_JPBRANCH_L7: PREPB( Pjp, 7, BranchL); +#endif + case cJU_JPBRANCH_L: PREPB_ROOT( BranchL); + { + Pjbl_t Pjbl; + +// Common code (state-independent) for all cases of linear branches: + +BranchL: + Pjbl = P_JBL(Pjp->jp_Addr); + + for (jpnum = 0; jpnum < (Pjbl->jbl_NumJPs); ++jpnum) + { + if ((pop1 = j__udyJPPop1((Pjbl->jbl_jp) + jpnum)) + == cJU_ALLONES) + { + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + assert(pop1 != 0); + +// Warning: pop1lower and pop1 are unsigned, so do not subtract 1 and compare +// >=, but instead use the following expression: + + if (pop1lower + pop1 > Count0) // Index is in this expanse. + { + JU_SETDIGIT(*PIndex, Pjbl->jbl_Expanse[jpnum], state); + Pjp = (Pjbl->jbl_jp) + jpnum; + goto SMByCount; // look under this expanse. + } + + pop1lower += pop1; // add this JPs pop1. + } + + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); // should never get here. + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + + } // case cJU_JPBRANCH_L + + +// ---------------------------------------------------------------------------- +// BITMAP BRANCH; count populations in JPs in the JBB upwards or downwards +// until finding the expanse (digit) containing Count, and "recurse". +// +// Note: There are no null JPs in a JBB; watch out for pop1 == 0. + + case cJU_JPBRANCH_B2: PREPB_DCD(Pjp, 2, BranchB); +#ifndef JU_64BIT + case cJU_JPBRANCH_B3: PREPB( Pjp, 3, BranchB); +#else + case cJU_JPBRANCH_B3: PREPB_DCD(Pjp, 3, BranchB); + case cJU_JPBRANCH_B4: PREPB_DCD(Pjp, 4, BranchB); + case cJU_JPBRANCH_B5: PREPB_DCD(Pjp, 5, BranchB); + case cJU_JPBRANCH_B6: PREPB_DCD(Pjp, 6, BranchB); + case cJU_JPBRANCH_B7: PREPB( Pjp, 7, BranchB); +#endif + case cJU_JPBRANCH_B: PREPB_ROOT( BranchB); + { + Pjbb_t Pjbb; + +// Common code (state-independent) for all cases of bitmap branches: + +BranchB: + Pjbb = P_JBB(Pjp->jp_Addr); + +// Shorthand for one subexpanse in a bitmap and for one JP in a bitmap branch: +// +// Note: BMPJP0 exists separately to support assertions. + +#define BMPJP0(Subexp) (P_JP(JU_JBB_PJP(Pjbb, Subexp))) +#define BMPJP(Subexp,JPnum) (BMPJP0(Subexp) + (JPnum)) + + +// Common code for descending through a JP: +// +// Determine the digit for the expanse and save it in *PIndex; then "recurse". + +#define JBB_FOUNDEXPANSE \ + { \ + JU_BITMAPDIGITB(digit, subexp, JU_JBB_BITMAP(Pjbb,subexp), jpnum); \ + JU_SETDIGIT(*PIndex, digit, state); \ + Pjp = BMPJP(subexp, jpnum); \ + goto SMByCount; \ + } + + +#ifndef NOSMARTJBB // enable to turn off smart code for comparison purposes. + +// FIGURE OUT WHICH DIRECTION CAUSES FEWER CACHE LINE FILLS; adding the pop1s +// in JPs upwards, or subtracting the pop1s in JPs downwards: +// +// See header comments about limitations of this for Judy*ByCount(). + +#endif + +// COUNT UPWARD, adding each "below" JPs pop1: + +#ifndef NOSMARTJBB // enable to turn off smart code for comparison purposes. 
+ + if (LOWERHALF(Count0, pop1lower, pop1)) + { +#endif +#ifdef SMARTMETRICS + ++jbb_upward; +#endif + for (subexp = 0; subexp < cJU_NUMSUBEXPB; ++subexp) + { + if ((jpcount = j__udyCountBitsB(JU_JBB_BITMAP(Pjbb,subexp))) + && (BMPJP0(subexp) == (Pjp_t) NULL)) + { + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); // null ptr. + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + +// Note: An empty subexpanse (jpcount == 0) is handled "for free": + + for (jpnum = 0; jpnum < jpcount; ++jpnum) + { + if ((pop1 = j__udyJPPop1(BMPJP(subexp, jpnum))) + == cJU_ALLONES) + { + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + assert(pop1 != 0); + +// Warning: pop1lower and pop1 are unsigned, see earlier comment: + + if (pop1lower + pop1 > Count0) + JBB_FOUNDEXPANSE; // Index is in this expanse. + + pop1lower += pop1; // add this JPs pop1. + } + } +#ifndef NOSMARTJBB // enable to turn off smart code for comparison purposes. + } + + +// COUNT DOWNWARD, subtracting each "above" JPs pop1 from the whole expanses +// pop1: + + else + { +#ifdef SMARTMETRICS + ++jbb_downward; +#endif + pop1lower += pop1; // add whole branch to start. + + for (subexp = cJU_NUMSUBEXPB - 1; subexp >= 0; --subexp) + { + if ((jpcount = j__udyCountBitsB(JU_JBB_BITMAP(Pjbb, subexp))) + && (BMPJP0(subexp) == (Pjp_t) NULL)) + { + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); // null ptr. + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + +// Note: An empty subexpanse (jpcount == 0) is handled "for free": + + for (jpnum = jpcount - 1; jpnum >= 0; --jpnum) + { + if ((pop1 = j__udyJPPop1(BMPJP(subexp, jpnum))) + == cJU_ALLONES) + { + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + assert(pop1 != 0); + +// Warning: pop1lower and pop1 are unsigned, see earlier comment: + + pop1lower -= pop1; + +// Beware unsigned math problems: + + if ((pop1lower == 0) || (pop1lower - 1 < Count0)) + JBB_FOUNDEXPANSE; // Index is in this expanse. + } + } + } +#endif // NOSMARTJBB + + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); // should never get here. + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + + } // case cJU_JPBRANCH_B + + +// ---------------------------------------------------------------------------- +// UNCOMPRESSED BRANCH; count populations in JPs in the JBU upwards or +// downwards until finding the expanse (digit) containing Count, and "recurse". + + case cJU_JPBRANCH_U2: PREPB_DCD(Pjp, 2, BranchU); +#ifndef JU_64BIT + case cJU_JPBRANCH_U3: PREPB( Pjp, 3, BranchU); +#else + case cJU_JPBRANCH_U3: PREPB_DCD(Pjp, 3, BranchU); + case cJU_JPBRANCH_U4: PREPB_DCD(Pjp, 4, BranchU); + case cJU_JPBRANCH_U5: PREPB_DCD(Pjp, 5, BranchU); + case cJU_JPBRANCH_U6: PREPB_DCD(Pjp, 6, BranchU); + case cJU_JPBRANCH_U7: PREPB( Pjp, 7, BranchU); +#endif + case cJU_JPBRANCH_U: PREPB_ROOT( BranchU); + { + Pjbu_t Pjbu; + +// Common code (state-independent) for all cases of uncompressed branches: + +BranchU: + Pjbu = P_JBU(Pjp->jp_Addr); + +// Common code for descending through a JP: +// +// Save the digit for the expanse in *PIndex, then "recurse". + +#define JBU_FOUNDEXPANSE \ + { \ + JU_SETDIGIT(*PIndex, jpnum, state); \ + Pjp = (Pjbu->jbu_jp) + jpnum; \ + goto SMByCount; \ + } + + +#ifndef NOSMARTJBU // enable to turn off smart code for comparison purposes. 
+ +// FIGURE OUT WHICH DIRECTION CAUSES FEWER CACHE LINE FILLS; adding the pop1s +// in JPs upwards, or subtracting the pop1s in JPs downwards: +// +// See header comments about limitations of this for Judy*ByCount(). + +#endif + +// COUNT UPWARD, simply adding the pop1 of each JP: + +#ifndef NOSMARTJBU // enable to turn off smart code for comparison purposes. + + if (LOWERHALF(Count0, pop1lower, pop1)) + { +#endif +#ifdef SMARTMETRICS + ++jbu_upward; +#endif + + for (jpnum = 0; jpnum < cJU_BRANCHUNUMJPS; ++jpnum) + { + // shortcut, save a function call: + + if ((Pjbu->jbu_jp[jpnum].jp_Type) <= cJU_JPNULLMAX) + continue; + + if ((pop1 = j__udyJPPop1((Pjbu->jbu_jp) + jpnum)) + == cJU_ALLONES) + { + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + assert(pop1 != 0); + +// Warning: pop1lower and pop1 are unsigned, see earlier comment: + + if (pop1lower + pop1 > Count0) + JBU_FOUNDEXPANSE; // Index is in this expanse. + + pop1lower += pop1; // add this JPs pop1. + } +#ifndef NOSMARTJBU // enable to turn off smart code for comparison purposes. + } + + +// COUNT DOWNWARD, subtracting the pop1 of each JP above from the whole +// expanses pop1: + + else + { +#ifdef SMARTMETRICS + ++jbu_downward; +#endif + pop1lower += pop1; // add whole branch to start. + + for (jpnum = cJU_BRANCHUNUMJPS - 1; jpnum >= 0; --jpnum) + { + // shortcut, save a function call: + + if ((Pjbu->jbu_jp[jpnum].jp_Type) <= cJU_JPNULLMAX) + continue; + + if ((pop1 = j__udyJPPop1(Pjbu->jbu_jp + jpnum)) + == cJU_ALLONES) + { + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + assert(pop1 != 0); + +// Warning: pop1lower and pop1 are unsigned, see earlier comment: + + pop1lower -= pop1; + +// Beware unsigned math problems: + + if ((pop1lower == 0) || (pop1lower - 1 < Count0)) + JBU_FOUNDEXPANSE; // Index is in this expanse. + } + } +#endif // NOSMARTJBU + + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); // should never get here. + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + + } // case cJU_JPBRANCH_U + +// ---------------------------------------------------------------------------- +// LINEAR LEAF: +// +// Return the Index at the proper ordinal (see SETOFFSET()) in the leaf. First +// copy Dcd bytes, if there are any (only if state < cJU_ROOTSTATE - 1), to +// *PIndex. +// +// Note: The preceding branch traversal code MIGHT set pop1 for this expanse +// (linear leaf) as a side-effect, but dont depend on that (for JUDYL, which +// is the only cases that need it anyway). + +#define PREPL_DCD(cState) \ + JU_SETDCD(*PIndex, Pjp, cState); \ + PREPL + +#ifdef JUDY1 +#define PREPL_SETPOP1 // not needed in any cases. +#else +#define PREPL_SETPOP1 pop1 = JU_JPLEAF_POP0(Pjp) + 1 +#endif + +#define PREPL \ + Pjll = P_JLL(Pjp->jp_Addr); \ + PREPL_SETPOP1; \ + SETOFFSET(offset, Count0, pop1lower, Pjp) + +#if (defined(JUDYL) || (! 
defined(JU_64BIT))) + case cJU_JPLEAF1: + + PREPL_DCD(1); + JU_SETDIGIT1(*PIndex, ((uint8_t *) Pjll)[offset]); + JU_RET_FOUND_LEAF1(Pjll, pop1, offset); +#endif + + case cJU_JPLEAF2: + + PREPL_DCD(2); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(2))) + | ((uint16_t *) Pjll)[offset]; + JU_RET_FOUND_LEAF2(Pjll, pop1, offset); + +#ifndef JU_64BIT + case cJU_JPLEAF3: + { + Word_t lsb; + PREPL; + JU_COPY3_PINDEX_TO_LONG(lsb, ((uint8_t *) Pjll) + (3 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(3))) | lsb; + JU_RET_FOUND_LEAF3(Pjll, pop1, offset); + } + +#else + case cJU_JPLEAF3: + { + Word_t lsb; + PREPL_DCD(3); + JU_COPY3_PINDEX_TO_LONG(lsb, ((uint8_t *) Pjll) + (3 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(3))) | lsb; + JU_RET_FOUND_LEAF3(Pjll, pop1, offset); + } + + case cJU_JPLEAF4: + + PREPL_DCD(4); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(4))) + | ((uint32_t *) Pjll)[offset]; + JU_RET_FOUND_LEAF4(Pjll, pop1, offset); + + case cJU_JPLEAF5: + { + Word_t lsb; + PREPL_DCD(5); + JU_COPY5_PINDEX_TO_LONG(lsb, ((uint8_t *) Pjll) + (5 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(5))) | lsb; + JU_RET_FOUND_LEAF5(Pjll, pop1, offset); + } + + case cJU_JPLEAF6: + { + Word_t lsb; + PREPL_DCD(6); + JU_COPY6_PINDEX_TO_LONG(lsb, ((uint8_t *) Pjll) + (6 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(6))) | lsb; + JU_RET_FOUND_LEAF6(Pjll, pop1, offset); + } + + case cJU_JPLEAF7: + { + Word_t lsb; + PREPL; + JU_COPY7_PINDEX_TO_LONG(lsb, ((uint8_t *) Pjll) + (7 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(7))) | lsb; + JU_RET_FOUND_LEAF7(Pjll, pop1, offset); + } +#endif + + +// ---------------------------------------------------------------------------- +// BITMAP LEAF: +// +// Return the Index at the proper ordinal (see SETOFFSET()) in the leaf by +// counting bits. First copy Dcd bytes (always present since state 1 < +// cJU_ROOTSTATE) to *PIndex. +// +// Note: The preceding branch traversal code MIGHT set pop1 for this expanse +// (bitmap leaf) as a side-effect, but dont depend on that. + + case cJU_JPLEAF_B1: + { + Pjlb_t Pjlb; + + JU_SETDCD(*PIndex, Pjp, 1); + Pjlb = P_JLB(Pjp->jp_Addr); + pop1 = JU_JPLEAF_POP0(Pjp) + 1; + +// COUNT UPWARD, adding the pop1 of each subexpanse: +// +// The entire bitmap should fit in one cache line, but still try to save some +// CPU time by counting the fewest possible number of subexpanses from the +// bitmap. +// +// See header comments about limitations of this for Judy*ByCount(). + +#ifndef NOSMARTJLB // enable to turn off smart code for comparison purposes. + + if (LOWERHALF(Count0, pop1lower, pop1)) + { +#endif +#ifdef SMARTMETRICS + ++jlb_upward; +#endif + for (subexp = 0; subexp < cJU_NUMSUBEXPL; ++subexp) + { + pop1 = j__udyCountBitsL(JU_JLB_BITMAP(Pjlb, subexp)); + +// Warning: pop1lower and pop1 are unsigned, see earlier comment: + + if (pop1lower + pop1 > Count0) + goto LeafB1; // Index is in this subexpanse. + + pop1lower += pop1; // add this subexpanses pop1. + } +#ifndef NOSMARTJLB // enable to turn off smart code for comparison purposes. + } + + +// COUNT DOWNWARD, subtracting each "above" subexpanses pop1 from the whole +// expanses pop1: + + else + { +#ifdef SMARTMETRICS + ++jlb_downward; +#endif + pop1lower += pop1; // add whole leaf to start. + + for (subexp = cJU_NUMSUBEXPL - 1; subexp >= 0; --subexp) + { + pop1lower -= j__udyCountBitsL(JU_JLB_BITMAP(Pjlb, subexp)); + +// Beware unsigned math problems: + + if ((pop1lower == 0) || (pop1lower - 1 < Count0)) + goto LeafB1; // Index is in this subexpanse. 
+ } + } +#endif // NOSMARTJLB + + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); // should never get here. + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + + +// RETURN INDEX FOUND: +// +// Come here with subexp set to the correct subexpanse, and pop1lower set to +// the sum for all lower expanses and subexpanses in the Judy tree. Calculate +// and save in *PIndex the digit corresponding to the ordinal in this +// subexpanse. + +LeafB1: + SETOFFSET(offset, Count0, pop1lower, Pjp); + JU_BITMAPDIGITL(digit, subexp, JU_JLB_BITMAP(Pjlb, subexp), offset); + JU_SETDIGIT1(*PIndex, digit); + JU_RET_FOUND_LEAF_B1(Pjlb, subexp, offset); +// == return((PPvoid_t) (P_JV(JL_JLB_PVALUE(Pjlb, subexp)) + offset)) + + } // case cJU_JPLEAF_B1 + + +#ifdef JUDY1 +// ---------------------------------------------------------------------------- +// FULL POPULATION: +// +// Copy Dcd bytes (always present since state 1 < cJU_ROOTSTATE) to *PIndex, +// then set the appropriate digit for the ordinal (see SETOFFSET()) in the leaf +// as the LSB in *PIndex. + + case cJ1_JPFULLPOPU1: + + JU_SETDCD(*PIndex, Pjp, 1); + SETOFFSET(offset, Count0, pop1lower, Pjp); + assert(offset >= 0); + assert(offset <= cJU_JPFULLPOPU1_POP0); + JU_SETDIGIT1(*PIndex, offset); + JU_RET_FOUND_FULLPOPU1; +#endif + + +// ---------------------------------------------------------------------------- +// IMMEDIATE: +// +// Locate the Index with the proper ordinal (see SETOFFSET()) in the Immediate, +// depending on leaf Index Size and pop1. Note: There are no Dcd bytes in an +// Immediate JP, but in a cJU_JPIMMED_*_01 JP, the field holds the least bytes +// of the immediate Index. + +#define SET_01(cState) JU_SETDIGITS(*PIndex, JU_JPDCDPOP0(Pjp), cState) + + case cJU_JPIMMED_1_01: SET_01(1); goto Imm_01; + case cJU_JPIMMED_2_01: SET_01(2); goto Imm_01; + case cJU_JPIMMED_3_01: SET_01(3); goto Imm_01; +#ifdef JU_64BIT + case cJU_JPIMMED_4_01: SET_01(4); goto Imm_01; + case cJU_JPIMMED_5_01: SET_01(5); goto Imm_01; + case cJU_JPIMMED_6_01: SET_01(6); goto Imm_01; + case cJU_JPIMMED_7_01: SET_01(7); goto Imm_01; +#endif + +Imm_01: + + DBGCODE(SETOFFSET_IMM_CK(offset, Count0, pop1lower, 1);) + JU_RET_FOUND_IMM_01(Pjp); + +// Shorthand for where to find start of Index bytes array: + +#ifdef JUDY1 +#define PJI (Pjp->jp_1Index) +#else +#define PJI (Pjp->jp_LIndex) +#endif + +// Optional code to check the remaining ordinal (see SETOFFSET_IMM()) against +// the Index Size of the Immediate: + +#ifndef DEBUG // simple placeholder: +#define IMM(cPop1,Next) \ + goto Next +#else // extra pop1-specific checking: +#define IMM(cPop1,Next) \ + SETOFFSET_IMM_CK(offset, Count0, pop1lower, cPop1); \ + goto Next +#endif + + case cJU_JPIMMED_1_02: IMM( 2, Imm1); + case cJU_JPIMMED_1_03: IMM( 3, Imm1); +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_1_04: IMM( 4, Imm1); + case cJU_JPIMMED_1_05: IMM( 5, Imm1); + case cJU_JPIMMED_1_06: IMM( 6, Imm1); + case cJU_JPIMMED_1_07: IMM( 7, Imm1); +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_1_08: IMM( 8, Imm1); + case cJ1_JPIMMED_1_09: IMM( 9, Imm1); + case cJ1_JPIMMED_1_10: IMM(10, Imm1); + case cJ1_JPIMMED_1_11: IMM(11, Imm1); + case cJ1_JPIMMED_1_12: IMM(12, Imm1); + case cJ1_JPIMMED_1_13: IMM(13, Imm1); + case cJ1_JPIMMED_1_14: IMM(14, Imm1); + case cJ1_JPIMMED_1_15: IMM(15, Imm1); +#endif + +Imm1: SETOFFSET_IMM(offset, Count0, pop1lower); + JU_SETDIGIT1(*PIndex, ((uint8_t *) PJI)[offset]); + JU_RET_FOUND_IMM(Pjp, offset); + +#if (defined(JUDY1) || defined(JU_64BIT)) + case 
cJU_JPIMMED_2_02: IMM(2, Imm2); + case cJU_JPIMMED_2_03: IMM(3, Imm2); +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_2_04: IMM(4, Imm2); + case cJ1_JPIMMED_2_05: IMM(5, Imm2); + case cJ1_JPIMMED_2_06: IMM(6, Imm2); + case cJ1_JPIMMED_2_07: IMM(7, Imm2); +#endif + +#if (defined(JUDY1) || defined(JU_64BIT)) +Imm2: SETOFFSET_IMM(offset, Count0, pop1lower); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(2))) + | ((uint16_t *) PJI)[offset]; + JU_RET_FOUND_IMM(Pjp, offset); +#endif + +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_3_02: IMM(2, Imm3); +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_3_03: IMM(3, Imm3); + case cJ1_JPIMMED_3_04: IMM(4, Imm3); + case cJ1_JPIMMED_3_05: IMM(5, Imm3); +#endif + +#if (defined(JUDY1) || defined(JU_64BIT)) +Imm3: + { + Word_t lsb; + SETOFFSET_IMM(offset, Count0, pop1lower); + JU_COPY3_PINDEX_TO_LONG(lsb, ((uint8_t *) PJI) + (3 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(3))) | lsb; + JU_RET_FOUND_IMM(Pjp, offset); + } +#endif + +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_4_02: IMM(2, Imm4); + case cJ1_JPIMMED_4_03: IMM(3, Imm4); + +Imm4: SETOFFSET_IMM(offset, Count0, pop1lower); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(4))) + | ((uint32_t *) PJI)[offset]; + JU_RET_FOUND_IMM(Pjp, offset); + + case cJ1_JPIMMED_5_02: IMM(2, Imm5); + case cJ1_JPIMMED_5_03: IMM(3, Imm5); + +Imm5: + { + Word_t lsb; + SETOFFSET_IMM(offset, Count0, pop1lower); + JU_COPY5_PINDEX_TO_LONG(lsb, ((uint8_t *) PJI) + (5 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(5))) | lsb; + JU_RET_FOUND_IMM(Pjp, offset); + } + + case cJ1_JPIMMED_6_02: IMM(2, Imm6); + +Imm6: + { + Word_t lsb; + SETOFFSET_IMM(offset, Count0, pop1lower); + JU_COPY6_PINDEX_TO_LONG(lsb, ((uint8_t *) PJI) + (6 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(6))) | lsb; + JU_RET_FOUND_IMM(Pjp, offset); + } + + case cJ1_JPIMMED_7_02: IMM(2, Imm7); + +Imm7: + { + Word_t lsb; + SETOFFSET_IMM(offset, Count0, pop1lower); + JU_COPY7_PINDEX_TO_LONG(lsb, ((uint8_t *) PJI) + (7 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(7))) | lsb; + JU_RET_FOUND_IMM(Pjp, offset); + } +#endif // (JUDY1 && JU_64BIT) + + +// ---------------------------------------------------------------------------- +// UNEXPECTED JP TYPES: + + default: JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + + } // SMByCount switch. + + /*NOTREACHED*/ + +} // Judy1ByCount() / JudyLByCount() diff --git a/src/libnetdata/libjudy/src/JudyL/JudyLCascade.c b/src/libnetdata/libjudy/src/JudyL/JudyLCascade.c new file mode 100644 index 00000000..c1a26f41 --- /dev/null +++ b/src/libnetdata/libjudy/src/JudyL/JudyLCascade.c @@ -0,0 +1,1943 @@ +// Copyright (C) 2000 - 2002 Hewlett-Packard Company +// +// This program is free software; you can redistribute it and/or modify it +// under the term of the GNU Lesser General Public License as published by the +// Free Software Foundation; either version 2 of the License, or (at your +// option) any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License +// for more details. 
+// +// You should have received a copy of the GNU Lesser General Public License +// along with this program; if not, write to the Free Software Foundation, +// Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// _________________ + +// @(#) $Revision: 4.38 $ $Source: /judy/src/JudyCommon/JudyCascade.c $ + +#ifdef JUDY1 +#include "Judy1.h" +#else +#include "JudyL.h" +#endif + +#include "JudyPrivate1L.h" + +extern int j__udyCreateBranchL(Pjp_t, Pjp_t, uint8_t *, Word_t, Pvoid_t); +extern int j__udyCreateBranchB(Pjp_t, Pjp_t, uint8_t *, Word_t, Pvoid_t); + +DBGCODE(extern void JudyCheckSorted(Pjll_t Pjll, Word_t Pop1, long IndexSize);) + +static const jbb_t StageJBBZero; // zeroed versions of namesake struct. + +// TBD: There are multiple copies of (some of) these CopyWto3, Copy3toW, +// CopyWto7 and Copy7toW functions in Judy1Cascade.c, JudyLCascade.c, and +// JudyDecascade.c. These static functions should probably be moved to a +// common place, made macros, or something to avoid having four copies. + + +// **************************************************************************** +// __ J U D Y C O P Y X T O W + + +FUNCTION static void j__udyCopy3toW( + PWord_t PDest, + uint8_t * PSrc, + Word_t LeafIndexes) +{ + do + { + JU_COPY3_PINDEX_TO_LONG(*PDest, PSrc); + PSrc += 3; + PDest += 1; + + } while(--LeafIndexes); + +} //j__udyCopy3toW() + + +#ifdef JU_64BIT + +FUNCTION static void j__udyCopy4toW( + PWord_t PDest, + uint32_t * PSrc, + Word_t LeafIndexes) +{ + do { *PDest++ = *PSrc++; + } while(--LeafIndexes); + +} // j__udyCopy4toW() + + +FUNCTION static void j__udyCopy5toW( + PWord_t PDest, + uint8_t * PSrc, + Word_t LeafIndexes) +{ + do + { + JU_COPY5_PINDEX_TO_LONG(*PDest, PSrc); + PSrc += 5; + PDest += 1; + + } while(--LeafIndexes); + +} // j__udyCopy5toW() + + +FUNCTION static void j__udyCopy6toW( + PWord_t PDest, + uint8_t * PSrc, + Word_t LeafIndexes) +{ + do + { + JU_COPY6_PINDEX_TO_LONG(*PDest, PSrc); + PSrc += 6; + PDest += 1; + + } while(--LeafIndexes); + +} // j__udyCopy6toW() + + +FUNCTION static void j__udyCopy7toW( + PWord_t PDest, + uint8_t * PSrc, + Word_t LeafIndexes) +{ + do + { + JU_COPY7_PINDEX_TO_LONG(*PDest, PSrc); + PSrc += 7; + PDest += 1; + + } while(--LeafIndexes); + +} // j__udyCopy7toW() + +#endif // JU_64BIT + + +// **************************************************************************** +// __ J U D Y C O P Y W T O X + + +FUNCTION static void j__udyCopyWto3( + uint8_t * PDest, + PWord_t PSrc, + Word_t LeafIndexes) +{ + do + { + JU_COPY3_LONG_TO_PINDEX(PDest, *PSrc); + PSrc += 1; + PDest += 3; + + } while(--LeafIndexes); + +} // j__udyCopyWto3() + + +#ifdef JU_64BIT + +FUNCTION static void j__udyCopyWto4( + uint8_t * PDest, + PWord_t PSrc, + Word_t LeafIndexes) +{ + uint32_t *PDest32 = (uint32_t *)PDest; + + do + { + *PDest32 = *PSrc; + PSrc += 1; + PDest32 += 1; + } while(--LeafIndexes); + +} // j__udyCopyWto4() + + +FUNCTION static void j__udyCopyWto5( + uint8_t * PDest, + PWord_t PSrc, + Word_t LeafIndexes) +{ + do + { + JU_COPY5_LONG_TO_PINDEX(PDest, *PSrc); + PSrc += 1; + PDest += 5; + + } while(--LeafIndexes); + +} // j__udyCopyWto5() + + +FUNCTION static void j__udyCopyWto6( + uint8_t * PDest, + PWord_t PSrc, + Word_t LeafIndexes) +{ + do + { + JU_COPY6_LONG_TO_PINDEX(PDest, *PSrc); + PSrc += 1; + PDest += 6; + + } while(--LeafIndexes); + +} // j__udyCopyWto6() + + +FUNCTION static void j__udyCopyWto7( + uint8_t * PDest, + PWord_t PSrc, + Word_t LeafIndexes) +{ + do + { + JU_COPY7_LONG_TO_PINDEX(PDest, *PSrc); + PSrc += 1; + PDest += 
7; + + } while(--LeafIndexes); + +} // j__udyCopyWto7() + +#endif // JU_64BIT + + +// **************************************************************************** +// COMMON CODE (MACROS): +// +// Free objects in an array of valid JPs, StageJP[ExpCnt] == last one may +// include Immeds, which are ignored. + +#define FREEALLEXIT(ExpCnt,StageJP,Pjpm) \ + { \ + Word_t _expct = (ExpCnt); \ + while (_expct--) j__udyFreeSM(&((StageJP)[_expct]), Pjpm); \ + return(-1); \ + } + +// Clear the array that keeps track of the number of JPs in a subexpanse: + +#define ZEROJP(SubJPCount) \ + { \ + int ii; \ + for (ii = 0; ii < cJU_NUMSUBEXPB; ii++) (SubJPCount[ii]) = 0; \ + } + +// **************************************************************************** +// __ J U D Y S T A G E J B B T O J B B +// +// Create a mallocd BranchB (jbb_t) from a staged BranchB while "splaying" a +// single old leaf. Return -1 if out of memory, otherwise 1. + +static int j__udyStageJBBtoJBB( + Pjp_t PjpLeaf, // JP of leaf being splayed. + Pjbb_t PStageJBB, // temp jbb_t on stack. + Pjp_t PjpArray, // array of JPs to splayed new leaves. + uint8_t * PSubCount, // count of JPs for each subexpanse. + Pjpm_t Pjpm) // the jpm_t for JudyAlloc*(). +{ + Pjbb_t PjbbRaw; // pointer to new bitmap branch. + Pjbb_t Pjbb; + Word_t subexp; + +// Get memory for new BranchB: + + if ((PjbbRaw = j__udyAllocJBB(Pjpm)) == (Pjbb_t) NULL) return(-1); + Pjbb = P_JBB(PjbbRaw); + +// Copy staged BranchB into just-allocated BranchB: + + *Pjbb = *PStageJBB; + +// Allocate the JP subarrays (BJP) for the new BranchB: + + for (subexp = 0; subexp < cJU_NUMSUBEXPB; subexp++) + { + Pjp_t PjpRaw; + Pjp_t Pjp; + Word_t NumJP; // number of JPs in each subexpanse. + + if ((NumJP = PSubCount[subexp]) == 0) continue; // empty. + +// Out of memory, back out previous allocations: + + if ((PjpRaw = j__udyAllocJBBJP(NumJP, Pjpm)) == (Pjp_t) NULL) + { + while(subexp--) + { + if ((NumJP = PSubCount[subexp]) == 0) continue; + + PjpRaw = JU_JBB_PJP(Pjbb, subexp); + j__udyFreeJBBJP(PjpRaw, NumJP, Pjpm); + } + j__udyFreeJBB(PjbbRaw, Pjpm); + return(-1); // out of memory. + } + Pjp = P_JP(PjpRaw); + +// Place the JP subarray pointer in the new BranchB, copy subarray JPs, and +// advance to the next subexpanse: + + JU_JBB_PJP(Pjbb, subexp) = PjpRaw; + JU_COPYMEM(Pjp, PjpArray, NumJP); + PjpArray += NumJP; + + } // for each subexpanse. + +// Change the PjpLeaf from Leaf to BranchB: + + PjpLeaf->jp_Addr = (Word_t) PjbbRaw; + PjpLeaf->jp_Type += cJU_JPBRANCH_B2 - cJU_JPLEAF2; // Leaf to BranchB. + + return(1); + +} // j__udyStageJBBtoJBB() + + +// **************************************************************************** +// __ J U D Y J L L 2 T O J L B 1 +// +// Create a LeafB1 (jlb_t = JLB1) from a Leaf2 (2-byte Indexes and for JudyL, +// Word_t Values). Return NULL if out of memory, else a pointer to the new +// LeafB1. +// +// NOTE: Caller must release the Leaf2 that was passed in. + +__attribute__((no_sanitize("shift"))) +FUNCTION static Pjlb_t j__udyJLL2toJLB1( + uint16_t * Pjll, // array of 16-bit indexes. +#ifdef JUDYL + Pjv_t Pjv, // array of associated values. +#endif + Word_t LeafPop1, // number of indexes/values. + Pvoid_t Pjpm) // jpm_t for JudyAlloc*()/JudyFree*(). 
+{ + Pjlb_t PjlbRaw; + Pjlb_t Pjlb; + int offset; +JUDYLCODE(int subexp;) + +// Allocate the LeafB1: + + if ((PjlbRaw = j__udyAllocJLB1(Pjpm)) == (Pjlb_t) NULL) + return((Pjlb_t) NULL); + Pjlb = P_JLB(PjlbRaw); + +// Copy Leaf2 indexes to LeafB1: + + for (offset = 0; offset < LeafPop1; ++offset) + JU_BITMAPSETL(Pjlb, Pjll[offset]); + +#ifdef JUDYL + +// Build LeafVs from bitmap: + + for (subexp = 0; subexp < cJU_NUMSUBEXPL; ++subexp) + { + struct _POINTER_VALUES + { + Word_t pv_Pop1; // size of value area. + Pjv_t pv_Pjv; // raw pointer to value area. + } pv[cJU_NUMSUBEXPL]; + +// Get the population of the subexpanse, and if any, allocate a LeafV: + + pv[subexp].pv_Pop1 = j__udyCountBitsL(JU_JLB_BITMAP(Pjlb, subexp)); + + if (pv[subexp].pv_Pop1) + { + Pjv_t Pjvnew; + +// TBD: There is an opportunity to put pop == 1 value in pointer: + + pv[subexp].pv_Pjv = j__udyLAllocJV(pv[subexp].pv_Pop1, Pjpm); + +// Upon out of memory, free all previously allocated: + + if (pv[subexp].pv_Pjv == (Pjv_t) NULL) + { + while(subexp--) + { + if (pv[subexp].pv_Pop1) + { + j__udyLFreeJV(pv[subexp].pv_Pjv, pv[subexp].pv_Pop1, + Pjpm); + } + } + j__udyFreeJLB1(PjlbRaw, Pjpm); + return((Pjlb_t) NULL); + } + + Pjvnew = P_JV(pv[subexp].pv_Pjv); + JU_COPYMEM(Pjvnew, Pjv, pv[subexp].pv_Pop1); + Pjv += pv[subexp].pv_Pop1; // advance value pointer. + +// Place raw pointer to value array in bitmap subexpanse: + + JL_JLB_PVALUE(Pjlb, subexp) = pv[subexp].pv_Pjv; + + } // populated subexpanse. + } // each subexpanse. + +#endif // JUDYL + + return(PjlbRaw); // pointer to LeafB1. + +} // j__udyJLL2toJLB1() + + +// **************************************************************************** +// __ J U D Y C A S C A D E 1 +// +// Create bitmap leaf from 1-byte Indexes and Word_t Values. +// +// TBD: There must be a better way. +// +// Only for JudyL 32 bit: (note, unifdef disallows comment on next line) + +#if (defined(JUDYL) || (! defined(JU_64BIT))) + +FUNCTION int j__udyCascade1( + Pjp_t Pjp, + Pvoid_t Pjpm) +{ + Word_t DcdP0; + uint8_t * PLeaf; + Pjlb_t PjlbRaw; + Pjlb_t Pjlb; + Word_t Pop1; + Word_t ii; // temp for loop counter +JUDYLCODE(Pjv_t Pjv;) + + assert(JU_JPTYPE(Pjp) == cJU_JPLEAF1); + assert((JU_JPDCDPOP0(Pjp) & 0xFF) == (cJU_LEAF1_MAXPOP1-1)); + + PjlbRaw = j__udyAllocJLB1(Pjpm); + if (PjlbRaw == (Pjlb_t) NULL) return(-1); + + Pjlb = P_JLB(PjlbRaw); + PLeaf = (uint8_t *) P_JLL(Pjp->jp_Addr); + Pop1 = JU_JPLEAF_POP0(Pjp) + 1; + + JUDYLCODE(Pjv = JL_LEAF1VALUEAREA(PLeaf, Pop1);) + +// Copy 1 byte index Leaf to bitmap Leaf + for (ii = 0; ii < Pop1; ii++) JU_BITMAPSETL(Pjlb, PLeaf[ii]); + +#ifdef JUDYL +// Build 8 subexpanse Value leaves from bitmap + for (ii = 0; ii < cJU_NUMSUBEXPL; ii++) + { +// Get number of Indexes in subexpanse + if ((Pop1 = j__udyCountBitsL(JU_JLB_BITMAP(Pjlb, ii)))) + { + Pjv_t PjvnewRaw; // value area of new leaf. + Pjv_t Pjvnew; + + PjvnewRaw = j__udyLAllocJV(Pop1, Pjpm); + if (PjvnewRaw == (Pjv_t) NULL) // out of memory. 
+        {
+// Free previously allocated LeafVs:
+            while(ii--)
+            {
+                if ((Pop1 = j__udyCountBitsL(JU_JLB_BITMAP(Pjlb, ii))))
+                {
+                    PjvnewRaw = JL_JLB_PVALUE(Pjlb, ii);
+                    j__udyLFreeJV(PjvnewRaw, Pop1, Pjpm);
+                }
+            }
+// Free the bitmap leaf
+            j__udyLFreeJLB1(PjlbRaw,Pjpm);
+            return(-1);
+        }
+        Pjvnew = P_JV(PjvnewRaw);
+        JU_COPYMEM(Pjvnew, Pjv, Pop1);
+
+        Pjv += Pop1;
+        JL_JLB_PVALUE(Pjlb, ii) = PjvnewRaw;
+    }
+    }
+#endif // JUDYL
+
+    DcdP0 = JU_JPDCDPOP0(Pjp) | (PLeaf[0] & cJU_DCDMASK(1));
+    JU_JPSETADT(Pjp, (Word_t)PjlbRaw, DcdP0, cJU_JPLEAF_B1);
+
+    return(1);          // return success
+
+} // j__udyCascade1()
+
+#endif // (!(JUDY1 && JU_64BIT))
+
+
+// ****************************************************************************
+// __ J U D Y   C A S C A D E 2
+//
+// Entry PLeaf of size LeafPop1 is either compressed or splayed, with a pointer
+// to the result returned in Pjp.  Entry levels run from sizeof(Word_t) down to
+// level 2.
+//
+// Splay or compress the 2-byte Index Leaf that Pjp points to.  Return *Pjp as
+// a (compressed) cJU_LEAFB1 or a cJU_BRANCH_*2.
+
+FUNCTION int j__udyCascade2(
+    Pjp_t      Pjp,
+    Pvoid_t    Pjpm)
+{
+    uint16_t * PLeaf;           // pointer to leaf, explicit type.
+    Word_t     End, Start;      // temporaries.
+    Word_t     ExpCnt;          // count of expanses of splay.
+    Word_t     CIndex;          // current Index word.
+JUDYLCODE(Pjv_t Pjv;)           // value area of leaf.
+
+// Temp staging for parts(Leaves) of newly splayed leaf
+    jp_t    StageJP   [cJU_LEAF2_MAXPOP1];  // JPs of new leaves
+    uint8_t StageExp  [cJU_LEAF2_MAXPOP1];  // Expanses of new leaves
+    uint8_t SubJPCount[cJU_NUMSUBEXPB];     // JPs in each subexpanse
+    jbb_t   StageJBB;                       // staged bitmap branch
+
+    assert(JU_JPTYPE(Pjp) == cJU_JPLEAF2);
+    assert((JU_JPDCDPOP0(Pjp) & 0xFFFF) == (cJU_LEAF2_MAXPOP1-1));
+
+// Get the address of the Leaf
+    PLeaf = (uint16_t *) P_JLL(Pjp->jp_Addr);
+
+// And its Value area
+    JUDYLCODE(Pjv = JL_LEAF2VALUEAREA(PLeaf, cJU_LEAF2_MAXPOP1);)
+
+// If Leaf is in 1 expanse -- just compress it to a Bitmap Leaf
+
+    CIndex = PLeaf[0];
+    if (!JU_DIGITATSTATE(CIndex ^ PLeaf[cJU_LEAF2_MAXPOP1-1], 2))
+    {
+// cJU_JPLEAF_B1
+        Word_t DcdP0;
+        Pjlb_t PjlbRaw;
+        PjlbRaw = j__udyJLL2toJLB1(PLeaf,
+#ifdef JUDYL
+                                   Pjv,
+#endif
+                                   cJU_LEAF2_MAXPOP1, Pjpm);
+        if (PjlbRaw == (Pjlb_t)NULL) return(-1);  // out of memory
+
+// Merge in another Dcd byte because compressing
+        DcdP0 = (CIndex & cJU_DCDMASK(1)) | JU_JPDCDPOP0(Pjp);
+        JU_JPSETADT(Pjp, (Word_t)PjlbRaw, DcdP0, cJU_JPLEAF_B1);
+
+        return(1);
+    }
+
+// Else in 2+ expanses, splay Leaf into smaller leaves at higher compression
+
+    StageJBB = StageJBBZero;  // zero staged bitmap branch
+    ZEROJP(SubJPCount);
+
+// Splay the 2 byte index Leaf to 1 byte Index Leaves
+    for (ExpCnt = Start = 0, End = 1; ; End++)
+    {
+// Check if new expanse or last one
+        if ( (End == cJU_LEAF2_MAXPOP1)
+             ||
+             (JU_DIGITATSTATE(CIndex ^ PLeaf[End], 2))
+           )
+        {
+// Build a leaf below the previous expanse
+//
+            Pjp_t  PjpJP   = StageJP + ExpCnt;
+            Word_t Pop1    = End - Start;
+            Word_t expanse = JU_DIGITATSTATE(CIndex, 2);
+            Word_t subexp  = expanse / cJU_BITSPERSUBEXPB;
+//
+// set the bit that is the current expanse
+            JU_JBB_BITMAP(&StageJBB, subexp) |= JU_BITPOSMASKB(expanse);
+#ifdef SUBEXPCOUNTS
+            StageJBB.jbb_subPop1[subexp] += Pop1;  // pop of subexpanse
+#endif
+// count number of expanses in each subexpanse
+            SubJPCount[subexp]++;
+
+// Save byte expanse of leaf
+            StageExp[ExpCnt] = JU_DIGITATSTATE(CIndex, 2);
+
+            if (Pop1 == 1)  // cJU_JPIMMED_1_01
+            {
+                Word_t DcdP0;
+                DcdP0 = (JU_JPDCDPOP0(Pjp) & cJU_DCDMASK(1)) |
+                        CIndex;
+#ifdef JUDY1
+                JU_JPSETADT(PjpJP, 0, DcdP0,
cJ1_JPIMMED_1_01); +#else // JUDYL + JU_JPSETADT(PjpJP, Pjv[Start], DcdP0, + cJL_JPIMMED_1_01); +#endif // JUDYL + } + else if (Pop1 <= cJU_IMMED1_MAXPOP1) // bigger + { +// cJL_JPIMMED_1_02..3: JudyL 32 +// cJ1_JPIMMED_1_02..7: Judy1 32 +// cJL_JPIMMED_1_02..7: JudyL 64 +// cJ1_JPIMMED_1_02..15: Judy1 64 +#ifdef JUDYL + Pjv_t PjvnewRaw; // value area of leaf. + Pjv_t Pjvnew; + +// Allocate Value area for Immediate Leaf + PjvnewRaw = j__udyLAllocJV(Pop1, Pjpm); + if (PjvnewRaw == (Pjv_t) NULL) + FREEALLEXIT(ExpCnt, StageJP, Pjpm); + + Pjvnew = P_JV(PjvnewRaw); + +// Copy to Values to Value Leaf + JU_COPYMEM(Pjvnew, Pjv + Start, Pop1); + PjpJP->jp_Addr = (Word_t) PjvnewRaw; + +// Copy to JP as an immediate Leaf + JU_COPYMEM(PjpJP->jp_LIndex, PLeaf + Start, + Pop1); +#else + JU_COPYMEM(PjpJP->jp_1Index, PLeaf + Start, + Pop1); +#endif +// Set Type, Population and Index size + PjpJP->jp_Type = cJU_JPIMMED_1_02 + Pop1 - 2; + } + +// 64Bit Judy1 does not have Leaf1: (note, unifdef disallows comment on next +// line) + +#if (! (defined(JUDY1) && defined(JU_64BIT))) + else if (Pop1 <= cJU_LEAF1_MAXPOP1) // still bigger + { +// cJU_JPLEAF1 + Word_t DcdP0; + Pjll_t PjllRaw; // pointer to new leaf. + Pjll_t Pjll; + JUDYLCODE(Pjv_t Pjvnew;) // value area of new leaf. + +// Get a new Leaf + PjllRaw = j__udyAllocJLL1(Pop1, Pjpm); + if (PjllRaw == (Pjll_t)NULL) + FREEALLEXIT(ExpCnt, StageJP, Pjpm); + + Pjll = P_JLL(PjllRaw); +#ifdef JUDYL +// Copy to Values to new Leaf + Pjvnew = JL_LEAF1VALUEAREA(Pjll, Pop1); + JU_COPYMEM(Pjvnew, Pjv + Start, Pop1); +#endif +// Copy Indexes to new Leaf + JU_COPYMEM((uint8_t *)Pjll, PLeaf+Start, Pop1); + + DBGCODE(JudyCheckSorted(Pjll, Pop1, 1);) + + DcdP0 = (JU_JPDCDPOP0(Pjp) & cJU_DCDMASK(2)) + | + (CIndex & cJU_DCDMASK(2-1)) + | + (Pop1 - 1); + + JU_JPSETADT(PjpJP, (Word_t)PjllRaw, DcdP0, + cJU_JPLEAF1); + } +#endif // (!(JUDY1 && JU_64BIT)) // Not 64Bit Judy1 + + else // biggest + { +// cJU_JPLEAF_B1 + Word_t DcdP0; + Pjlb_t PjlbRaw; + PjlbRaw = j__udyJLL2toJLB1( + PLeaf + Start, +#ifdef JUDYL + Pjv + Start, +#endif + Pop1, Pjpm); + if (PjlbRaw == (Pjlb_t)NULL) + FREEALLEXIT(ExpCnt, StageJP, Pjpm); + + DcdP0 = (JU_JPDCDPOP0(Pjp) & cJU_DCDMASK(2)) + | + (CIndex & cJU_DCDMASK(2-1)) + | + (Pop1 - 1); + + JU_JPSETADT(PjpJP, (Word_t)PjlbRaw, DcdP0, + cJU_JPLEAF_B1); + } + ExpCnt++; +// Done? + if (End == cJU_LEAF2_MAXPOP1) break; + +// New Expanse, Start and Count + CIndex = PLeaf[End]; + Start = End; + } + } + +// Now put all the Leaves below a BranchL or BranchB: + if (ExpCnt <= cJU_BRANCHLMAXJPS) // put the Leaves below a BranchL + { + if (j__udyCreateBranchL(Pjp, StageJP, StageExp, ExpCnt, + Pjpm) == -1) FREEALLEXIT(ExpCnt, StageJP, Pjpm); + + Pjp->jp_Type = cJU_JPBRANCH_L2; + } + else + { + if (j__udyStageJBBtoJBB(Pjp, &StageJBB, StageJP, SubJPCount, Pjpm) + == -1) FREEALLEXIT(ExpCnt, StageJP, Pjpm); + } + return(1); + +} // j__udyCascade2() + + +// **************************************************************************** +// __ J U D Y C A S C A D E 3 +// +// Return *Pjp as a (compressed) cJU_LEAF2, cJU_BRANCH_L3, cJU_BRANCH_B3. + +FUNCTION int j__udyCascade3( + Pjp_t Pjp, + Pvoid_t Pjpm) +{ + uint8_t * PLeaf; // pointer to leaf, explicit type. + Word_t End, Start; // temporaries. + Word_t ExpCnt; // count of expanses of splay. + Word_t CIndex; // current Index word. +JUDYLCODE(Pjv_t Pjv;) // value area of leaf. 
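+
+// Note (illustrative): the splay below groups the 3-byte indexes of a full
+// Leaf3 by their leading (level-3) digit.  For example, indexes 0x010203,
+// 0x010305 and 0x020000 have level-3 digits 0x01, 0x01 and 0x02, so they
+// splay into two expanses: one of population 2, whose 2-byte suffixes 0x0203
+// and 0x0305 go to an Immed or a Leaf2, and one of population 1, which
+// becomes a JPIMMED_2_01.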
+ +// Temp staging for parts(Leaves) of newly splayed leaf + jp_t StageJP [cJU_LEAF3_MAXPOP1]; // JPs of new leaves + Word_t StageA [cJU_LEAF3_MAXPOP1]; + uint8_t StageExp [cJU_LEAF3_MAXPOP1]; // Expanses of new leaves + uint8_t SubJPCount[cJU_NUMSUBEXPB]; // JPs in each subexpanse + jbb_t StageJBB; // staged bitmap branch + + assert(JU_JPTYPE(Pjp) == cJU_JPLEAF3); + assert((JU_JPDCDPOP0(Pjp) & 0xFFFFFF) == (cJU_LEAF3_MAXPOP1-1)); + +// Get the address of the Leaf + PLeaf = (uint8_t *) P_JLL(Pjp->jp_Addr); + +// Extract leaf to Word_t and insert-sort Index into it + j__udyCopy3toW(StageA, PLeaf, cJU_LEAF3_MAXPOP1); + +// Get the address of the Leaf and Value area + JUDYLCODE(Pjv = JL_LEAF3VALUEAREA(PLeaf, cJU_LEAF3_MAXPOP1);) + +// If Leaf is in 1 expanse -- just compress it (compare 1st, last & Index) + + CIndex = StageA[0]; + if (!JU_DIGITATSTATE(CIndex ^ StageA[cJU_LEAF3_MAXPOP1-1], 3)) + { + Word_t DcdP0; + Pjll_t PjllRaw; // pointer to new leaf. + Pjll_t Pjll; + JUDYLCODE(Pjv_t Pjvnew;) // value area of new leaf. + +// Alloc a 2 byte Index Leaf + PjllRaw = j__udyAllocJLL2(cJU_LEAF3_MAXPOP1, Pjpm); + if (PjllRaw == (Pjlb_t)NULL) return(-1); // out of memory + + Pjll = P_JLL(PjllRaw); + +// Copy just 2 bytes Indexes to new Leaf +// j__udyCopyWto2((uint16_t *) Pjll, StageA, cJU_LEAF3_MAXPOP1); + JU_COPYMEM ((uint16_t *) Pjll, StageA, cJU_LEAF3_MAXPOP1); +#ifdef JUDYL +// Copy Value area into new Leaf + Pjvnew = JL_LEAF2VALUEAREA(Pjll, cJU_LEAF3_MAXPOP1); + JU_COPYMEM(Pjvnew, Pjv, cJU_LEAF3_MAXPOP1); +#endif + DBGCODE(JudyCheckSorted(Pjll, cJU_LEAF3_MAXPOP1, 2);) + +// Form new JP, Pop0 field is unchanged +// Add in another Dcd byte because compressing + DcdP0 = (CIndex & cJU_DCDMASK(2)) | JU_JPDCDPOP0(Pjp); + + JU_JPSETADT(Pjp, (Word_t) PjllRaw, DcdP0, cJU_JPLEAF2); + + return(1); // Success + } + +// Else in 2+ expanses, splay Leaf into smaller leaves at higher compression + + StageJBB = StageJBBZero; // zero staged bitmap branch + ZEROJP(SubJPCount); + +// Splay the 3 byte index Leaf to 2 byte Index Leaves + for (ExpCnt = Start = 0, End = 1; ; End++) + { +// Check if new expanse or last one + if ( (End == cJU_LEAF3_MAXPOP1) + || + (JU_DIGITATSTATE(CIndex ^ StageA[End], 3)) + ) + { +// Build a leaf below the previous expanse + + Pjp_t PjpJP = StageJP + ExpCnt; + Word_t Pop1 = End - Start; + Word_t expanse = JU_DIGITATSTATE(CIndex, 3); + Word_t subexp = expanse / cJU_BITSPERSUBEXPB; +// +// set the bit that is the current expanse + JU_JBB_BITMAP(&StageJBB, subexp) |= JU_BITPOSMASKB(expanse); +#ifdef SUBEXPCOUNTS + StageJBB.jbb_subPop1[subexp] += Pop1; // pop of subexpanse +#endif +// count number of expanses in each subexpanse + SubJPCount[subexp]++; + +// Save byte expanse of leaf + StageExp[ExpCnt] = JU_DIGITATSTATE(CIndex, 3); + + if (Pop1 == 1) // cJU_JPIMMED_2_01 + { + Word_t DcdP0; + DcdP0 = (JU_JPDCDPOP0(Pjp) & cJU_DCDMASK(2)) | + CIndex; +#ifdef JUDY1 + JU_JPSETADT(PjpJP, 0, DcdP0, cJ1_JPIMMED_2_01); +#else // JUDYL + JU_JPSETADT(PjpJP, Pjv[Start], DcdP0, + cJL_JPIMMED_2_01); +#endif // JUDYL + } +#if (defined(JUDY1) || defined(JU_64BIT)) + else if (Pop1 <= cJU_IMMED2_MAXPOP1) + { +// cJ1_JPIMMED_2_02..3: Judy1 32 +// cJL_JPIMMED_2_02..3: JudyL 64 +// cJ1_JPIMMED_2_02..7: Judy1 64 +#ifdef JUDYL +// Alloc is 1st in case of malloc fail + Pjv_t PjvnewRaw; // value area of new leaf. 
+ Pjv_t Pjvnew; + +// Allocate Value area for Immediate Leaf + PjvnewRaw = j__udyLAllocJV(Pop1, Pjpm); + if (PjvnewRaw == (Pjv_t) NULL) + FREEALLEXIT(ExpCnt, StageJP, Pjpm); + + Pjvnew = P_JV(PjvnewRaw); + +// Copy to Values to Value Leaf + JU_COPYMEM(Pjvnew, Pjv + Start, Pop1); + + PjpJP->jp_Addr = (Word_t) PjvnewRaw; + +// Copy to Index to JP as an immediate Leaf + JU_COPYMEM((uint16_t *) (PjpJP->jp_LIndex), + StageA + Start, Pop1); +#else // JUDY1 + JU_COPYMEM((uint16_t *) (PjpJP->jp_1Index), + StageA + Start, Pop1); +#endif // JUDY1 +// Set Type, Population and Index size + PjpJP->jp_Type = cJU_JPIMMED_2_02 + Pop1 - 2; + } +#endif // (JUDY1 || JU_64BIT) + + else // Make a linear leaf2 + { +// cJU_JPLEAF2 + Word_t DcdP0; + Pjll_t PjllRaw; // pointer to new leaf. + Pjll_t Pjll; + JUDYLCODE(Pjv_t Pjvnew;) // value area of new leaf. + + PjllRaw = j__udyAllocJLL2(Pop1, Pjpm); + if (PjllRaw == (Pjll_t) NULL) + FREEALLEXIT(ExpCnt, StageJP, Pjpm); + + Pjll = P_JLL(PjllRaw); +#ifdef JUDYL +// Copy to Values to new Leaf + Pjvnew = JL_LEAF2VALUEAREA(Pjll, Pop1); + JU_COPYMEM(Pjvnew, Pjv + Start, Pop1); +#endif +// Copy least 2 bytes per Index of Leaf to new Leaf + JU_COPYMEM((uint16_t *) Pjll, StageA+Start, + Pop1); + + DBGCODE(JudyCheckSorted(Pjll, Pop1, 2);) + + DcdP0 = (JU_JPDCDPOP0(Pjp) & cJU_DCDMASK(3)) + | + (CIndex & cJU_DCDMASK(3-1)) + | + (Pop1 - 1); + + JU_JPSETADT(PjpJP, (Word_t)PjllRaw, DcdP0, + cJU_JPLEAF2); + } + ExpCnt++; +// Done? + if (End == cJU_LEAF3_MAXPOP1) break; + +// New Expanse, Start and Count + CIndex = StageA[End]; + Start = End; + } + } + +// Now put all the Leaves below a BranchL or BranchB: + if (ExpCnt <= cJU_BRANCHLMAXJPS) // put the Leaves below a BranchL + { + if (j__udyCreateBranchL(Pjp, StageJP, StageExp, ExpCnt, + Pjpm) == -1) FREEALLEXIT(ExpCnt, StageJP, Pjpm); + + Pjp->jp_Type = cJU_JPBRANCH_L3; + } + else + { + if (j__udyStageJBBtoJBB(Pjp, &StageJBB, StageJP, SubJPCount, Pjpm) + == -1) FREEALLEXIT(ExpCnt, StageJP, Pjpm); + } + return(1); + +} // j__udyCascade3() + + +#ifdef JU_64BIT // JudyCascade[4567] + +// **************************************************************************** +// __ J U D Y C A S C A D E 4 +// +// Cascade from a cJU_JPLEAF4 to one of the following: +// 1. if leaf is in 1 expanse: +// compress it into a JPLEAF3 +// 2. if leaf contains multiple expanses: +// create linear or bitmap branch containing +// each new expanse is either a: +// JPIMMED_3_01 branch +// JPIMMED_3_02 branch +// JPLEAF3 + +FUNCTION int j__udyCascade4( + Pjp_t Pjp, + Pvoid_t Pjpm) +{ + uint32_t * PLeaf; // pointer to leaf, explicit type. + Word_t End, Start; // temporaries. + Word_t ExpCnt; // count of expanses of splay. + Word_t CIndex; // current Index word. +JUDYLCODE(Pjv_t Pjv;) // value area of leaf. 
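+
+// Note (illustrative): when a Leaf4 lies in a single expanse, every index
+// shares its leading byte; e.g. indexes 0xAB112233 and 0xAB445566 both have
+// level-4 digit 0xAB, so the leaf compresses to 3-byte form below and 0xAB
+// moves into the Dcd bytes of the JP (DcdP0 |= CIndex & cJU_DCDMASK(3)).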
+ +// Temp staging for parts(Leaves) of newly splayed leaf + jp_t StageJP [cJU_LEAF4_MAXPOP1]; // JPs of new leaves + Word_t StageA [cJU_LEAF4_MAXPOP1]; + uint8_t StageExp [cJU_LEAF4_MAXPOP1]; // Expanses of new leaves + uint8_t SubJPCount[cJU_NUMSUBEXPB]; // JPs in each subexpanse + jbb_t StageJBB; // staged bitmap branch + + assert(JU_JPTYPE(Pjp) == cJU_JPLEAF4); + assert((JU_JPDCDPOP0(Pjp) & 0xFFFFFFFF) == (cJU_LEAF4_MAXPOP1-1)); + +// Get the address of the Leaf + PLeaf = (uint32_t *) P_JLL(Pjp->jp_Addr); + +// Extract 4 byte index Leaf to Word_t + j__udyCopy4toW(StageA, PLeaf, cJU_LEAF4_MAXPOP1); + +// Get the address of the Leaf and Value area + JUDYLCODE(Pjv = JL_LEAF4VALUEAREA(PLeaf, cJU_LEAF4_MAXPOP1);) + +// If Leaf is in 1 expanse -- just compress it (compare 1st, last & Index) + + CIndex = StageA[0]; + if (!JU_DIGITATSTATE(CIndex ^ StageA[cJU_LEAF4_MAXPOP1-1], 4)) + { + Word_t DcdP0; + Pjll_t PjllRaw; // pointer to new leaf. + Pjll_t Pjll; + JUDYLCODE(Pjv_t Pjvnew;) // value area of new Leaf. + +// Alloc a 3 byte Index Leaf + PjllRaw = j__udyAllocJLL3(cJU_LEAF4_MAXPOP1, Pjpm); + if (PjllRaw == (Pjlb_t)NULL) return(-1); // out of memory + + Pjll = P_JLL(PjllRaw); + +// Copy Index area into new Leaf + j__udyCopyWto3((uint8_t *) Pjll, StageA, cJU_LEAF4_MAXPOP1); +#ifdef JUDYL +// Copy Value area into new Leaf + Pjvnew = JL_LEAF3VALUEAREA(Pjll, cJU_LEAF4_MAXPOP1); + JU_COPYMEM(Pjvnew, Pjv, cJU_LEAF4_MAXPOP1); +#endif + DBGCODE(JudyCheckSorted(Pjll, cJU_LEAF4_MAXPOP1, 3);) + + DcdP0 = JU_JPDCDPOP0(Pjp) | (CIndex & cJU_DCDMASK(3)); + JU_JPSETADT(Pjp, (Word_t)PjllRaw, DcdP0, cJU_JPLEAF3); + + return(1); + } + +// Else in 2+ expanses, splay Leaf into smaller leaves at higher compression + + StageJBB = StageJBBZero; // zero staged bitmap branch + ZEROJP(SubJPCount); + +// Splay the 4 byte index Leaf to 3 byte Index Leaves + for (ExpCnt = Start = 0, End = 1; ; End++) + { +// Check if new expanse or last one + if ( (End == cJU_LEAF4_MAXPOP1) + || + (JU_DIGITATSTATE(CIndex ^ StageA[End], 4)) + ) + { +// Build a leaf below the previous expanse + + Pjp_t PjpJP = StageJP + ExpCnt; + Word_t Pop1 = End - Start; + Word_t expanse = JU_DIGITATSTATE(CIndex, 4); + Word_t subexp = expanse / cJU_BITSPERSUBEXPB; +// +// set the bit that is the current expanse + JU_JBB_BITMAP(&StageJBB, subexp) |= JU_BITPOSMASKB(expanse); +#ifdef SUBEXPCOUNTS + StageJBB.jbb_subPop1[subexp] += Pop1; // pop of subexpanse +#endif +// count number of expanses in each subexpanse + SubJPCount[subexp]++; + +// Save byte expanse of leaf + StageExp[ExpCnt] = JU_DIGITATSTATE(CIndex, 4); + + if (Pop1 == 1) // cJU_JPIMMED_3_01 + { + Word_t DcdP0; + DcdP0 = (JU_JPDCDPOP0(Pjp) & cJU_DCDMASK(3)) | + CIndex; +#ifdef JUDY1 + JU_JPSETADT(PjpJP, 0, DcdP0, cJ1_JPIMMED_3_01); +#else // JUDYL + JU_JPSETADT(PjpJP, Pjv[Start], DcdP0, + cJL_JPIMMED_3_01); +#endif // JUDYL + } + else if (Pop1 <= cJU_IMMED3_MAXPOP1) + { +// cJ1_JPIMMED_3_02 : Judy1 32 +// cJL_JPIMMED_3_02 : JudyL 64 +// cJ1_JPIMMED_3_02..5: Judy1 64 + +#ifdef JUDYL +// Alloc is 1st in case of malloc fail + Pjv_t PjvnewRaw; // value area of new leaf. 
+ Pjv_t Pjvnew; + +// Allocate Value area for Immediate Leaf + PjvnewRaw = j__udyLAllocJV(Pop1, Pjpm); + if (PjvnewRaw == (Pjv_t) NULL) + FREEALLEXIT(ExpCnt, StageJP, Pjpm); + + Pjvnew = P_JV(PjvnewRaw); + +// Copy to Values to Value Leaf + JU_COPYMEM(Pjvnew, Pjv + Start, Pop1); + PjpJP->jp_Addr = (Word_t) PjvnewRaw; + +// Copy to Index to JP as an immediate Leaf + j__udyCopyWto3(PjpJP->jp_LIndex, + StageA + Start, Pop1); +#else + j__udyCopyWto3(PjpJP->jp_1Index, + StageA + Start, Pop1); +#endif +// Set type, population and Index size + PjpJP->jp_Type = cJU_JPIMMED_3_02 + Pop1 - 2; + } + else + { +// cJU_JPLEAF3 + Word_t DcdP0; + Pjll_t PjllRaw; // pointer to new leaf. + Pjll_t Pjll; + JUDYLCODE(Pjv_t Pjvnew;) // value area of new leaf. + + PjllRaw = j__udyAllocJLL3(Pop1, Pjpm); + if (PjllRaw == (Pjll_t)NULL) + FREEALLEXIT(ExpCnt, StageJP, Pjpm); + + Pjll = P_JLL(PjllRaw); + +// Copy Indexes to new Leaf + j__udyCopyWto3((uint8_t *) Pjll, StageA + Start, + Pop1); +#ifdef JUDYL +// Copy to Values to new Leaf + Pjvnew = JL_LEAF3VALUEAREA(Pjll, Pop1); + JU_COPYMEM(Pjvnew, Pjv + Start, Pop1); +#endif + DBGCODE(JudyCheckSorted(Pjll, Pop1, 3);) + + DcdP0 = (JU_JPDCDPOP0(Pjp) & cJU_DCDMASK(4)) + | + (CIndex & cJU_DCDMASK(4-1)) + | + (Pop1 - 1); + + JU_JPSETADT(PjpJP, (Word_t)PjllRaw, DcdP0, + cJU_JPLEAF3); + } + ExpCnt++; +// Done? + if (End == cJU_LEAF4_MAXPOP1) break; + +// New Expanse, Start and Count + CIndex = StageA[End]; + Start = End; + } + } + +// Now put all the Leaves below a BranchL or BranchB: + if (ExpCnt <= cJU_BRANCHLMAXJPS) // put the Leaves below a BranchL + { + if (j__udyCreateBranchL(Pjp, StageJP, StageExp, ExpCnt, + Pjpm) == -1) FREEALLEXIT(ExpCnt, StageJP, Pjpm); + + Pjp->jp_Type = cJU_JPBRANCH_L4; + } + else + { + if (j__udyStageJBBtoJBB(Pjp, &StageJBB, StageJP, SubJPCount, Pjpm) + == -1) FREEALLEXIT(ExpCnt, StageJP, Pjpm); + } + return(1); + +} // j__udyCascade4() + + +// **************************************************************************** +// __ J U D Y C A S C A D E 5 +// +// Cascade from a cJU_JPLEAF5 to one of the following: +// 1. if leaf is in 1 expanse: +// compress it into a JPLEAF4 +// 2. if leaf contains multiple expanses: +// create linear or bitmap branch containing +// each new expanse is either a: +// JPIMMED_4_01 branch +// JPLEAF4 + +FUNCTION int j__udyCascade5( + Pjp_t Pjp, + Pvoid_t Pjpm) +{ + uint8_t * PLeaf; // pointer to leaf, explicit type. + Word_t End, Start; // temporaries. + Word_t ExpCnt; // count of expanses of splay. + Word_t CIndex; // current Index word. +JUDYLCODE(Pjv_t Pjv;) // value area of leaf. + +// Temp staging for parts(Leaves) of newly splayed leaf + jp_t StageJP [cJU_LEAF5_MAXPOP1]; // JPs of new leaves + Word_t StageA [cJU_LEAF5_MAXPOP1]; + uint8_t StageExp [cJU_LEAF5_MAXPOP1]; // Expanses of new leaves + uint8_t SubJPCount[cJU_NUMSUBEXPB]; // JPs in each subexpanse + jbb_t StageJBB; // staged bitmap branch + + assert(JU_JPTYPE(Pjp) == cJU_JPLEAF5); + assert((JU_JPDCDPOP0(Pjp) & 0xFFFFFFFFFF) == (cJU_LEAF5_MAXPOP1-1)); + +// Get the address of the Leaf + PLeaf = (uint8_t *) P_JLL(Pjp->jp_Addr); + +// Extract 5 byte index Leaf to Word_t + j__udyCopy5toW(StageA, PLeaf, cJU_LEAF5_MAXPOP1); + +// Get the address of the Leaf and Value area + JUDYLCODE(Pjv = JL_LEAF5VALUEAREA(PLeaf, cJU_LEAF5_MAXPOP1);) + +// If Leaf is in 1 expanse -- just compress it (compare 1st, last & Index) + + CIndex = StageA[0]; + if (!JU_DIGITATSTATE(CIndex ^ StageA[cJU_LEAF5_MAXPOP1-1], 5)) + { + Word_t DcdP0; + Pjll_t PjllRaw; // pointer to new leaf. 
+ Pjll_t Pjll; + JUDYLCODE(Pjv_t Pjvnew;) // value area of new leaf. + +// Alloc a 4 byte Index Leaf + PjllRaw = j__udyAllocJLL4(cJU_LEAF5_MAXPOP1, Pjpm); + if (PjllRaw == (Pjlb_t)NULL) return(-1); // out of memory + + Pjll = P_JLL(PjllRaw); + +// Copy Index area into new Leaf + j__udyCopyWto4((uint8_t *) Pjll, StageA, cJU_LEAF5_MAXPOP1); +#ifdef JUDYL +// Copy Value area into new Leaf + Pjvnew = JL_LEAF4VALUEAREA(Pjll, cJU_LEAF5_MAXPOP1); + JU_COPYMEM(Pjvnew, Pjv, cJU_LEAF5_MAXPOP1); +#endif + DBGCODE(JudyCheckSorted(Pjll, cJU_LEAF5_MAXPOP1, 4);) + + DcdP0 = JU_JPDCDPOP0(Pjp) | (CIndex & cJU_DCDMASK(4)); + JU_JPSETADT(Pjp, (Word_t)PjllRaw, DcdP0, cJU_JPLEAF4); + + return(1); + } + +// Else in 2+ expanses, splay Leaf into smaller leaves at higher compression + + StageJBB = StageJBBZero; // zero staged bitmap branch + ZEROJP(SubJPCount); + +// Splay the 5 byte index Leaf to 4 byte Index Leaves + for (ExpCnt = Start = 0, End = 1; ; End++) + { +// Check if new expanse or last one + if ( (End == cJU_LEAF5_MAXPOP1) + || + (JU_DIGITATSTATE(CIndex ^ StageA[End], 5)) + ) + { +// Build a leaf below the previous expanse + + Pjp_t PjpJP = StageJP + ExpCnt; + Word_t Pop1 = End - Start; + Word_t expanse = JU_DIGITATSTATE(CIndex, 5); + Word_t subexp = expanse / cJU_BITSPERSUBEXPB; +// +// set the bit that is the current expanse + JU_JBB_BITMAP(&StageJBB, subexp) |= JU_BITPOSMASKB(expanse); +#ifdef SUBEXPCOUNTS + StageJBB.jbb_subPop1[subexp] += Pop1; // pop of subexpanse +#endif +// count number of expanses in each subexpanse + SubJPCount[subexp]++; + +// Save byte expanse of leaf + StageExp[ExpCnt] = JU_DIGITATSTATE(CIndex, 5); + + if (Pop1 == 1) // cJU_JPIMMED_4_01 + { + Word_t DcdP0; + DcdP0 = (JU_JPDCDPOP0(Pjp) & cJU_DCDMASK(4)) | + CIndex; +#ifdef JUDY1 + JU_JPSETADT(PjpJP, 0, DcdP0, cJ1_JPIMMED_4_01); +#else // JUDYL + JU_JPSETADT(PjpJP, Pjv[Start], DcdP0, + cJL_JPIMMED_4_01); +#endif // JUDYL + } +#ifdef JUDY1 + else if (Pop1 <= cJ1_IMMED4_MAXPOP1) + { +// cJ1_JPIMMED_4_02..3: Judy1 64 + +// Copy to Index to JP as an immediate Leaf + j__udyCopyWto4(PjpJP->jp_1Index, + StageA + Start, Pop1); + +// Set pointer, type, population and Index size + PjpJP->jp_Type = cJ1_JPIMMED_4_02 + Pop1 - 2; + } +#endif + else + { +// cJU_JPLEAF4 + Word_t DcdP0; + Pjll_t PjllRaw; // pointer to new leaf. + Pjll_t Pjll; + JUDYLCODE(Pjv_t Pjvnew;) // value area of new leaf. + +// Get a new Leaf + PjllRaw = j__udyAllocJLL4(Pop1, Pjpm); + if (PjllRaw == (Pjll_t)NULL) + FREEALLEXIT(ExpCnt, StageJP, Pjpm); + + Pjll = P_JLL(PjllRaw); + +// Copy Indexes to new Leaf + j__udyCopyWto4((uint8_t *) Pjll, StageA + Start, + Pop1); +#ifdef JUDYL +// Copy to Values to new Leaf + Pjvnew = JL_LEAF4VALUEAREA(Pjll, Pop1); + JU_COPYMEM(Pjvnew, Pjv + Start, Pop1); +#endif + DBGCODE(JudyCheckSorted(Pjll, Pop1, 4);) + + DcdP0 = (JU_JPDCDPOP0(Pjp) & cJU_DCDMASK(5)) + | + (CIndex & cJU_DCDMASK(5-1)) + | + (Pop1 - 1); + + JU_JPSETADT(PjpJP, (Word_t)PjllRaw, DcdP0, + cJU_JPLEAF4); + } + ExpCnt++; +// Done? 
+ if (End == cJU_LEAF5_MAXPOP1) break; + +// New Expanse, Start and Count + CIndex = StageA[End]; + Start = End; + } + } + +// Now put all the Leaves below a BranchL or BranchB: + if (ExpCnt <= cJU_BRANCHLMAXJPS) // put the Leaves below a BranchL + { + if (j__udyCreateBranchL(Pjp, StageJP, StageExp, ExpCnt, + Pjpm) == -1) FREEALLEXIT(ExpCnt, StageJP, Pjpm); + + Pjp->jp_Type = cJU_JPBRANCH_L5; + } + else + { + if (j__udyStageJBBtoJBB(Pjp, &StageJBB, StageJP, SubJPCount, Pjpm) + == -1) FREEALLEXIT(ExpCnt, StageJP, Pjpm); + } + return(1); + +} // j__udyCascade5() + + +// **************************************************************************** +// __ J U D Y C A S C A D E 6 +// +// Cascade from a cJU_JPLEAF6 to one of the following: +// 1. if leaf is in 1 expanse: +// compress it into a JPLEAF5 +// 2. if leaf contains multiple expanses: +// create linear or bitmap branch containing +// each new expanse is either a: +// JPIMMED_5_01 ... JPIMMED_5_03 branch +// JPIMMED_5_01 branch +// JPLEAF5 + +FUNCTION int j__udyCascade6( + Pjp_t Pjp, + Pvoid_t Pjpm) +{ + uint8_t * PLeaf; // pointer to leaf, explicit type. + Word_t End, Start; // temporaries. + Word_t ExpCnt; // count of expanses of splay. + Word_t CIndex; // current Index word. +JUDYLCODE(Pjv_t Pjv;) // value area of leaf. + +// Temp staging for parts(Leaves) of newly splayed leaf + jp_t StageJP [cJU_LEAF6_MAXPOP1]; // JPs of new leaves + Word_t StageA [cJU_LEAF6_MAXPOP1]; + uint8_t StageExp [cJU_LEAF6_MAXPOP1]; // Expanses of new leaves + uint8_t SubJPCount[cJU_NUMSUBEXPB]; // JPs in each subexpanse + jbb_t StageJBB; // staged bitmap branch + + assert(JU_JPTYPE(Pjp) == cJU_JPLEAF6); + assert((JU_JPDCDPOP0(Pjp) & 0xFFFFFFFFFFFF) == (cJU_LEAF6_MAXPOP1-1)); + +// Get the address of the Leaf + PLeaf = (uint8_t *) P_JLL(Pjp->jp_Addr); + +// Extract 6 byte index Leaf to Word_t + j__udyCopy6toW(StageA, PLeaf, cJU_LEAF6_MAXPOP1); + +// Get the address of the Leaf and Value area + JUDYLCODE(Pjv = JL_LEAF6VALUEAREA(PLeaf, cJU_LEAF6_MAXPOP1);) + +// If Leaf is in 1 expanse -- just compress it (compare 1st, last & Index) + + CIndex = StageA[0]; + if (!JU_DIGITATSTATE(CIndex ^ StageA[cJU_LEAF6_MAXPOP1-1], 6)) + { + Word_t DcdP0; + Pjll_t PjllRaw; // pointer to new leaf. + Pjll_t Pjll; + JUDYLCODE(Pjv_t Pjvnew;) // value area of new leaf. 
+ +// Alloc a 5 byte Index Leaf + PjllRaw = j__udyAllocJLL5(cJU_LEAF6_MAXPOP1, Pjpm); + if (PjllRaw == (Pjlb_t)NULL) return(-1); // out of memory + + Pjll = P_JLL(PjllRaw); + +// Copy Index area into new Leaf + j__udyCopyWto5((uint8_t *) Pjll, StageA, cJU_LEAF6_MAXPOP1); +#ifdef JUDYL +// Copy Value area into new Leaf + Pjvnew = JL_LEAF5VALUEAREA(Pjll, cJU_LEAF6_MAXPOP1); + JU_COPYMEM(Pjvnew, Pjv, cJU_LEAF6_MAXPOP1); +#endif + DBGCODE(JudyCheckSorted(Pjll, cJU_LEAF6_MAXPOP1, 5);) + + DcdP0 = JU_JPDCDPOP0(Pjp) | (CIndex & cJU_DCDMASK(5)); + JU_JPSETADT(Pjp, (Word_t)PjllRaw, DcdP0, cJU_JPLEAF5); + + return(1); + } + +// Else in 2+ expanses, splay Leaf into smaller leaves at higher compression + + StageJBB = StageJBBZero; // zero staged bitmap branch + ZEROJP(SubJPCount); + +// Splay the 6 byte index Leaf to 5 byte Index Leaves + for (ExpCnt = Start = 0, End = 1; ; End++) + { +// Check if new expanse or last one + if ( (End == cJU_LEAF6_MAXPOP1) + || + (JU_DIGITATSTATE(CIndex ^ StageA[End], 6)) + ) + { +// Build a leaf below the previous expanse + + Pjp_t PjpJP = StageJP + ExpCnt; + Word_t Pop1 = End - Start; + Word_t expanse = JU_DIGITATSTATE(CIndex, 6); + Word_t subexp = expanse / cJU_BITSPERSUBEXPB; +// +// set the bit that is the current expanse + JU_JBB_BITMAP(&StageJBB, subexp) |= JU_BITPOSMASKB(expanse); +#ifdef SUBEXPCOUNTS + StageJBB.jbb_subPop1[subexp] += Pop1; // pop of subexpanse +#endif +// count number of expanses in each subexpanse + SubJPCount[subexp]++; + +// Save byte expanse of leaf + StageExp[ExpCnt] = JU_DIGITATSTATE(CIndex, 6); + + if (Pop1 == 1) // cJU_JPIMMED_5_01 + { + Word_t DcdP0; + DcdP0 = (JU_JPDCDPOP0(Pjp) & cJU_DCDMASK(5)) | + CIndex; +#ifdef JUDY1 + JU_JPSETADT(PjpJP, 0, DcdP0, cJ1_JPIMMED_5_01); +#else // JUDYL + JU_JPSETADT(PjpJP, Pjv[Start], DcdP0, + cJL_JPIMMED_5_01); +#endif // JUDYL + } +#ifdef JUDY1 + else if (Pop1 <= cJ1_IMMED5_MAXPOP1) + { +// cJ1_JPIMMED_5_02..3: Judy1 64 + +// Copy to Index to JP as an immediate Leaf + j__udyCopyWto5(PjpJP->jp_1Index, + StageA + Start, Pop1); + +// Set pointer, type, population and Index size + PjpJP->jp_Type = cJ1_JPIMMED_5_02 + Pop1 - 2; + } +#endif + else + { +// cJU_JPLEAF5 + Word_t DcdP0; + Pjll_t PjllRaw; // pointer to new leaf. + Pjll_t Pjll; + JUDYLCODE(Pjv_t Pjvnew;) // value area of new leaf. + +// Get a new Leaf + PjllRaw = j__udyAllocJLL5(Pop1, Pjpm); + if (PjllRaw == (Pjll_t)NULL) + FREEALLEXIT(ExpCnt, StageJP, Pjpm); + + Pjll = P_JLL(PjllRaw); + +// Copy Indexes to new Leaf + j__udyCopyWto5((uint8_t *) Pjll, StageA + Start, + Pop1); + +// Copy to Values to new Leaf +#ifdef JUDYL + Pjvnew = JL_LEAF5VALUEAREA(Pjll, Pop1); + JU_COPYMEM(Pjvnew, Pjv + Start, Pop1); +#endif + DBGCODE(JudyCheckSorted(Pjll, Pop1, 5);) + + DcdP0 = (JU_JPDCDPOP0(Pjp) & cJU_DCDMASK(6)) + | + (CIndex & cJU_DCDMASK(6-1)) + | + (Pop1 - 1); + + JU_JPSETADT(PjpJP, (Word_t)PjllRaw, DcdP0, + cJU_JPLEAF5); + } + ExpCnt++; +// Done? 
+ if (End == cJU_LEAF6_MAXPOP1) break; + +// New Expanse, Start and Count + CIndex = StageA[End]; + Start = End; + } + } + +// Now put all the Leaves below a BranchL or BranchB: + if (ExpCnt <= cJU_BRANCHLMAXJPS) // put the Leaves below a BranchL + { + if (j__udyCreateBranchL(Pjp, StageJP, StageExp, ExpCnt, + Pjpm) == -1) FREEALLEXIT(ExpCnt, StageJP, Pjpm); + + Pjp->jp_Type = cJU_JPBRANCH_L6; + } + else + { + if (j__udyStageJBBtoJBB(Pjp, &StageJBB, StageJP, SubJPCount, Pjpm) + == -1) FREEALLEXIT(ExpCnt, StageJP, Pjpm); + } + return(1); + +} // j__udyCascade6() + + +// **************************************************************************** +// __ J U D Y C A S C A D E 7 +// +// Cascade from a cJU_JPLEAF7 to one of the following: +// 1. if leaf is in 1 expanse: +// compress it into a JPLEAF6 +// 2. if leaf contains multiple expanses: +// create linear or bitmap branch containing +// each new expanse is either a: +// JPIMMED_6_01 ... JPIMMED_6_02 branch +// JPIMMED_6_01 branch +// JPLEAF6 + +FUNCTION int j__udyCascade7( + Pjp_t Pjp, + Pvoid_t Pjpm) +{ + uint8_t * PLeaf; // pointer to leaf, explicit type. + Word_t End, Start; // temporaries. + Word_t ExpCnt; // count of expanses of splay. + Word_t CIndex; // current Index word. +JUDYLCODE(Pjv_t Pjv;) // value area of leaf. + +// Temp staging for parts(Leaves) of newly splayed leaf + jp_t StageJP [cJU_LEAF7_MAXPOP1]; // JPs of new leaves + Word_t StageA [cJU_LEAF7_MAXPOP1]; + uint8_t StageExp [cJU_LEAF7_MAXPOP1]; // Expanses of new leaves + uint8_t SubJPCount[cJU_NUMSUBEXPB]; // JPs in each subexpanse + jbb_t StageJBB; // staged bitmap branch + + assert(JU_JPTYPE(Pjp) == cJU_JPLEAF7); + assert(JU_JPDCDPOP0(Pjp) == (cJU_LEAF7_MAXPOP1-1)); + +// Get the address of the Leaf + PLeaf = (uint8_t *) P_JLL(Pjp->jp_Addr); + +// Extract 7 byte index Leaf to Word_t + j__udyCopy7toW(StageA, PLeaf, cJU_LEAF7_MAXPOP1); + +// Get the address of the Leaf and Value area + JUDYLCODE(Pjv = JL_LEAF7VALUEAREA(PLeaf, cJU_LEAF7_MAXPOP1);) + +// If Leaf is in 1 expanse -- just compress it (compare 1st, last & Index) + + CIndex = StageA[0]; + if (!JU_DIGITATSTATE(CIndex ^ StageA[cJU_LEAF7_MAXPOP1-1], 7)) + { + Word_t DcdP0; + Pjll_t PjllRaw; // pointer to new leaf. + Pjll_t Pjll; + JUDYLCODE(Pjv_t Pjvnew;) // value area of new leaf. 
+ +// Alloc a 6 byte Index Leaf + PjllRaw = j__udyAllocJLL6(cJU_LEAF7_MAXPOP1, Pjpm); + if (PjllRaw == (Pjlb_t)NULL) return(-1); // out of memory + + Pjll = P_JLL(PjllRaw); + +// Copy Index area into new Leaf + j__udyCopyWto6((uint8_t *) Pjll, StageA, cJU_LEAF7_MAXPOP1); +#ifdef JUDYL +// Copy Value area into new Leaf + Pjvnew = JL_LEAF6VALUEAREA(Pjll, cJU_LEAF7_MAXPOP1); + JU_COPYMEM(Pjvnew, Pjv, cJU_LEAF7_MAXPOP1); +#endif + DBGCODE(JudyCheckSorted(Pjll, cJU_LEAF7_MAXPOP1, 6);) + + DcdP0 = JU_JPDCDPOP0(Pjp) | (CIndex & cJU_DCDMASK(6)); + JU_JPSETADT(Pjp, (Word_t)PjllRaw, DcdP0, cJU_JPLEAF6); + + return(1); + } + +// Else in 2+ expanses, splay Leaf into smaller leaves at higher compression + + StageJBB = StageJBBZero; // zero staged bitmap branch + ZEROJP(SubJPCount); + +// Splay the 7 byte index Leaf to 6 byte Index Leaves + for (ExpCnt = Start = 0, End = 1; ; End++) + { +// Check if new expanse or last one + if ( (End == cJU_LEAF7_MAXPOP1) + || + (JU_DIGITATSTATE(CIndex ^ StageA[End], 7)) + ) + { +// Build a leaf below the previous expanse + + Pjp_t PjpJP = StageJP + ExpCnt; + Word_t Pop1 = End - Start; + Word_t expanse = JU_DIGITATSTATE(CIndex, 7); + Word_t subexp = expanse / cJU_BITSPERSUBEXPB; +// +// set the bit that is the current expanse + JU_JBB_BITMAP(&StageJBB, subexp) |= JU_BITPOSMASKB(expanse); +#ifdef SUBEXPCOUNTS + StageJBB.jbb_subPop1[subexp] += Pop1; // pop of subexpanse +#endif +// count number of expanses in each subexpanse + SubJPCount[subexp]++; + +// Save byte expanse of leaf + StageExp[ExpCnt] = JU_DIGITATSTATE(CIndex, 7); + + if (Pop1 == 1) // cJU_JPIMMED_6_01 + { + Word_t DcdP0; + DcdP0 = (JU_JPDCDPOP0(Pjp) & cJU_DCDMASK(6)) | + CIndex; +#ifdef JUDY1 + JU_JPSETADT(PjpJP, 0, DcdP0, cJ1_JPIMMED_6_01); +#else // JUDYL + JU_JPSETADT(PjpJP, Pjv[Start], DcdP0, + cJL_JPIMMED_6_01); +#endif // JUDYL + } +#ifdef JUDY1 + else if (Pop1 == cJ1_IMMED6_MAXPOP1) + { +// cJ1_JPIMMED_6_02: Judy1 64 + +// Copy to Index to JP as an immediate Leaf + j__udyCopyWto6(PjpJP->jp_1Index, + StageA + Start, 2); + +// Set pointer, type, population and Index size + PjpJP->jp_Type = cJ1_JPIMMED_6_02; + } +#endif + else + { +// cJU_JPLEAF6 + Word_t DcdP0; + Pjll_t PjllRaw; // pointer to new leaf. + Pjll_t Pjll; + JUDYLCODE(Pjv_t Pjvnew;) // value area of new leaf. + +// Get a new Leaf + PjllRaw = j__udyAllocJLL6(Pop1, Pjpm); + if (PjllRaw == (Pjll_t)NULL) + FREEALLEXIT(ExpCnt, StageJP, Pjpm); + Pjll = P_JLL(PjllRaw); + +// Copy Indexes to new Leaf + j__udyCopyWto6((uint8_t *) Pjll, StageA + Start, + Pop1); +#ifdef JUDYL +// Copy to Values to new Leaf + Pjvnew = JL_LEAF6VALUEAREA(Pjll, Pop1); + JU_COPYMEM(Pjvnew, Pjv + Start, Pop1); +#endif + DBGCODE(JudyCheckSorted(Pjll, Pop1, 6);) + + DcdP0 = (JU_JPDCDPOP0(Pjp) & cJU_DCDMASK(7)) + | + (CIndex & cJU_DCDMASK(7-1)) + | + (Pop1 - 1); + + JU_JPSETADT(PjpJP, (Word_t)PjllRaw, DcdP0, + cJU_JPLEAF6); + } + ExpCnt++; +// Done? 
+ if (End == cJU_LEAF7_MAXPOP1) break; + +// New Expanse, Start and Count + CIndex = StageA[End]; + Start = End; + } + } + +// Now put all the Leaves below a BranchL or BranchB: + if (ExpCnt <= cJU_BRANCHLMAXJPS) // put the Leaves below a BranchL + { + if (j__udyCreateBranchL(Pjp, StageJP, StageExp, ExpCnt, + Pjpm) == -1) FREEALLEXIT(ExpCnt, StageJP, Pjpm); + + Pjp->jp_Type = cJU_JPBRANCH_L7; + } + else + { + if (j__udyStageJBBtoJBB(Pjp, &StageJBB, StageJP, SubJPCount, Pjpm) + == -1) FREEALLEXIT(ExpCnt, StageJP, Pjpm); + } + return(1); + +} // j__udyCascade7() + +#endif // JU_64BIT + + +// **************************************************************************** +// __ J U D Y C A S C A D E L +// +// (Compressed) cJU_LEAF3[7], cJ1_JPBRANCH_L. +// +// Cascade from a LEAFW (under Pjp) to one of the following: +// 1. if LEAFW is in 1 expanse: +// create linear branch with a JPLEAF3[7] under it +// 2. LEAFW contains multiple expanses: +// create linear or bitmap branch containing new expanses +// each new expanse is either a: 32 64 +// JPIMMED_3_01 branch Y N +// JPIMMED_7_01 branch N Y +// JPLEAF3 Y N +// JPLEAF7 N Y + +FUNCTION int j__udyCascadeL( + Pjp_t Pjp, + Pvoid_t Pjpm) +{ + Pjlw_t Pjlw; // leaf to work on. + Word_t End, Start; // temporaries. + Word_t ExpCnt; // count of expanses of splay. + Word_t CIndex; // current Index word. +JUDYLCODE(Pjv_t Pjv;) // value area of leaf. + +// Temp staging for parts(Leaves) of newly splayed leaf + jp_t StageJP [cJU_LEAFW_MAXPOP1]; + uint8_t StageExp[cJU_LEAFW_MAXPOP1]; + uint8_t SubJPCount[cJU_NUMSUBEXPB]; // JPs in each subexpanse + jbb_t StageJBB; // staged bitmap branch + +// Get the address of the Leaf + Pjlw = P_JLW(Pjp->jp_Addr); + + assert(Pjlw[0] == (cJU_LEAFW_MAXPOP1 - 1)); + +// Get pointer to Value area of old Leaf + JUDYLCODE(Pjv = JL_LEAFWVALUEAREA(Pjlw, cJU_LEAFW_MAXPOP1);) + + Pjlw++; // Now point to Index area + +// If Leaf is in 1 expanse -- first compress it (compare 1st, last & Index): + + CIndex = Pjlw[0]; // also used far below + if (!JU_DIGITATSTATE(CIndex ^ Pjlw[cJU_LEAFW_MAXPOP1 - 1], + cJU_ROOTSTATE)) + { + Pjll_t PjllRaw; // pointer to new leaf. + Pjll_t Pjll; + JUDYLCODE(Pjv_t Pjvnew;) // value area of new leaf. 
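+
+// Note (interpretive): unlike the lower cascades, a LEAFW has no parent JP
+// whose Dcd bytes could record the byte that becomes common after
+// compression, so even this single-expanse case builds a 1-element BranchL
+// whose expanse byte (StageExp[0]) carries that common leading digit.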
+ +// Get the common expanse to all elements in Leaf + StageExp[0] = JU_DIGITATSTATE(CIndex, cJU_ROOTSTATE); + +// Alloc a 3[7] byte Index Leaf +#ifdef JU_64BIT + PjllRaw = j__udyAllocJLL7(cJU_LEAFW_MAXPOP1, Pjpm); + if (PjllRaw == (Pjlb_t)NULL) return(-1); // out of memory + + Pjll = P_JLL(PjllRaw); + +// Copy LEAFW to a cJU_JPLEAF7 + j__udyCopyWto7((uint8_t *) Pjll, Pjlw, cJU_LEAFW_MAXPOP1); +#ifdef JUDYL +// Get the Value area of new Leaf + Pjvnew = JL_LEAF7VALUEAREA(Pjll, cJU_LEAFW_MAXPOP1); + JU_COPYMEM(Pjvnew, Pjv, cJU_LEAFW_MAXPOP1); +#endif + DBGCODE(JudyCheckSorted(Pjll, cJU_LEAFW_MAXPOP1, 7);) +#else // 32 Bit + PjllRaw = j__udyAllocJLL3(cJU_LEAFW_MAXPOP1, Pjpm); + if (PjllRaw == (Pjll_t) NULL) return(-1); + + Pjll = P_JLL(PjllRaw); + +// Copy LEAFW to a cJU_JPLEAF3 + j__udyCopyWto3((uint8_t *) Pjll, Pjlw, cJU_LEAFW_MAXPOP1); +#ifdef JUDYL +// Get the Value area of new Leaf + Pjvnew = JL_LEAF3VALUEAREA(Pjll, cJU_LEAFW_MAXPOP1); + JU_COPYMEM(Pjvnew, Pjv, cJU_LEAFW_MAXPOP1); +#endif + DBGCODE(JudyCheckSorted(Pjll, cJU_LEAFW_MAXPOP1, 3);) +#endif // 32 Bit + +// Following not needed because cJU_DCDMASK(3[7]) is == 0 +////// StageJP[0].jp_DcdPopO |= (CIndex & cJU_DCDMASK(3[7])); +#ifdef JU_64BIT + JU_JPSETADT(&(StageJP[0]), (Word_t)PjllRaw, cJU_LEAFW_MAXPOP1-1, + cJU_JPLEAF7); +#else // 32BIT + JU_JPSETADT(&(StageJP[0]), (Word_t)PjllRaw, cJU_LEAFW_MAXPOP1-1, + cJU_JPLEAF3); +#endif // 32BIT +// Create a 1 element Linear branch + if (j__udyCreateBranchL(Pjp, StageJP, StageExp, 1, Pjpm) == -1) + return(-1); + +// Change the type of callers JP + Pjp->jp_Type = cJU_JPBRANCH_L; + + return(1); + } + +// Else in 2+ expanses, splay Leaf into smaller leaves at higher compression + + StageJBB = StageJBBZero; // zero staged bitmap branch + ZEROJP(SubJPCount); + +// Splay the 4[8] byte Index Leaf to 3[7] byte Index Leaves + for (ExpCnt = Start = 0, End = 1; ; End++) + { +// Check if new expanse or last one + if ( (End == cJU_LEAFW_MAXPOP1) + || + (JU_DIGITATSTATE(CIndex ^ Pjlw[End], cJU_ROOTSTATE)) + ) + { +// Build a leaf below the previous expanse + + Pjp_t PjpJP = StageJP + ExpCnt; + Word_t Pop1 = End - Start; + Word_t expanse = JU_DIGITATSTATE(CIndex, cJU_ROOTSTATE); + Word_t subexp = expanse / cJU_BITSPERSUBEXPB; +// +// set the bit that is the current expanse + JU_JBB_BITMAP(&StageJBB, subexp) |= JU_BITPOSMASKB(expanse); +#ifdef SUBEXPCOUNTS + StageJBB.jbb_subPop1[subexp] += Pop1; // pop of subexpanse +#endif +// count number of expanses in each subexpanse + SubJPCount[subexp]++; + +// Save byte expanse of leaf + StageExp[ExpCnt] = JU_DIGITATSTATE(CIndex, + cJU_ROOTSTATE); + + if (Pop1 == 1) // cJU_JPIMMED_3[7]_01 + { +#ifdef JU_64BIT +#ifdef JUDY1 + JU_JPSETADT(PjpJP, 0, CIndex, cJ1_JPIMMED_7_01); +#else // JUDYL + JU_JPSETADT(PjpJP, Pjv[Start], CIndex, + cJL_JPIMMED_7_01); +#endif // JUDYL + +#else // JU_32BIT +#ifdef JUDY1 + JU_JPSETADT(PjpJP, 0, CIndex, cJ1_JPIMMED_3_01); +#else // JUDYL + JU_JPSETADT(PjpJP, Pjv[Start], CIndex, + cJL_JPIMMED_3_01); +#endif // JUDYL +#endif // JU_32BIT + } +#ifdef JUDY1 +#ifdef JU_64BIT + else if (Pop1 <= cJ1_IMMED7_MAXPOP1) +#else + else if (Pop1 <= cJ1_IMMED3_MAXPOP1) +#endif + { +// cJ1_JPIMMED_3_02 : Judy1 32 +// cJ1_JPIMMED_7_02 : Judy1 64 +// Copy to JP as an immediate Leaf +#ifdef JU_64BIT + j__udyCopyWto7(PjpJP->jp_1Index, Pjlw+Start, 2); + PjpJP->jp_Type = cJ1_JPIMMED_7_02; +#else + j__udyCopyWto3(PjpJP->jp_1Index, Pjlw+Start, 2); + PjpJP->jp_Type = cJ1_JPIMMED_3_02; +#endif // 32 Bit + } +#endif // JUDY1 + else // Linear Leaf JPLEAF3[7] + { 
+// cJU_JPLEAF3[7] + Pjll_t PjllRaw; // pointer to new leaf. + Pjll_t Pjll; + JUDYLCODE(Pjv_t Pjvnew;) // value area of new leaf. +#ifdef JU_64BIT + PjllRaw = j__udyAllocJLL7(Pop1, Pjpm); + if (PjllRaw == (Pjll_t) NULL) return(-1); + Pjll = P_JLL(PjllRaw); + + j__udyCopyWto7((uint8_t *) Pjll, Pjlw + Start, + Pop1); +#ifdef JUDYL + Pjvnew = JL_LEAF7VALUEAREA(Pjll, Pop1); + JU_COPYMEM(Pjvnew, Pjv + Start, Pop1); +#endif // JUDYL + DBGCODE(JudyCheckSorted(Pjll, Pop1, 7);) +#else // JU_64BIT - 32 Bit + PjllRaw = j__udyAllocJLL3(Pop1, Pjpm); + if (PjllRaw == (Pjll_t) NULL) return(-1); + Pjll = P_JLL(PjllRaw); + + j__udyCopyWto3((uint8_t *) Pjll, Pjlw + Start, + Pop1); +#ifdef JUDYL + Pjvnew = JL_LEAF3VALUEAREA(Pjll, Pop1); + JU_COPYMEM(Pjvnew, Pjv + Start, Pop1); +#endif // JUDYL + DBGCODE(JudyCheckSorted(Pjll, Pop1, 3);) +#endif // 32 Bit + +#ifdef JU_64BIT + JU_JPSETADT(PjpJP, (Word_t)PjllRaw, Pop1 - 1, + cJU_JPLEAF7); +#else // JU_64BIT - 32 Bit + JU_JPSETADT(PjpJP, (Word_t)PjllRaw, Pop1 - 1, + cJU_JPLEAF3); +#endif // 32 Bit + } + ExpCnt++; +// Done? + if (End == cJU_LEAFW_MAXPOP1) break; + +// New Expanse, Start and Count + CIndex = Pjlw[End]; + Start = End; + } + } + +// Now put all the Leaves below a BranchL or BranchB: + if (ExpCnt <= cJU_BRANCHLMAXJPS) // put the Leaves below a BranchL + { + if (j__udyCreateBranchL(Pjp, StageJP, StageExp, ExpCnt, + Pjpm) == -1) FREEALLEXIT(ExpCnt, StageJP, Pjpm); + + Pjp->jp_Type = cJU_JPBRANCH_L; + } + else + { + if (j__udyStageJBBtoJBB(Pjp, &StageJBB, StageJP, SubJPCount, Pjpm) + == -1) FREEALLEXIT(ExpCnt, StageJP, Pjpm); + + Pjp->jp_Type = cJU_JPBRANCH_B; // cJU_LEAFW is out of sequence + } + return(1); + +} // j__udyCascadeL() diff --git a/src/libnetdata/libjudy/src/JudyL/JudyLCount.c b/src/libnetdata/libjudy/src/JudyL/JudyLCount.c new file mode 100644 index 00000000..179757f0 --- /dev/null +++ b/src/libnetdata/libjudy/src/JudyL/JudyLCount.c @@ -0,0 +1,1195 @@ +// Copyright (C) 2000 - 2002 Hewlett-Packard Company +// +// This program is free software; you can redistribute it and/or modify it +// under the term of the GNU Lesser General Public License as published by the +// Free Software Foundation; either version 2 of the License, or (at your +// option) any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License +// for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this program; if not, write to the Free Software Foundation, +// Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// _________________ + +// @(#) $Revision: 4.78 $ $Source: /judy/src/JudyCommon/JudyCount.c $ +// +// Judy*Count() function for Judy1 and JudyL. +// Compile with one of -DJUDY1 or -DJUDYL. +// +// Compile with -DNOSMARTJBB, -DNOSMARTJBU, and/or -DNOSMARTJLB to build a +// version with cache line optimizations deleted, for testing. +// +// Compile with -DSMARTMETRICS to obtain global variables containing smart +// cache line metrics. Note: Dont turn this on simultaneously for this file +// and JudyByCount.c because they export the same globals. +// +// Judy*Count() returns the "count of Indexes" (inclusive) between the two +// specified limits (Indexes). This code is remarkably fast. It traverses the +// "Judy array" data structure. +// +// This count code is the GENERIC untuned version (minimum code size). 
It might be possible to tune it for a specific architecture to be faster.
+// However, in real applications, with a modern machine, it is expected that
+// the instruction times will be swamped by cache line fills.
+// ****************************************************************************
+
+#if (! (defined(JUDY1) || defined(JUDYL)))
+#error: One of -DJUDY1 or -DJUDYL must be specified.
+#endif
+
+#ifdef JUDY1
+#include "Judy1.h"
+#else
+#include "JudyL.h"
+#endif
+
+#include "JudyPrivate1L.h"
+
+
+// Define a phoney JP type for the root-level LEAFW:
+
+#define cJU_LEAFW cJU_JPIMMED_CAP
+
+// Avoid duplicate symbols since this file is multi-compiled:
+
+#ifdef SMARTMETRICS
+#ifdef JUDY1
+Word_t jbb_upward = 0;  // counts of directions taken:
+Word_t jbb_downward = 0;
+Word_t jbu_upward = 0;
+Word_t jbu_downward = 0;
+Word_t jlb_upward = 0;
+Word_t jlb_downward = 0;
+#else
+extern Word_t jbb_upward;
+extern Word_t jbb_downward;
+extern Word_t jbu_upward;
+extern Word_t jbu_downward;
+extern Word_t jlb_upward;
+extern Word_t jlb_downward;
+#endif
+#endif
+
+
+// FORWARD DECLARATIONS (prototypes):
+
+static Word_t j__udy1LCountSM(const Pjp_t Pjp, const Word_t Index,
+                              const Pjpm_t Pjpm);
+
+// Each of Judy1 and JudyL gets its own private (static) version of this
+// function:
+
+static int j__udyCountLeafB1(const Pjll_t Pjll, const Word_t Pop1,
+                             const Word_t Index);
+
+// These functions are not static because they are exported to Judy*ByCount():
+//
+// TBD: Should they be made static for performance reasons, and thus
+// duplicated?
+//
+// Note: There really are two different functions, but for convenience they
+// are referred to here with a generic name.
+
+#ifdef JUDY1
+#define j__udyJPPop1 j__udy1JPPop1
+#else
+#define j__udyJPPop1 j__udyLJPPop1
+#endif
+
+Word_t j__udyJPPop1(const Pjp_t Pjp);
+
+
+// LOCAL ERROR HANDLING:
+//
+// The Judy*Count() functions are unusual because they return 0 instead of JERR
+// for an error.  In this source file, define C_JERR for clarity.
+
+#define C_JERR 0
+
+
+// ****************************************************************************
+// J U D Y 1 C O U N T
+// J U D Y L C O U N T
+//
+// See the manual entry for details.
+//
+// This code is written recursively, at least at first, because thats much
+// simpler; hope its fast enough.
+
+#ifdef JUDY1
+FUNCTION Word_t Judy1Count
+#else
+FUNCTION Word_t JudyLCount
+#endif
+        (
+        Pcvoid_t  PArray,   // JRP to first branch/leaf in SM.
+        Word_t    Index1,   // starting Index.
+        Word_t    Index2,   // ending Index.
+        PJError_t PJError   // optional, for returning error info.
+        )
+{
+    jpm_t    fakejpm;     // local temporary for small arrays.
+    Pjpm_t   Pjpm;        // top JPM or local temporary for error info.
+    jp_t     fakejp;      // constructed for calling j__udy1LCountSM().
+    Pjp_t    Pjp;         // JP to pass to j__udy1LCountSM().
+    Word_t   pop1;        // total for the array.
+    Word_t   pop1above1;  // indexes at or above Index1, inclusive.
+    Word_t   pop1above2;  // indexes at or above Index2, exclusive.
+    int      retcode;     // from Judy*First() calls.
+JUDYLCODE(PPvoid_t PPvalue);  // from JudyLFirst() calls.
+
+
+// CHECK FOR SHORTCUTS:
+//
+// As documented, return C_JERR if the Judy array is empty or Index1 > Index2.
+
+    if ((PArray == (Pvoid_t) NULL) || (Index1 > Index2))
+    {
+        JU_SET_ERRNO(PJError, JU_ERRNO_NONE);
+        return(C_JERR);
+    }
+
+// If Index1 == Index2, simply check if the specified Index is set; pass
+// through the return value from Judy1Test() or JudyLGet() with appropriate
+// translations.
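+
+// Illustrative example: if only indexes 10 and 20 are set in a Judy1 array,
+// Judy1Count(PArray, 0, -1, &JError) returns 2,
+// Judy1Count(PArray, 11, 20, &JError) returns 1, and
+// Judy1Count(PArray, 11, 19, &JError) returns 0 with JU_ERRNO_NONE, so a 0
+// return with JU_ERRNO_NONE means an empty range rather than a real error.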
+ + if (Index1 == Index2) + { +#ifdef JUDY1 + retcode = Judy1Test(PArray, Index1, PJError); + + if (retcode == JERRI) return(C_JERR); // pass through error. + + if (retcode == 0) + { + JU_SET_ERRNO(PJError, JU_ERRNO_NONE); + return(C_JERR); + } +#else + PPvalue = JudyLGet(PArray, Index1, PJError); + + if (PPvalue == PPJERR) return(C_JERR); // pass through error. + + if (PPvalue == (PPvoid_t) NULL) // Index is not set. + { + JU_SET_ERRNO(PJError, JU_ERRNO_NONE); + return(C_JERR); + } +#endif + return(1); // single index is set. + } + + +// CHECK JRP TYPE: +// +// Use an if/then for speed rather than a switch, and put the most common cases +// first. +// +// Note: Since even cJU_LEAFW types require counting between two Indexes, +// prepare them here for common code below that calls j__udy1LCountSM(), rather +// than handling them even more specially here. + + if (JU_LEAFW_POP0(PArray) < cJU_LEAFW_MAXPOP1) // must be a LEAFW + { + Pjlw_t Pjlw = P_JLW(PArray); // first word of leaf. + Pjpm = & fakejpm; + Pjp = & fakejp; + Pjp->jp_Addr = (Word_t) Pjlw; + Pjp->jp_Type = cJU_LEAFW; + Pjpm->jpm_Pop0 = Pjlw[0]; // from first word of leaf. + pop1 = Pjpm->jpm_Pop0 + 1; + } + else + { + Pjpm = P_JPM(PArray); + Pjp = &(Pjpm->jpm_JP); + pop1 = (Pjpm->jpm_Pop0) + 1; // note: can roll over to 0. + +#if (defined(JUDY1) && (! defined(JU_64BIT))) + if (pop1 == 0) // rare special case of full array: + { + Word_t count = Index2 - Index1 + 1; // can roll over again. + + if (count == 0) + { + JU_SET_ERRNO(PJError, JU_ERRNO_FULL); + return(C_JERR); + } + return(count); + } +#else + assert(pop1); // JudyL or 64-bit cannot create a full array! +#endif + } + + +// COUNT POP1 ABOVE INDEX1, INCLUSIVE: + + assert(pop1); // just to be safe. + + if (Index1 == 0) // shortcut, pop1above1 is entire population: + { + pop1above1 = pop1; + } + else // find first valid Index above Index1, if any: + { +#ifdef JUDY1 + if ((retcode = Judy1First(PArray, & Index1, PJError)) == JERRI) + return(C_JERR); // pass through error. +#else + if ((PPvalue = JudyLFirst(PArray, & Index1, PJError)) == PPJERR) + return(C_JERR); // pass through error. + + retcode = (PPvalue != (PPvoid_t) NULL); // found a next Index. +#endif + +// If theres no Index at or above Index1, just return C_JERR (early exit): + + if (retcode == 0) + { + JU_SET_ERRNO(PJError, JU_ERRNO_NONE); + return(C_JERR); + } + +// If a first/next Index was found, call the counting motor starting with that +// known valid Index, meaning the return should be positive, not C_JERR except +// in case of a real error: + + if ((pop1above1 = j__udy1LCountSM(Pjp, Index1, Pjpm)) == C_JERR) + { + JU_COPY_ERRNO(PJError, Pjpm); // pass through error. + return(C_JERR); + } + } + + +// COUNT POP1 ABOVE INDEX2, EXCLUSIVE, AND RETURN THE DIFFERENCE: +// +// In principle, calculate the ordinal of each Index and take the difference, +// with caution about off-by-one errors due to the specified Indexes being set +// or unset. In practice: +// +// - The ordinals computed here are inverse ordinals, that is, the populations +// ABOVE the specified Indexes (Index1 inclusive, Index2 exclusive), so +// subtract pop1above2 from pop1above1, rather than vice-versa. +// +// - Index1s result already includes a count for Index1 and/or Index2 if +// either is set, so calculate pop1above2 exclusive of Index2. 
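+//
+// For example, with indexes {3, 5, 9} stored: for Index1 = 4 and Index2 = 9,
+// pop1above1 counts {5, 9} = 2 (at or above Index1, inclusive); Index2 is
+// first incremented to 10, so pop1above2 counts {} = 0 (above Index2,
+// exclusive); and the returned count is 2 - 0 = 2, the indexes within [4, 9].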
+// +// TBD: If Index1 and Index2 fall in the same expanse in the top-state +// branch(es), would it be faster to walk the SM only once, to their divergence +// point, before calling j__udy1LCountSM() or equivalent? Possibly a non-issue +// if a top-state pop1 becomes stored with each Judy1 array. Also, consider +// whether the first call of j__udy1LCountSM() fills the cache, for common tree +// branches, for the second call. +// +// As for pop1above1, look for shortcuts for special cases when pop1above2 is +// zero. Otherwise call the counting "motor". + + assert(pop1above1); // just to be safe. + + if (Index2++ == cJU_ALLONES) return(pop1above1); // Index2 at limit. + +#ifdef JUDY1 + if ((retcode = Judy1First(PArray, & Index2, PJError)) == JERRI) + return(C_JERR); +#else + if ((PPvalue = JudyLFirst(PArray, & Index2, PJError)) == PPJERR) + return(C_JERR); + + retcode = (PPvalue != (PPvoid_t) NULL); // found a next Index. +#endif + if (retcode == 0) return(pop1above1); // no Index above Index2. + +// Just as for Index1, j__udy1LCountSM() cannot return 0 (locally == C_JERR) +// except in case of a real error: + + if ((pop1above2 = j__udy1LCountSM(Pjp, Index2, Pjpm)) == C_JERR) + { + JU_COPY_ERRNO(PJError, Pjpm); // pass through error. + return(C_JERR); + } + + if (pop1above1 == pop1above2) + { + JU_SET_ERRNO(PJError, JU_ERRNO_NONE); + return(C_JERR); + } + + return(pop1above1 - pop1above2); + +} // Judy1Count() / JudyLCount() + + +// **************************************************************************** +// __ J U D Y 1 L C O U N T S M +// +// Given a pointer to a JP (with invalid jp_DcdPopO at cJU_ROOTSTATE), a known +// valid Index, and a Pjpm for returning error info, recursively visit a Judy +// array state machine (SM) and return the count of Indexes, including Index, +// through the end of the Judy array at this state or below. In case of error +// or a count of 0 (should never happen), return C_JERR with appropriate +// JU_ERRNO in the Pjpm. +// +// Note: This function is not told the current state because its encoded in +// the JP Type. +// +// Method: To minimize cache line fills, while studying each branch, if Index +// resides above the midpoint of the branch (which often consists of multiple +// cache lines), ADD the populations at or above Index; otherwise, SUBTRACT +// from the population of the WHOLE branch (available from the JP) the +// populations at or above Index. This is especially tricky for bitmap +// branches. +// +// Note: Unlike, say, the Ins and Del walk routines, this function returns the +// same type of returns as Judy*Count(), so it can use *_SET_ERRNO*() macros +// the same way. + +FUNCTION static Word_t j__udy1LCountSM( +const Pjp_t Pjp, // top of Judy (sub)SM. +const Word_t Index, // count at or above this Index. +const Pjpm_t Pjpm) // for returning error info. +{ + Pjbl_t Pjbl; // Pjp->jp_Addr masked and cast to types: + Pjbb_t Pjbb; + Pjbu_t Pjbu; + Pjll_t Pjll; // a Judy lower-level linear leaf. + + Word_t digit; // next digit to decode from Index. + long jpnum; // JP number in a branch (base 0). + int offset; // index ordinal within a leaf, base 0. + Word_t pop1; // total population of an expanse. + Word_t pop1above; // to return. + +// Common code to check Decode bits in a JP against the equivalent portion of +// Index; XOR together, then mask bits of interest; must be all 0: +// +// Note: Why does this code only assert() compliance rather than actively +// checking for outliers? 
Its because Index is supposed to be valid, hence +// always match any Dcd bits traversed. +// +// Note: This assertion turns out to be always true for cState = 3 on 32-bit +// and 7 on 64-bit, but its harmless, probably removed by the compiler. + +#define CHECKDCD(Pjp,cState) \ + assert(! JU_DCDNOTMATCHINDEX(Index, Pjp, cState)) + +// Common code to prepare to handle a root-level or lower-level branch: +// Extract a state-dependent digit from Index in a "constant" way, obtain the +// total population for the branch in a state-dependent way, and then branch to +// common code for multiple cases: +// +// For root-level branches, the state is always cJU_ROOTSTATE, and the +// population is received in Pjpm->jpm_Pop0. +// +// Note: The total population is only needed in cases where the common code +// "counts up" instead of down to minimize cache line fills. However, its +// available cheaply, and its better to do it with a constant shift (constant +// state value) instead of a variable shift later "when needed". + +#define PREPB_ROOT(Pjp,Next) \ + digit = JU_DIGITATSTATE(Index, cJU_ROOTSTATE); \ + pop1 = (Pjpm->jpm_Pop0) + 1; \ + goto Next + +#define PREPB(Pjp,cState,Next) \ + digit = JU_DIGITATSTATE(Index, cState); \ + pop1 = JU_JPBRANCH_POP0(Pjp, (cState)) + 1; \ + goto Next + + +// SWITCH ON JP TYPE: +// +// WARNING: For run-time efficiency the following cases replicate code with +// varying constants, rather than using common code with variable values! + + switch (JU_JPTYPE(Pjp)) + { + + +// ---------------------------------------------------------------------------- +// ROOT-STATE LEAF that starts with a Pop0 word; just count within the leaf: + + case cJU_LEAFW: + { + Pjlw_t Pjlw = P_JLW(Pjp->jp_Addr); // first word of leaf. + + assert((Pjpm->jpm_Pop0) + 1 == Pjlw[0] + 1); // sent correctly. + offset = j__udySearchLeafW(Pjlw + 1, Pjpm->jpm_Pop0 + 1, Index); + assert(offset >= 0); // Index must exist. + assert(offset < (Pjpm->jpm_Pop0) + 1); // Index be in range. + return((Pjpm->jpm_Pop0) + 1 - offset); // INCLUSIVE of Index. + } + +// ---------------------------------------------------------------------------- +// LINEAR BRANCH; count populations in JPs in the JBL ABOVE the next digit in +// Index, and recurse for the next digit in Index: +// +// Note: There are no null JPs in a JBL; watch out for pop1 == 0. +// +// Note: A JBL should always fit in one cache line => no need to count up +// versus down to save cache line fills. (PREPB() sets pop1 for no reason.) + + case cJU_JPBRANCH_L2: CHECKDCD(Pjp, 2); PREPB(Pjp, 2, BranchL); + case cJU_JPBRANCH_L3: CHECKDCD(Pjp, 3); PREPB(Pjp, 3, BranchL); + +#ifdef JU_64BIT + case cJU_JPBRANCH_L4: CHECKDCD(Pjp, 4); PREPB(Pjp, 4, BranchL); + case cJU_JPBRANCH_L5: CHECKDCD(Pjp, 5); PREPB(Pjp, 5, BranchL); + case cJU_JPBRANCH_L6: CHECKDCD(Pjp, 6); PREPB(Pjp, 6, BranchL); + case cJU_JPBRANCH_L7: CHECKDCD(Pjp, 7); PREPB(Pjp, 7, BranchL); +#endif + case cJU_JPBRANCH_L: PREPB_ROOT(Pjp, BranchL); + +// Common code (state-independent) for all cases of linear branches: + +BranchL: + + Pjbl = P_JBL(Pjp->jp_Addr); + jpnum = Pjbl->jbl_NumJPs; // above last JP. + pop1above = 0; + + while (digit < (Pjbl->jbl_Expanse[--jpnum])) // still ABOVE digit. + { + if ((pop1 = j__udyJPPop1((Pjbl->jbl_jp) + jpnum)) == cJU_ALLONES) + { + JU_SET_ERRNO_NONNULL(Pjpm, JU_ERRNO_CORRUPT); + return(C_JERR); + } + + pop1above += pop1; + assert(jpnum > 0); // should find digit. + } + + assert(digit == (Pjbl->jbl_Expanse[jpnum])); // should find digit. 
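+
+// Illustrative: if this branch holds expanses {0x10, 0x20, 0x30} with pop1s
+// {4, 5, 6} and Index decodes to digit 0x20, the loop above accumulated only
+// the 6 indexes under expanse 0x30; the recursion below adds how many of the
+// 5 indexes under 0x20 are at or above Index itself.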
+ + pop1 = j__udy1LCountSM((Pjbl->jbl_jp) + jpnum, Index, Pjpm); + if (pop1 == C_JERR) return(C_JERR); // pass error up. + + assert(pop1above + pop1); + return(pop1above + pop1); + + +// ---------------------------------------------------------------------------- +// BITMAP BRANCH; count populations in JPs in the JBB ABOVE the next digit in +// Index, and recurse for the next digit in Index: +// +// Note: There are no null JPs in a JBB; watch out for pop1 == 0. + + case cJU_JPBRANCH_B2: CHECKDCD(Pjp, 2); PREPB(Pjp, 2, BranchB); + case cJU_JPBRANCH_B3: CHECKDCD(Pjp, 3); PREPB(Pjp, 3, BranchB); +#ifdef JU_64BIT + case cJU_JPBRANCH_B4: CHECKDCD(Pjp, 4); PREPB(Pjp, 4, BranchB); + case cJU_JPBRANCH_B5: CHECKDCD(Pjp, 5); PREPB(Pjp, 5, BranchB); + case cJU_JPBRANCH_B6: CHECKDCD(Pjp, 6); PREPB(Pjp, 6, BranchB); + case cJU_JPBRANCH_B7: CHECKDCD(Pjp, 7); PREPB(Pjp, 7, BranchB); +#endif + case cJU_JPBRANCH_B: PREPB_ROOT(Pjp, BranchB); + +// Common code (state-independent) for all cases of bitmap branches: + +BranchB: + { + long subexp; // for stepping through layer 1 (subexpanses). + long findsub; // subexpanse containing Index (digit). + Word_t findbit; // bit representing Index (digit). + Word_t lowermask; // bits for indexes at or below Index. + Word_t jpcount; // JPs in a subexpanse. + Word_t clbelow; // cache lines below digits cache line. + Word_t clabove; // cache lines above digits cache line. + + Pjbb = P_JBB(Pjp->jp_Addr); + findsub = digit / cJU_BITSPERSUBEXPB; + findbit = digit % cJU_BITSPERSUBEXPB; + lowermask = JU_MASKLOWERINC(JU_BITPOSMASKB(findbit)); + clbelow = clabove = 0; // initial/default => always downward. + + assert(JU_BITMAPTESTB(Pjbb, digit)); // digit must have a JP. + assert(findsub < cJU_NUMSUBEXPB); // falls in expected range. + +// Shorthand for one subexpanse in a bitmap and for one JP in a bitmap branch: +// +// Note: BMPJP0 exists separately to support assertions. + +#define BMPJP0(Subexp) (P_JP(JU_JBB_PJP(Pjbb, Subexp))) +#define BMPJP(Subexp,JPnum) (BMPJP0(Subexp) + (JPnum)) + +#ifndef NOSMARTJBB // enable to turn off smart code for comparison purposes. + +// FIGURE OUT WHICH DIRECTION CAUSES FEWER CACHE LINE FILLS; adding the pop1s +// in JPs above Indexs JP, or subtracting the pop1s in JPs below Indexs JP. +// +// This is tricky because, while each set bit in the bitmap represents a JP, +// the JPs are scattered over cJU_NUMSUBEXPB subexpanses, each of which can +// contain JPs packed into multiple cache lines, and this code must visit every +// JP either BELOW or ABOVE the JP for Index. +// +// Number of cache lines required to hold a linear list of the given number of +// JPs, assuming the first JP is at the start of a cache line or the JPs in +// jpcount fit wholly within a single cache line, which is ensured by +// JudyMalloc(): + +#define CLPERJPS(jpcount) \ + ((((jpcount) * cJU_WORDSPERJP) + cJU_WORDSPERCL - 1) / cJU_WORDSPERCL) + +// Count cache lines below/above for each subexpanse: + + for (subexp = 0; subexp < cJU_NUMSUBEXPB; ++subexp) + { + jpcount = j__udyCountBitsB(JU_JBB_BITMAP(Pjbb, subexp)); + +// When at the subexpanse containing Index (digit), add cache lines +// below/above appropriately, excluding the cache line containing the JP for +// Index itself: + + if (subexp < findsub) clbelow += CLPERJPS(jpcount); + else if (subexp > findsub) clabove += CLPERJPS(jpcount); + else // (subexp == findsub) + { + Word_t clfind; // cache line containing Index (digit). 
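+
+// Worked example of CLPERJPS() (hypothetical sizes): with cJU_WORDSPERJP == 2
+// and cJU_WORDSPERCL == 16, CLPERJPS(5) == ((5 * 2) + 15) / 16 == 1 cache
+// line, and CLPERJPS(9) == 2.  clfind, computed next, is the ordinal of the
+// cache line (within this subexpanses JP array) that holds digits JP.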
+ + clfind = CLPERJPS(j__udyCountBitsB( + JU_JBB_BITMAP(Pjbb, subexp) & lowermask)); + + assert(clfind > 0); // digit itself should have 1 CL. + clbelow += clfind - 1; + clabove += CLPERJPS(jpcount) - clfind; + } + } +#endif // ! NOSMARTJBB + +// Note: Its impossible to get through the following "if" without setting +// jpnum -- see some of the assertions below -- but gcc -Wall doesnt know +// this, so preset jpnum to make it happy: + + jpnum = 0; + + +// COUNT POPULATION FOR A BITMAP BRANCH, in whichever direction should result +// in fewer cache line fills: +// +// Note: If the remainder of Index is zero, pop1above is the pop1 of the +// entire expanse and theres no point in recursing to lower levels; but this +// should be so rare that its not worth checking for; +// Judy1Count()/JudyLCount() never even calls the motor for Index == 0 (all +// bytes). + + +// COUNT UPWARD, subtracting each "below or at" JPs pop1 from the whole +// expanses pop1: +// +// Note: If this causes clbelow + 1 cache line fills including JPs cache +// line, thats OK; at worst this is the same as clabove. + + if (clbelow < clabove) + { +#ifdef SMARTMETRICS + ++jbb_upward; +#endif + pop1above = pop1; // subtract JPs at/below Index. + +// Count JPs for which to accrue pop1s in this subexpanse: +// +// TBD: If JU_JBB_BITMAP is cJU_FULLBITMAPB, dont bother counting. + + for (subexp = 0; subexp <= findsub; ++subexp) + { + jpcount = j__udyCountBitsB((subexp < findsub) ? + JU_JBB_BITMAP(Pjbb, subexp) : + JU_JBB_BITMAP(Pjbb, subexp) & lowermask); + + // should always find findbit: + assert((subexp < findsub) || jpcount); + +// Subtract pop1s from JPs BELOW OR AT Index (digit): +// +// Note: The pop1 for Indexs JP itself is partially added back later at a +// lower state. +// +// Note: An empty subexpanse (jpcount == 0) is handled "for free". +// +// Note: Must be null JP subexp pointer in empty subexpanse and non-empty in +// non-empty subexpanse: + + assert( jpcount || (BMPJP0(subexp) == (Pjp_t) NULL)); + assert((! jpcount) || (BMPJP0(subexp) != (Pjp_t) NULL)); + + for (jpnum = 0; jpnum < jpcount; ++jpnum) + { + if ((pop1 = j__udyJPPop1(BMPJP(subexp, jpnum))) + == cJU_ALLONES) + { + JU_SET_ERRNO_NONNULL(Pjpm, JU_ERRNO_CORRUPT); + return(C_JERR); + } + + pop1above -= pop1; + } + + jpnum = jpcount - 1; // make correct for digit. + } + } + +// COUNT DOWNWARD, adding each "above" JPs pop1: + + else + { + long jpcountbf; // below findbit, inclusive. +#ifdef SMARTMETRICS + ++jbb_downward; +#endif + pop1above = 0; // add JPs above Index. + jpcountbf = 0; // until subexp == findsub. + +// Count JPs for which to accrue pop1s in this subexpanse: +// +// This is more complicated than counting upward because the scan of digits +// subexpanse must count ALL JPs, to know where to START counting down, and +// ALSO note the offset of digits JP to know where to STOP counting down. + + for (subexp = cJU_NUMSUBEXPB - 1; subexp >= findsub; --subexp) + { + jpcount = j__udyCountBitsB(JU_JBB_BITMAP(Pjbb, subexp)); + + // should always find findbit: + assert((subexp > findsub) || jpcount); + + if (! jpcount) continue; // empty subexpanse, save time. + +// Count JPs below digit, inclusive: + + if (subexp == findsub) + { + jpcountbf = j__udyCountBitsB(JU_JBB_BITMAP(Pjbb, subexp) + & lowermask); + } + + // should always find findbit: + assert((subexp > findsub) || jpcountbf); + assert(jpcount >= jpcountbf); // proper relationship. 
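+
+// In the subexpanse containing digit, jpcount is the total JPs present and
+// jpcountbf is how many of them are at or below digit (inclusive); e.g. with
+// jpcount == 6 and jpcountbf == 2, the loop below visits jpnum 5, 4, 3, 2,
+// exactly the JPs above digit.  In subexpanses above findsub, jpcountbf
+// remains 0, so every JP is visited.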
+ +// Add pop1s from JPs ABOVE Index (digit): + + // no null JP subexp pointers: + assert(BMPJP0(subexp) != (Pjp_t) NULL); + + for (jpnum = jpcount - 1; jpnum >= jpcountbf; --jpnum) + { + if ((pop1 = j__udyJPPop1(BMPJP(subexp, jpnum))) + == cJU_ALLONES) + { + JU_SET_ERRNO_NONNULL(Pjpm, JU_ERRNO_CORRUPT); + return(C_JERR); + } + + pop1above += pop1; + } + // jpnum is now correct for digit. + } + } // else. + +// Return the net population ABOVE the digits JP at this state (in this JBB) +// plus the population AT OR ABOVE Index in the SM under the digits JP: + + pop1 = j__udy1LCountSM(BMPJP(findsub, jpnum), Index, Pjpm); + if (pop1 == C_JERR) return(C_JERR); // pass error up. + + assert(pop1above + pop1); + return(pop1above + pop1); + + } // case. + + +// ---------------------------------------------------------------------------- +// UNCOMPRESSED BRANCH; count populations in JPs in the JBU ABOVE the next +// digit in Index, and recurse for the next digit in Index: +// +// Note: If the remainder of Index is zero, pop1above is the pop1 of the +// entire expanse and theres no point in recursing to lower levels; but this +// should be so rare that its not worth checking for; +// Judy1Count()/JudyLCount() never even calls the motor for Index == 0 (all +// bytes). + + case cJU_JPBRANCH_U2: CHECKDCD(Pjp, 2); PREPB(Pjp, 2, BranchU); + case cJU_JPBRANCH_U3: CHECKDCD(Pjp, 3); PREPB(Pjp, 3, BranchU); +#ifdef JU_64BIT + case cJU_JPBRANCH_U4: CHECKDCD(Pjp, 4); PREPB(Pjp, 4, BranchU); + case cJU_JPBRANCH_U5: CHECKDCD(Pjp, 5); PREPB(Pjp, 5, BranchU); + case cJU_JPBRANCH_U6: CHECKDCD(Pjp, 6); PREPB(Pjp, 6, BranchU); + case cJU_JPBRANCH_U7: CHECKDCD(Pjp, 7); PREPB(Pjp, 7, BranchU); +#endif + case cJU_JPBRANCH_U: PREPB_ROOT(Pjp, BranchU); + +// Common code (state-independent) for all cases of uncompressed branches: + +BranchU: + Pjbu = P_JBU(Pjp->jp_Addr); + +#ifndef NOSMARTJBU // enable to turn off smart code for comparison purposes. + +// FIGURE OUT WHICH WAY CAUSES FEWER CACHE LINE FILLS; adding the JPs above +// Indexs JP, or subtracting the JPs below Indexs JP. +// +// COUNT UPWARD, subtracting the pop1 of each JP BELOW OR AT Index, from the +// whole expanses pop1: + + if (digit < (cJU_BRANCHUNUMJPS / 2)) + { + pop1above = pop1; // subtract JPs below Index. +#ifdef SMARTMETRICS + ++jbu_upward; +#endif + for (jpnum = 0; jpnum <= digit; ++jpnum) + { + if ((Pjbu->jbu_jp[jpnum].jp_Type) <= cJU_JPNULLMAX) + continue; // shortcut, save a function call. + + if ((pop1 = j__udyJPPop1(Pjbu->jbu_jp + jpnum)) + == cJU_ALLONES) + { + JU_SET_ERRNO_NONNULL(Pjpm, JU_ERRNO_CORRUPT); + return(C_JERR); + } + + pop1above -= pop1; + } + } + +// COUNT DOWNWARD, simply adding the pop1 of each JP ABOVE Index: + + else +#endif // NOSMARTJBU + { + assert(digit < cJU_BRANCHUNUMJPS); +#ifdef SMARTMETRICS + ++jbu_downward; +#endif + pop1above = 0; // add JPs above Index. + + for (jpnum = cJU_BRANCHUNUMJPS - 1; jpnum > digit; --jpnum) + { + if ((Pjbu->jbu_jp[jpnum].jp_Type) <= cJU_JPNULLMAX) + continue; // shortcut, save a function call. + + if ((pop1 = j__udyJPPop1(Pjbu->jbu_jp + jpnum)) + == cJU_ALLONES) + { + JU_SET_ERRNO_NONNULL(Pjpm, JU_ERRNO_CORRUPT); + return(C_JERR); + } + + pop1above += pop1; + } + } + + if ((pop1 = j__udy1LCountSM(Pjbu->jbu_jp + digit, Index, Pjpm)) + == C_JERR) return(C_JERR); // pass error up. 
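+
+// As in the BranchL and BranchB cases, the result is the net population of
+// the JPs ABOVE digit in this branch plus the population AT OR ABOVE Index in
+// the subtree under digits JP; the sum is never 0 because Index itself is
+// always counted.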
+ + assert(pop1above + pop1); + return(pop1above + pop1); + + +// ---------------------------------------------------------------------------- +// LEAF COUNT MACROS: +// +// LEAF*ABOVE() are common code for different JP types (linear leaves, bitmap +// leaves, and immediates) and different leaf Index Sizes, which result in +// calling different leaf search functions. Linear leaves get the leaf address +// from jp_Addr and the Population from jp_DcdPopO, while immediates use Pjp +// itself as the leaf address and get Population from jp_Type. + +#define LEAFLABOVE(Func) \ + Pjll = P_JLL(Pjp->jp_Addr); \ + pop1 = JU_JPLEAF_POP0(Pjp) + 1; \ + LEAFABOVE(Func, Pjll, pop1) + +#define LEAFB1ABOVE(Func) LEAFLABOVE(Func) // different Func, otherwise same. + +#ifdef JUDY1 +#define IMMABOVE(Func,Pop1) \ + Pjll = (Pjll_t) Pjp; \ + LEAFABOVE(Func, Pjll, Pop1) +#else +// Note: For JudyL immediates with >= 2 Indexes, the index bytes are in a +// different place than for Judy1: + +#define IMMABOVE(Func,Pop1) \ + LEAFABOVE(Func, (Pjll_t) (Pjp->jp_LIndex), Pop1) +#endif + +// For all leaf types, the population AT OR ABOVE is the total pop1 less the +// offset of Index; and Index should always be found: + +#define LEAFABOVE(Func,Pjll,Pop1) \ + offset = Func(Pjll, Pop1, Index); \ + assert(offset >= 0); \ + assert(offset < (Pop1)); \ + return((Pop1) - offset) + +// IMMABOVE_01 handles the special case of an immediate JP with 1 index, which +// the search functions arent used for anyway: +// +// The target Index should be the one in this Immediate, in which case the +// count above (inclusive) is always 1. + +#define IMMABOVE_01 \ + assert((JU_JPDCDPOP0(Pjp)) == JU_TRIMTODCDSIZE(Index)); \ + return(1) + + +// ---------------------------------------------------------------------------- +// LINEAR LEAF; search the leaf for Index; size is computed from jp_Type: + +#if (defined(JUDYL) || (! defined(JU_64BIT))) + case cJU_JPLEAF1: LEAFLABOVE(j__udySearchLeaf1); +#endif + case cJU_JPLEAF2: LEAFLABOVE(j__udySearchLeaf2); + case cJU_JPLEAF3: LEAFLABOVE(j__udySearchLeaf3); + +#ifdef JU_64BIT + case cJU_JPLEAF4: LEAFLABOVE(j__udySearchLeaf4); + case cJU_JPLEAF5: LEAFLABOVE(j__udySearchLeaf5); + case cJU_JPLEAF6: LEAFLABOVE(j__udySearchLeaf6); + case cJU_JPLEAF7: LEAFLABOVE(j__udySearchLeaf7); +#endif + + +// ---------------------------------------------------------------------------- +// BITMAP LEAF; search the leaf for Index: +// +// Since the bitmap describes Indexes digitally rather than linearly, this is +// not really a search, but just a count. + + case cJU_JPLEAF_B1: LEAFB1ABOVE(j__udyCountLeafB1); + + +#ifdef JUDY1 +// ---------------------------------------------------------------------------- +// FULL POPULATION: +// +// Return the count of Indexes AT OR ABOVE Index, which is the total population +// of the expanse (a constant) less the value of the undecoded digit remaining +// in Index (its base-0 offset in the expanse), which yields an inclusive count +// above. +// +// TBD: This only supports a 1-byte full expanse. Should this extract a +// stored value for pop0 and possibly more LSBs of Index, to handle larger full +// expanses? 
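+//
+// Example: in a full 1-byte expanse (pop1 == cJU_JPFULLPOPU1_POP0 + 1 == 256),
+// an Index whose low digit is 0x20 (== 32) yields 256 - 32 == 224 indexes at
+// or above Index, inclusive.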
+ + case cJ1_JPFULLPOPU1: + return(cJU_JPFULLPOPU1_POP0 + 1 - JU_DIGITATSTATE(Index, 1)); +#endif + + +// ---------------------------------------------------------------------------- +// IMMEDIATE: + + case cJU_JPIMMED_1_01: IMMABOVE_01; + case cJU_JPIMMED_2_01: IMMABOVE_01; + case cJU_JPIMMED_3_01: IMMABOVE_01; +#ifdef JU_64BIT + case cJU_JPIMMED_4_01: IMMABOVE_01; + case cJU_JPIMMED_5_01: IMMABOVE_01; + case cJU_JPIMMED_6_01: IMMABOVE_01; + case cJU_JPIMMED_7_01: IMMABOVE_01; +#endif + + case cJU_JPIMMED_1_02: IMMABOVE(j__udySearchLeaf1, 2); + case cJU_JPIMMED_1_03: IMMABOVE(j__udySearchLeaf1, 3); +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_1_04: IMMABOVE(j__udySearchLeaf1, 4); + case cJU_JPIMMED_1_05: IMMABOVE(j__udySearchLeaf1, 5); + case cJU_JPIMMED_1_06: IMMABOVE(j__udySearchLeaf1, 6); + case cJU_JPIMMED_1_07: IMMABOVE(j__udySearchLeaf1, 7); +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_1_08: IMMABOVE(j__udySearchLeaf1, 8); + case cJ1_JPIMMED_1_09: IMMABOVE(j__udySearchLeaf1, 9); + case cJ1_JPIMMED_1_10: IMMABOVE(j__udySearchLeaf1, 10); + case cJ1_JPIMMED_1_11: IMMABOVE(j__udySearchLeaf1, 11); + case cJ1_JPIMMED_1_12: IMMABOVE(j__udySearchLeaf1, 12); + case cJ1_JPIMMED_1_13: IMMABOVE(j__udySearchLeaf1, 13); + case cJ1_JPIMMED_1_14: IMMABOVE(j__udySearchLeaf1, 14); + case cJ1_JPIMMED_1_15: IMMABOVE(j__udySearchLeaf1, 15); +#endif + +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_2_02: IMMABOVE(j__udySearchLeaf2, 2); + case cJU_JPIMMED_2_03: IMMABOVE(j__udySearchLeaf2, 3); +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_2_04: IMMABOVE(j__udySearchLeaf2, 4); + case cJ1_JPIMMED_2_05: IMMABOVE(j__udySearchLeaf2, 5); + case cJ1_JPIMMED_2_06: IMMABOVE(j__udySearchLeaf2, 6); + case cJ1_JPIMMED_2_07: IMMABOVE(j__udySearchLeaf2, 7); +#endif + +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_3_02: IMMABOVE(j__udySearchLeaf3, 2); +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_3_03: IMMABOVE(j__udySearchLeaf3, 3); + case cJ1_JPIMMED_3_04: IMMABOVE(j__udySearchLeaf3, 4); + case cJ1_JPIMMED_3_05: IMMABOVE(j__udySearchLeaf3, 5); + + case cJ1_JPIMMED_4_02: IMMABOVE(j__udySearchLeaf4, 2); + case cJ1_JPIMMED_4_03: IMMABOVE(j__udySearchLeaf4, 3); + + case cJ1_JPIMMED_5_02: IMMABOVE(j__udySearchLeaf5, 2); + case cJ1_JPIMMED_5_03: IMMABOVE(j__udySearchLeaf5, 3); + + case cJ1_JPIMMED_6_02: IMMABOVE(j__udySearchLeaf6, 2); + + case cJ1_JPIMMED_7_02: IMMABOVE(j__udySearchLeaf7, 2); +#endif + + +// ---------------------------------------------------------------------------- +// OTHER CASES: + + default: JU_SET_ERRNO_NONNULL(Pjpm, JU_ERRNO_CORRUPT); return(C_JERR); + + } // switch on JP type + + /*NOTREACHED*/ + +} // j__udy1LCountSM() + + +// **************************************************************************** +// J U D Y C O U N T L E A F B 1 +// +// This is a private analog of the j__udySearchLeaf*() functions for counting +// in bitmap 1-byte leaves. Since a bitmap leaf describes Indexes digitally +// rather than linearly, this is not really a search, but just a count of the +// valid Indexes == set bits below or including Index, which should be valid. +// Return the "offset" (really the ordinal), 0 .. Pop1 - 1, of Index in Pjll; +// if Indexs bit is not set (which should never happen, so this is DEBUG-mode +// only), return the 1s-complement equivalent (== negative offset minus 1). 
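+// For example, an Index whose bit is the 4th set bit in the leaf gets offset
+// 3; if that bit were (erroneously) clear, the DEBUG-mode return would be
+// ~3 == -4, which callers can recognize as negative.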
+// +// Note: The source code for this function looks identical for both Judy1 and +// JudyL, but the JU_JLB_BITMAP macro varies. +// +// Note: For simpler calling, the first arg is of type Pjll_t but then cast to +// Pjlb_t. + +FUNCTION static int j__udyCountLeafB1( +const Pjll_t Pjll, // bitmap leaf, as Pjll_t for consistency. +const Word_t Pop1, // Population of whole leaf. +const Word_t Index) // to which to count. +{ + Pjlb_t Pjlb = (Pjlb_t) Pjll; // to proper type. + Word_t digit = Index & cJU_MASKATSTATE(1); + Word_t findsub = digit / cJU_BITSPERSUBEXPL; + Word_t findbit = digit % cJU_BITSPERSUBEXPL; + int count; // in leaf through Index. + long subexp; // for stepping through subexpanses. + + +// COUNT UPWARD: +// +// The entire bitmap should fit in one cache line, but still try to save some +// CPU time by counting the fewest possible number of subexpanses from the +// bitmap. + +#ifndef NOSMARTJLB // enable to turn off smart code for comparison purposes. + + if (findsub < (cJU_NUMSUBEXPL / 2)) + { +#ifdef SMARTMETRICS + ++jlb_upward; +#endif + count = 0; + + for (subexp = 0; subexp < findsub; ++subexp) + { + count += ((JU_JLB_BITMAP(Pjlb, subexp) == cJU_FULLBITMAPL) ? + cJU_BITSPERSUBEXPL : + j__udyCountBitsL(JU_JLB_BITMAP(Pjlb, subexp))); + } + +// This count includes findbit, which should be set, resulting in a base-1 +// offset: + + count += j__udyCountBitsL(JU_JLB_BITMAP(Pjlb, findsub) + & JU_MASKLOWERINC(JU_BITPOSMASKL(findbit))); + + DBGCODE(if (! JU_BITMAPTESTL(Pjlb, digit)) return(~count);) + assert(count >= 1); + return(count - 1); // convert to base-0 offset. + } +#endif // NOSMARTJLB + + +// COUNT DOWNWARD: +// +// Count the valid Indexes above or at Index, and subtract from Pop1. + +#ifdef SMARTMETRICS + ++jlb_downward; +#endif + count = Pop1; // base-1 for now. + + for (subexp = cJU_NUMSUBEXPL - 1; subexp > findsub; --subexp) + { + count -= ((JU_JLB_BITMAP(Pjlb, subexp) == cJU_FULLBITMAPL) ? + cJU_BITSPERSUBEXPL : + j__udyCountBitsL(JU_JLB_BITMAP(Pjlb, subexp))); + } + +// This count includes findbit, which should be set, resulting in a base-0 +// offset: + + count -= j__udyCountBitsL(JU_JLB_BITMAP(Pjlb, findsub) + & JU_MASKHIGHERINC(JU_BITPOSMASKL(findbit))); + + DBGCODE(if (! JU_BITMAPTESTL(Pjlb, digit)) return(~count);) + assert(count >= 0); // should find Index itself. + return(count); // is already a base-0 offset. + +} // j__udyCountLeafB1() + + +// **************************************************************************** +// J U D Y J P P O P 1 +// +// This function takes any type of JP other than a root-level JP (cJU_LEAFW* or +// cJU_JPBRANCH* with no number suffix) and extracts the Pop1 from it. In some +// sense this is a wrapper around the JU_JP*_POP0 macros. Why write it as a +// function instead of a complex macro containing a trinary? (See version +// Judy1.h version 4.17.) We think its cheaper to call a function containing +// a switch statement with "constant" cases than to do the variable +// calculations in a trinary. +// +// For invalid JP Types return cJU_ALLONES. Note that this is an impossibly +// high Pop1 for any JP below a top level branch. + +FUNCTION Word_t j__udyJPPop1( +const Pjp_t Pjp) // JP to count. 
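+// (Summary of the mapping below: branch JPs decode Pop0 from their
+// level-dependent Dcd/Pop0 field, leaf JPs use JU_JPLEAF_POP0(), and
+// immediates encode their Pop1 directly in jp_Type.)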
+{ + switch (JU_JPTYPE(Pjp)) + { +#ifdef notdef // caller should shortcut and not even call with these: + + case cJU_JPNULL1: + case cJU_JPNULL2: + case cJU_JPNULL3: return(0); +#ifdef JU_64BIT + case cJU_JPNULL4: + case cJU_JPNULL5: + case cJU_JPNULL6: + case cJU_JPNULL7: return(0); +#endif +#endif // notdef + + case cJU_JPBRANCH_L2: + case cJU_JPBRANCH_B2: + case cJU_JPBRANCH_U2: return(JU_JPBRANCH_POP0(Pjp,2) + 1); + + case cJU_JPBRANCH_L3: + case cJU_JPBRANCH_B3: + case cJU_JPBRANCH_U3: return(JU_JPBRANCH_POP0(Pjp,3) + 1); + +#ifdef JU_64BIT + case cJU_JPBRANCH_L4: + case cJU_JPBRANCH_B4: + case cJU_JPBRANCH_U4: return(JU_JPBRANCH_POP0(Pjp,4) + 1); + + case cJU_JPBRANCH_L5: + case cJU_JPBRANCH_B5: + case cJU_JPBRANCH_U5: return(JU_JPBRANCH_POP0(Pjp,5) + 1); + + case cJU_JPBRANCH_L6: + case cJU_JPBRANCH_B6: + case cJU_JPBRANCH_U6: return(JU_JPBRANCH_POP0(Pjp,6) + 1); + + case cJU_JPBRANCH_L7: + case cJU_JPBRANCH_B7: + case cJU_JPBRANCH_U7: return(JU_JPBRANCH_POP0(Pjp,7) + 1); +#endif + +#if (defined(JUDYL) || (! defined(JU_64BIT))) + case cJU_JPLEAF1: +#endif + case cJU_JPLEAF2: + case cJU_JPLEAF3: +#ifdef JU_64BIT + case cJU_JPLEAF4: + case cJU_JPLEAF5: + case cJU_JPLEAF6: + case cJU_JPLEAF7: +#endif + case cJU_JPLEAF_B1: return(JU_JPLEAF_POP0(Pjp) + 1); + +#ifdef JUDY1 + case cJ1_JPFULLPOPU1: return(cJU_JPFULLPOPU1_POP0 + 1); +#endif + + case cJU_JPIMMED_1_01: + case cJU_JPIMMED_2_01: + case cJU_JPIMMED_3_01: return(1); +#ifdef JU_64BIT + case cJU_JPIMMED_4_01: + case cJU_JPIMMED_5_01: + case cJU_JPIMMED_6_01: + case cJU_JPIMMED_7_01: return(1); +#endif + + case cJU_JPIMMED_1_02: return(2); + case cJU_JPIMMED_1_03: return(3); +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_1_04: return(4); + case cJU_JPIMMED_1_05: return(5); + case cJU_JPIMMED_1_06: return(6); + case cJU_JPIMMED_1_07: return(7); +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_1_08: return(8); + case cJ1_JPIMMED_1_09: return(9); + case cJ1_JPIMMED_1_10: return(10); + case cJ1_JPIMMED_1_11: return(11); + case cJ1_JPIMMED_1_12: return(12); + case cJ1_JPIMMED_1_13: return(13); + case cJ1_JPIMMED_1_14: return(14); + case cJ1_JPIMMED_1_15: return(15); +#endif + +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_2_02: return(2); + case cJU_JPIMMED_2_03: return(3); +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_2_04: return(4); + case cJ1_JPIMMED_2_05: return(5); + case cJ1_JPIMMED_2_06: return(6); + case cJ1_JPIMMED_2_07: return(7); +#endif + +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_3_02: return(2); +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_3_03: return(3); + case cJ1_JPIMMED_3_04: return(4); + case cJ1_JPIMMED_3_05: return(5); + + case cJ1_JPIMMED_4_02: return(2); + case cJ1_JPIMMED_4_03: return(3); + + case cJ1_JPIMMED_5_02: return(2); + case cJ1_JPIMMED_5_03: return(3); + + case cJ1_JPIMMED_6_02: return(2); + + case cJ1_JPIMMED_7_02: return(2); +#endif + + default: return(cJU_ALLONES); + } + + /*NOTREACHED*/ + +} // j__udyJPPop1() diff --git a/src/libnetdata/libjudy/src/JudyL/JudyLCreateBranch.c b/src/libnetdata/libjudy/src/JudyL/JudyLCreateBranch.c new file mode 100644 index 00000000..ffe6b3bd --- /dev/null +++ b/src/libnetdata/libjudy/src/JudyL/JudyLCreateBranch.c @@ -0,0 +1,314 @@ +// Copyright (C) 2000 - 2002 Hewlett-Packard Company +// +// This program is free software; you can redistribute it and/or modify it +// under the term of the GNU Lesser General Public License as published by the +// Free 
Software Foundation; either version 2 of the License, or (at your +// option) any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License +// for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this program; if not, write to the Free Software Foundation, +// Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// _________________ + +// @(#) $Revision: 4.26 $ $Source: /judy/src/JudyCommon/JudyCreateBranch.c $ + +// Branch creation functions for Judy1 and JudyL. +// Compile with one of -DJUDY1 or -DJUDYL. + +#if (! (defined(JUDY1) || defined(JUDYL))) +#error: One of -DJUDY1 or -DJUDYL must be specified. +#endif + +#ifdef JUDY1 +#include "Judy1.h" +#else +#include "JudyL.h" +#endif + +#include "JudyPrivate1L.h" + + +// **************************************************************************** +// J U D Y C R E A T E B R A N C H L +// +// Build a BranchL from an array of JPs and associated 1 byte digits +// (expanses). Return with Pjp pointing to the BranchL. Caller must +// deallocate passed arrays, if necessary. +// +// We have no idea what kind of BranchL it is, so caller must set the jp_Type. +// +// Return -1 if error (details in Pjpm), otherwise return 1. + +FUNCTION int j__udyCreateBranchL( + Pjp_t Pjp, // Build JPs from this place + Pjp_t PJPs, // Array of JPs to put into Bitmap branch + uint8_t Exp[], // Array of expanses to put into bitmap + Word_t ExpCnt, // Number of above JPs and Expanses + Pvoid_t Pjpm) +{ + Pjbl_t PjblRaw; // pointer to linear branch. + Pjbl_t Pjbl; + + assert(ExpCnt <= cJU_BRANCHLMAXJPS); + + PjblRaw = j__udyAllocJBL(Pjpm); + if (PjblRaw == (Pjbl_t) NULL) return(-1); + Pjbl = P_JBL(PjblRaw); + +// Build a Linear Branch + Pjbl->jbl_NumJPs = ExpCnt; + +// Copy from the Linear branch from splayed leaves + JU_COPYMEM(Pjbl->jbl_Expanse, Exp, ExpCnt); + JU_COPYMEM(Pjbl->jbl_jp, PJPs, ExpCnt); + +// Pass back new pointer to the Linear branch in JP + Pjp->jp_Addr = (Word_t) PjblRaw; + + return(1); + +} // j__udyCreateBranchL() + + +// **************************************************************************** +// J U D Y C R E A T E B R A N C H B +// +// Build a BranchB from an array of JPs and associated 1 byte digits +// (expanses). Return with Pjp pointing to the BranchB. Caller must +// deallocate passed arrays, if necessary. +// +// We have no idea what kind of BranchB it is, so caller must set the jp_Type. +// +// Return -1 if error (details in Pjpm), otherwise return 1. + +FUNCTION int j__udyCreateBranchB( + Pjp_t Pjp, // Build JPs from this place + Pjp_t PJPs, // Array of JPs to put into Bitmap branch + uint8_t Exp[], // Array of expanses to put into bitmap + Word_t ExpCnt, // Number of above JPs and Expanses + Pvoid_t Pjpm) +{ + Pjbb_t PjbbRaw; // pointer to bitmap branch. + Pjbb_t Pjbb; + Word_t ii, jj; // Temps + uint8_t CurrSubExp; // Current sub expanse for BM + +// This assertion says the number of populated subexpanses is not too large. +// This function is only called when a BranchL overflows to a BranchB or when a +// cascade occurs, meaning a leaf overflows. Either way ExpCnt cant be very +// large, in fact a lot smaller than cJU_BRANCHBMAXJPS. (Otherwise a BranchU +// would be used.) 
Popping this assertion means something (unspecified) has +// gone very wrong, or else Judys design criteria have changed, although in +// fact there should be no HARM in creating a BranchB with higher actual +// fanout. + + assert(ExpCnt <= cJU_BRANCHBMAXJPS); + +// Get memory for a Bitmap branch + PjbbRaw = j__udyAllocJBB(Pjpm); + if (PjbbRaw == (Pjbb_t) NULL) return(-1); + Pjbb = P_JBB(PjbbRaw); + +// Get 1st "sub" expanse (0..7) of bitmap branch + CurrSubExp = Exp[0] / cJU_BITSPERSUBEXPB; + +// Index thru all 1 byte sized expanses: + + for (jj = ii = 0; ii <= ExpCnt; ii++) + { + Word_t SubExp; // Cannot be a uint8_t + +// Make sure we cover the last one + if (ii == ExpCnt) + { + SubExp = cJU_ALLONES; // Force last one + } + else + { +// Calculate the "sub" expanse of the byte expanse + SubExp = Exp[ii] / cJU_BITSPERSUBEXPB; // Bits 5..7. + +// Set the bit that represents the expanse in Exp[] + JU_JBB_BITMAP(Pjbb, SubExp) |= JU_BITPOSMASKB(Exp[ii]); + } +// Check if a new "sub" expanse range needed + if (SubExp != CurrSubExp) + { +// Get number of JPs in this sub expanse + Word_t NumJP = ii - jj; + Pjp_t PjpRaw; + Pjp_t Pjp; + + PjpRaw = j__udyAllocJBBJP(NumJP, Pjpm); + Pjp = P_JP(PjpRaw); + + if (PjpRaw == (Pjp_t) NULL) // out of memory. + { + +// Free any previous allocations: + + while(CurrSubExp--) + { + NumJP = j__udyCountBitsB(JU_JBB_BITMAP(Pjbb, + CurrSubExp)); + if (NumJP) + { + j__udyFreeJBBJP(JU_JBB_PJP(Pjbb, + CurrSubExp), NumJP, Pjpm); + } + } + j__udyFreeJBB(PjbbRaw, Pjpm); + return(-1); + } + +// Place the array of JPs in bitmap branch: + + JU_JBB_PJP(Pjbb, CurrSubExp) = PjpRaw; + +// Copy the JPs to new leaf: + + JU_COPYMEM(Pjp, PJPs + jj, NumJP); + +// On to the next bitmap branch "sub" expanse: + + jj = ii; + CurrSubExp = SubExp; + } + } // for each 1-byte expanse + +// Pass back some of the JP to the new Bitmap branch: + + Pjp->jp_Addr = (Word_t) PjbbRaw; + + return(1); + +} // j__udyCreateBranchB() + + +// **************************************************************************** +// J U D Y C R E A T E B R A N C H U +// +// Build a BranchU from a BranchB. Return with Pjp pointing to the BranchU. +// Free the BranchB and its JP subarrays. +// +// Return -1 if error (details in Pjpm), otherwise return 1. 
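+//
+// Sketch of the conversion: every one of the cJU_BRANCHUNUMJPS (256) JP slots
+// in the new BranchU is filled, either with a JP copied from the BranchB
+// (where the corresponding bitmap bit is set) or with a level-appropriate
+// null JP; the BranchBs JP subarrays, and then the BranchB itself, are freed.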
+ +FUNCTION int j__udyCreateBranchU( + Pjp_t Pjp, + Pvoid_t Pjpm) +{ + jp_t JPNull; + Pjbu_t PjbuRaw; + Pjbu_t Pjbu; + Pjbb_t PjbbRaw; + Pjbb_t Pjbb; + Word_t ii, jj; + BITMAPB_t BitMap; + Pjp_t PDstJP; +#ifdef JU_STAGED_EXP + jbu_t BranchU; // Staged uncompressed branch +#else + +// Allocate memory for a BranchU: + + PjbuRaw = j__udyAllocJBU(Pjpm); + if (PjbuRaw == (Pjbu_t) NULL) return(-1); + Pjbu = P_JBU(PjbuRaw); +#endif + JU_JPSETADT(&JPNull, 0, 0, JU_JPTYPE(Pjp) - cJU_JPBRANCH_B2 + cJU_JPNULL1); + +// Get the pointer to the BranchB: + + PjbbRaw = (Pjbb_t) (Pjp->jp_Addr); + Pjbb = P_JBB(PjbbRaw); + +// Set the pointer to the Uncompressed branch +#ifdef JU_STAGED_EXP + PDstJP = BranchU.jbu_jp; +#else + PDstJP = Pjbu->jbu_jp; +#endif + for (ii = 0; ii < cJU_NUMSUBEXPB; ii++) + { + Pjp_t PjpA; + Pjp_t PjpB; + + PjpB = PjpA = P_JP(JU_JBB_PJP(Pjbb, ii)); + +// Get the bitmap for this subexpanse + BitMap = JU_JBB_BITMAP(Pjbb, ii); + +// NULL empty subexpanses + if (BitMap == 0) + { +// But, fill with NULLs + for (jj = 0; jj < cJU_BITSPERSUBEXPB; jj++) + { + PDstJP[jj] = JPNull; + } + PDstJP += cJU_BITSPERSUBEXPB; + continue; + } +// Check if Uncompressed subexpanse + if (BitMap == cJU_FULLBITMAPB) + { +// Copy subexpanse to the Uncompressed branch intact + JU_COPYMEM(PDstJP, PjpA, cJU_BITSPERSUBEXPB); + +// Bump to next subexpanse + PDstJP += cJU_BITSPERSUBEXPB; + +// Set length of subexpanse + jj = cJU_BITSPERSUBEXPB; + } + else + { + for (jj = 0; jj < cJU_BITSPERSUBEXPB; jj++) + { +// Copy JP or NULLJP depending on bit + if (BitMap & 1) { *PDstJP = *PjpA++; } + else { *PDstJP = JPNull; } + + PDstJP++; // advance to next JP + BitMap >>= 1; + } + jj = PjpA - PjpB; + } + +// Free the subexpanse: + + j__udyFreeJBBJP(JU_JBB_PJP(Pjbb, ii), jj, Pjpm); + + } // for each JP in BranchU + +#ifdef JU_STAGED_EXP + +// Allocate memory for a BranchU: + + PjbuRaw = j__udyAllocJBU(Pjpm); + if (PjbuRaw == (Pjbu_t) NULL) return(-1); + Pjbu = P_JBU(PjbuRaw); + +// Copy staged branch to newly allocated branch: +// +// TBD: I think this code is broken. + + *Pjbu = BranchU; + +#endif // JU_STAGED_EXP + +// Finally free the BranchB and put the BranchU in its place: + + j__udyFreeJBB(PjbbRaw, Pjpm); + + Pjp->jp_Addr = (Word_t) PjbuRaw; + Pjp->jp_Type += cJU_JPBRANCH_U - cJU_JPBRANCH_B; + + return(1); + +} // j__udyCreateBranchU() diff --git a/src/libnetdata/libjudy/src/JudyL/JudyLDecascade.c b/src/libnetdata/libjudy/src/JudyL/JudyLDecascade.c new file mode 100644 index 00000000..c2bf81ea --- /dev/null +++ b/src/libnetdata/libjudy/src/JudyL/JudyLDecascade.c @@ -0,0 +1,1208 @@ +// Copyright (C) 2000 - 2002 Hewlett-Packard Company +// +// This program is free software; you can redistribute it and/or modify it +// under the term of the GNU Lesser General Public License as published by the +// Free Software Foundation; either version 2 of the License, or (at your +// option) any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License +// for more details. 
+// +// You should have received a copy of the GNU Lesser General Public License +// along with this program; if not, write to the Free Software Foundation, +// Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// _________________ + +// @(#) $Revision: 4.25 $ $Source: /judy/src/JudyCommon/JudyDecascade.c $ +// +// "Decascade" support functions for JudyDel.c: These functions convert +// smaller-index-size leaves to larger-index-size leaves, and also, bitmap +// leaves (LeafB1s) to Leaf1s, and some types of branches to smaller branches +// at the same index size. Some "decascading" occurs explicitly in JudyDel.c, +// but rare or large subroutines appear as functions here, and the overhead to +// call them is negligible. +// +// Compile with one of -DJUDY1 or -DJUDYL. Note: Function names are converted +// to Judy1 or JudyL specific values by external #defines. + +#if (! (defined(JUDY1) || defined(JUDYL))) +#error: One of -DJUDY1 or -DJUDYL must be specified. +#endif + +#ifdef JUDY1 +#include "Judy1.h" +#endif +#ifdef JUDYL +#include "JudyL.h" +#endif + +#include "JudyPrivate1L.h" + +DBGCODE(extern void JudyCheckSorted(Pjll_t Pjll, Word_t Pop1, long IndexSize);) + + +// **************************************************************************** +// __ J U D Y C O P Y 2 T O 3 +// +// Copy one or more 2-byte Indexes to a series of 3-byte Indexes. + +FUNCTION static void j__udyCopy2to3( + uint8_t * PDest, // to where to copy 3-byte Indexes. + uint16_t * PSrc, // from where to copy 2-byte indexes. + Word_t Pop1, // number of Indexes to copy. + Word_t MSByte) // most-significant byte, prefix to each Index. +{ + Word_t Temp; // for building 3-byte Index. + + assert(Pop1); + + do { + Temp = MSByte | *PSrc++; + JU_COPY3_LONG_TO_PINDEX(PDest, Temp); + PDest += 3; + } while (--Pop1); + +} // j__udyCopy2to3() + + +#ifdef JU_64BIT + +// **************************************************************************** +// __ J U D Y C O P Y 3 T O 4 +// +// Copy one or more 3-byte Indexes to a series of 4-byte Indexes. + +FUNCTION static void j__udyCopy3to4( + uint32_t * PDest, // to where to copy 4-byte Indexes. + uint8_t * PSrc, // from where to copy 3-byte indexes. + Word_t Pop1, // number of Indexes to copy. + Word_t MSByte) // most-significant byte, prefix to each Index. +{ + Word_t Temp; // for building 4-byte Index. + + assert(Pop1); + + do { + JU_COPY3_PINDEX_TO_LONG(Temp, PSrc); + Temp |= MSByte; + PSrc += 3; + *PDest++ = Temp; // truncates to uint32_t. + } while (--Pop1); + +} // j__udyCopy3to4() + + +// **************************************************************************** +// __ J U D Y C O P Y 4 T O 5 +// +// Copy one or more 4-byte Indexes to a series of 5-byte Indexes. + +FUNCTION static void j__udyCopy4to5( + uint8_t * PDest, // to where to copy 4-byte Indexes. + uint32_t * PSrc, // from where to copy 4-byte indexes. + Word_t Pop1, // number of Indexes to copy. + Word_t MSByte) // most-significant byte, prefix to each Index. +{ + Word_t Temp; // for building 5-byte Index. + + assert(Pop1); + + do { + Temp = MSByte | *PSrc++; + JU_COPY5_LONG_TO_PINDEX(PDest, Temp); + PDest += 5; + } while (--Pop1); + +} // j__udyCopy4to5() + + +// **************************************************************************** +// __ J U D Y C O P Y 5 T O 6 +// +// Copy one or more 5-byte Indexes to a series of 6-byte Indexes. + +FUNCTION static void j__udyCopy5to6( + uint8_t * PDest, // to where to copy 6-byte Indexes. + uint8_t * PSrc, // from where to copy 5-byte indexes. 
+ Word_t Pop1, // number of Indexes to copy. + Word_t MSByte) // most-significant byte, prefix to each Index. +{ + Word_t Temp; // for building 6-byte Index. + + assert(Pop1); + + do { + JU_COPY5_PINDEX_TO_LONG(Temp, PSrc); + Temp |= MSByte; + JU_COPY6_LONG_TO_PINDEX(PDest, Temp); + PSrc += 5; + PDest += 6; + } while (--Pop1); + +} // j__udyCopy5to6() + + +// **************************************************************************** +// __ J U D Y C O P Y 6 T O 7 +// +// Copy one or more 6-byte Indexes to a series of 7-byte Indexes. + +FUNCTION static void j__udyCopy6to7( + uint8_t * PDest, // to where to copy 6-byte Indexes. + uint8_t * PSrc, // from where to copy 5-byte indexes. + Word_t Pop1, // number of Indexes to copy. + Word_t MSByte) // most-significant byte, prefix to each Index. +{ + Word_t Temp; // for building 6-byte Index. + + assert(Pop1); + + do { + JU_COPY6_PINDEX_TO_LONG(Temp, PSrc); + Temp |= MSByte; + JU_COPY7_LONG_TO_PINDEX(PDest, Temp); + PSrc += 6; + PDest += 7; + } while (--Pop1); + +} // j__udyCopy6to7() + +#endif // JU_64BIT + + +#ifndef JU_64BIT // 32-bit + +// **************************************************************************** +// __ J U D Y C O P Y 3 T O W +// +// Copy one or more 3-byte Indexes to a series of longs (words, always 4-byte). + +FUNCTION static void j__udyCopy3toW( + PWord_t PDest, // to where to copy full-word Indexes. + uint8_t * PSrc, // from where to copy 3-byte indexes. + Word_t Pop1, // number of Indexes to copy. + Word_t MSByte) // most-significant byte, prefix to each Index. +{ + assert(Pop1); + + do { + JU_COPY3_PINDEX_TO_LONG(*PDest, PSrc); + *PDest++ |= MSByte; + PSrc += 3; + } while (--Pop1); + +} // j__udyCopy3toW() + + +#else // JU_64BIT + +// **************************************************************************** +// __ J U D Y C O P Y 7 T O W +// +// Copy one or more 7-byte Indexes to a series of longs (words, always 8-byte). + +FUNCTION static void j__udyCopy7toW( + PWord_t PDest, // to where to copy full-word Indexes. + uint8_t * PSrc, // from where to copy 7-byte indexes. + Word_t Pop1, // number of Indexes to copy. + Word_t MSByte) // most-significant byte, prefix to each Index. +{ + assert(Pop1); + + do { + JU_COPY7_PINDEX_TO_LONG(*PDest, PSrc); + *PDest++ |= MSByte; + PSrc += 7; + } while (--Pop1); + +} // j__udyCopy7toW() + +#endif // JU_64BIT + + +// **************************************************************************** +// __ J U D Y B R A N C H B T O B R A N C H L +// +// When a BranchB shrinks to have few enough JPs, call this function to convert +// it to a BranchL. Return 1 for success, or -1 for failure (with details in +// Pjpm). + +FUNCTION int j__udyBranchBToBranchL( + Pjp_t Pjp, // points to BranchB to shrink. + Pvoid_t Pjpm) // for global accounting. +{ + Pjbb_t PjbbRaw; // old BranchB to shrink. + Pjbb_t Pjbb; + Pjbl_t PjblRaw; // new BranchL to create. + Pjbl_t Pjbl; + Word_t Digit; // in BranchB. + Word_t NumJPs; // non-null JPs in BranchB. + uint8_t Expanse[cJU_BRANCHLMAXJPS]; // for building jbl_Expanse[]. + Pjp_t Pjpjbl; // current JP in BranchL. + Word_t SubExp; // in BranchB. 
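+
+// The conversion below is two passes: first gather, in ascending order, the
+// expanse digit of every set bit in the BranchB bitmap (at most
+// cJU_BRANCHLMAXJPS of them), then copy each non-empty subexpanses JP
+// subarray into the new BranchLs single JP array.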
+ + assert(JU_JPTYPE(Pjp) >= cJU_JPBRANCH_B2); + assert(JU_JPTYPE(Pjp) <= cJU_JPBRANCH_B); + + PjbbRaw = (Pjbb_t) (Pjp->jp_Addr); + Pjbb = P_JBB(PjbbRaw); + +// Copy 1-byte subexpanse digits from BranchB to temporary buffer for BranchL, +// for each bit set in the BranchB: +// +// TBD: The following supports variable-sized linear branches, but they are no +// longer variable; this could be simplified to save the copying. +// +// TBD: Since cJU_BRANCHLMAXJP == 7 now, and cJU_BRANCHUNUMJPS == 256, the +// following might be inefficient; is there a faster way to do it? At least +// skip wholly empty subexpanses? + + for (NumJPs = Digit = 0; Digit < cJU_BRANCHUNUMJPS; ++Digit) + { + if (JU_BITMAPTESTB(Pjbb, Digit)) + { + Expanse[NumJPs++] = Digit; + assert(NumJPs <= cJU_BRANCHLMAXJPS); // required of caller. + } + } + +// Allocate and populate the BranchL: + + if ((PjblRaw = j__udyAllocJBL(Pjpm)) == (Pjbl_t) NULL) return(-1); + Pjbl = P_JBL(PjblRaw); + + JU_COPYMEM(Pjbl->jbl_Expanse, Expanse, NumJPs); + + Pjbl->jbl_NumJPs = NumJPs; + DBGCODE(JudyCheckSorted((Pjll_t) (Pjbl->jbl_Expanse), NumJPs, 1);) + +// Copy JPs from each BranchB subexpanse subarray: + + Pjpjbl = P_JP(Pjbl->jbl_jp); // start at first JP in array. + + for (SubExp = 0; SubExp < cJU_NUMSUBEXPB; ++SubExp) + { + Pjp_t PjpRaw = JU_JBB_PJP(Pjbb, SubExp); // current Pjp. + Pjp_t Pjp; + + if (PjpRaw == (Pjp_t) NULL) continue; // skip empty subexpanse. + Pjp = P_JP(PjpRaw); + + NumJPs = j__udyCountBitsB(JU_JBB_BITMAP(Pjbb, SubExp)); + assert(NumJPs); + JU_COPYMEM(Pjpjbl, Pjp, NumJPs); // one subarray at a time. + + Pjpjbl += NumJPs; + j__udyFreeJBBJP(PjpRaw, NumJPs, Pjpm); // subarray. + } + j__udyFreeJBB(PjbbRaw, Pjpm); // BranchB itself. + +// Finish up: Calculate new JP type (same index size = level in new class), +// and tie new BranchB into parent JP: + + Pjp->jp_Type += cJU_JPBRANCH_L - cJU_JPBRANCH_B; + Pjp->jp_Addr = (Word_t) PjblRaw; + + return(1); + +} // j__udyBranchBToBranchL() + + +#ifdef notdef + +// **************************************************************************** +// __ J U D Y B R A N C H U T O B R A N C H B +// +// When a BranchU shrinks to need little enough memory, call this function to +// convert it to a BranchB to save memory (at the cost of some speed). Return +// 1 for success, or -1 for failure (with details in Pjpm). +// +// TBD: Fill out if/when needed. Not currently used in JudyDel.c for reasons +// explained there. + +FUNCTION int j__udyBranchUToBranchB( + Pjp_t Pjp, // points to BranchU to shrink. + Pvoid_t Pjpm) // for global accounting. +{ + assert(FALSE); + return(1); +} +#endif // notdef + + +#if (defined(JUDYL) || (! defined(JU_64BIT))) + +// **************************************************************************** +// __ J U D Y L E A F B 1 T O L E A F 1 +// +// Shrink a bitmap leaf (cJU_LEAFB1) to linear leaf (cJU_JPLEAF1). +// Return 1 for success, or -1 for failure (with details in Pjpm). +// +// Note: This function is different than the other JudyLeaf*ToLeaf*() +// functions because it receives a Pjp, not just a leaf, and handles its own +// allocation and free, in order to allow the caller to continue with a LeafB1 +// if allocation fails. + +__attribute__((no_sanitize("shift"))) +FUNCTION int j__udyLeafB1ToLeaf1( + Pjp_t Pjp, // points to LeafB1 to shrink. + Pvoid_t Pjpm) // for global accounting. +{ + Pjlb_t PjlbRaw; // bitmap in old leaf. + Pjlb_t Pjlb; + Pjll_t PjllRaw; // new Leaf1. + uint8_t * Pleaf1; // Leaf1 pointer type. + Word_t Digit; // in LeafB1 bitmap. 
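+
+// Example of the shrink: a LeafB1 holding indexes {3, 5, 250} becomes a Leaf1
+// whose 1-byte index array is 0x03 0x05 0xFA, emitted in ascending order by
+// the bitmap scan below.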
+#ifdef JUDYL + Pjv_t PjvNew; // value area in new Leaf1. + Word_t Pop1; + Word_t SubExp; +#endif + + assert(JU_JPTYPE(Pjp) == cJU_JPLEAF_B1); + assert(((JU_JPDCDPOP0(Pjp) & 0xFF) + 1) == cJU_LEAF1_MAXPOP1); + +// Allocate JPLEAF1 and prepare pointers: + + if ((PjllRaw = j__udyAllocJLL1(cJU_LEAF1_MAXPOP1, Pjpm)) == 0) + return(-1); + + Pleaf1 = (uint8_t *) P_JLL(PjllRaw); + PjlbRaw = (Pjlb_t) (Pjp->jp_Addr); + Pjlb = P_JLB(PjlbRaw); + JUDYLCODE(PjvNew = JL_LEAF1VALUEAREA(Pleaf1, cJL_LEAF1_MAXPOP1);) + +// Copy 1-byte indexes from old LeafB1 to new Leaf1: + + for (Digit = 0; Digit < cJU_BRANCHUNUMJPS; ++Digit) + if (JU_BITMAPTESTL(Pjlb, Digit)) + *Pleaf1++ = Digit; + +#ifdef JUDYL + +// Copy all old-LeafB1 value areas from value subarrays to new Leaf1: + + for (SubExp = 0; SubExp < cJU_NUMSUBEXPL; ++SubExp) + { + Pjv_t PjvRaw = JL_JLB_PVALUE(Pjlb, SubExp); + Pjv_t Pjv = P_JV(PjvRaw); + + if (Pjv == (Pjv_t) NULL) continue; // skip empty subarray. + + Pop1 = j__udyCountBitsL(JU_JLB_BITMAP(Pjlb, SubExp)); // subarray. + assert(Pop1); + + JU_COPYMEM(PjvNew, Pjv, Pop1); // copy value areas. + j__udyLFreeJV(PjvRaw, Pop1, Pjpm); + PjvNew += Pop1; // advance through new. + } + + assert((((Word_t) Pleaf1) - (Word_t) P_JLL(PjllRaw)) + == (PjvNew - JL_LEAF1VALUEAREA(P_JLL(PjllRaw), cJL_LEAF1_MAXPOP1))); +#endif // JUDYL + + DBGCODE(JudyCheckSorted((Pjll_t) P_JLL(PjllRaw), + (((Word_t) Pleaf1) - (Word_t) P_JLL(PjllRaw)), 1);) + +// Finish up: Free the old LeafB1 and plug the new Leaf1 into the JP: +// +// Note: jp_DcdPopO does not change here. + + j__udyFreeJLB1(PjlbRaw, Pjpm); + + Pjp->jp_Addr = (Word_t) PjllRaw; + Pjp->jp_Type = cJU_JPLEAF1; + + return(1); + +} // j__udyLeafB1ToLeaf1() + +#endif // (JUDYL || (! JU_64BIT)) + + +// **************************************************************************** +// __ J U D Y L E A F 1 T O L E A F 2 +// +// Copy 1-byte Indexes from a LeafB1 or Leaf1 to 2-byte Indexes in a Leaf2. +// Pjp MUST be one of: cJU_JPLEAF_B1, cJU_JPLEAF1, or cJU_JPIMMED_1_*. +// Return number of Indexes copied. +// +// TBD: In this and all following functions, the caller should already be able +// to compute the Pop1 return value, so why return it? + +__attribute__((no_sanitize("shift"))) +FUNCTION Word_t j__udyLeaf1ToLeaf2( + uint16_t * PLeaf2, // destination uint16_t * Index portion of leaf. +#ifdef JUDYL + Pjv_t Pjv2, // destination value part of leaf. +#endif + Pjp_t Pjp, // 1-byte-index object from which to copy. + Word_t MSByte, // most-significant byte, prefix to each Index. + Pvoid_t Pjpm) // for global accounting. +{ + Word_t Pop1; // Indexes in leaf. + Word_t Offset; // in linear leaf list. +JUDYLCODE(Pjv_t Pjv1Raw;) // source object value area. +JUDYLCODE(Pjv_t Pjv1;) + + switch (JU_JPTYPE(Pjp)) + { + + +// JPLEAF_B1: + + case cJU_JPLEAF_B1: + { + Pjlb_t Pjlb = P_JLB(Pjp->jp_Addr); + Word_t Digit; // in LeafB1 bitmap. + JUDYLCODE(Word_t SubExp;) // in LeafB1. + + Pop1 = JU_JPBRANCH_POP0(Pjp, 1) + 1; assert(Pop1); + +// Copy 1-byte indexes from old LeafB1 to new Leaf2, including splicing in +// the missing MSByte needed in the Leaf2: + + for (Digit = 0; Digit < cJU_BRANCHUNUMJPS; ++Digit) + if (JU_BITMAPTESTL(Pjlb, Digit)) + *PLeaf2++ = MSByte | Digit; + +#ifdef JUDYL + +// Copy all old-LeafB1 value areas from value subarrays to new Leaf2: + + for (SubExp = 0; SubExp < cJU_NUMSUBEXPL; ++SubExp) + { + Word_t SubExpPop1; + + Pjv1Raw = JL_JLB_PVALUE(Pjlb, SubExp); + if (Pjv1Raw == (Pjv_t) NULL) continue; // skip empty. 
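+// (Each non-empty value subarray is appended, in subexpanse order, onto the
+// single contiguous value area of the new Leaf2.)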
+ Pjv1 = P_JV(Pjv1Raw); + + SubExpPop1 = j__udyCountBitsL(JU_JLB_BITMAP(Pjlb, SubExp)); + assert(SubExpPop1); + + JU_COPYMEM(Pjv2, Pjv1, SubExpPop1); // copy value areas. + j__udyLFreeJV(Pjv1Raw, SubExpPop1, Pjpm); + Pjv2 += SubExpPop1; // advance through new. + } +#endif // JUDYL + + j__udyFreeJLB1((Pjlb_t) (Pjp->jp_Addr), Pjpm); // LeafB1 itself. + return(Pop1); + + } // case cJU_JPLEAF_B1 + + +#if (defined(JUDYL) || (! defined(JU_64BIT))) + +// JPLEAF1: + + case cJU_JPLEAF1: + { + uint8_t * PLeaf1 = (uint8_t *) P_JLL(Pjp->jp_Addr); + + Pop1 = JU_JPBRANCH_POP0(Pjp, 1) + 1; assert(Pop1); + JUDYLCODE(Pjv1 = JL_LEAF1VALUEAREA(PLeaf1, Pop1);) + +// Copy all Index bytes including splicing in missing MSByte needed in Leaf2 +// (plus, for JudyL, value areas): + + for (Offset = 0; Offset < Pop1; ++Offset) + { + PLeaf2[Offset] = MSByte | PLeaf1[Offset]; + JUDYLCODE(Pjv2[Offset] = Pjv1[Offset];) + } + j__udyFreeJLL1((Pjll_t) (Pjp->jp_Addr), Pop1, Pjpm); + return(Pop1); + } +#endif // (JUDYL || (! JU_64BIT)) + + +// JPIMMED_1_01: +// +// Note: jp_DcdPopO has 3 [7] bytes of Index (all but most significant byte), +// so the assignment to PLeaf2[] truncates and MSByte is not needed. + + case cJU_JPIMMED_1_01: + { + PLeaf2[0] = JU_JPDCDPOP0(Pjp); // see above. + JUDYLCODE(Pjv2[0] = Pjp->jp_Addr;) + return(1); + } + + +// JPIMMED_1_0[2+]: + + case cJU_JPIMMED_1_02: + case cJU_JPIMMED_1_03: +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_1_04: + case cJU_JPIMMED_1_05: + case cJU_JPIMMED_1_06: + case cJU_JPIMMED_1_07: +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_1_08: + case cJ1_JPIMMED_1_09: + case cJ1_JPIMMED_1_10: + case cJ1_JPIMMED_1_11: + case cJ1_JPIMMED_1_12: + case cJ1_JPIMMED_1_13: + case cJ1_JPIMMED_1_14: + case cJ1_JPIMMED_1_15: +#endif + { + Pop1 = JU_JPTYPE(Pjp) - cJU_JPIMMED_1_02 + 2; assert(Pop1); + JUDYLCODE(Pjv1Raw = (Pjv_t) (Pjp->jp_Addr);) + JUDYLCODE(Pjv1 = P_JV(Pjv1Raw);) + + for (Offset = 0; Offset < Pop1; ++Offset) + { +#ifdef JUDY1 + PLeaf2[Offset] = MSByte | Pjp->jp_1Index[Offset]; +#else + PLeaf2[Offset] = MSByte | Pjp->jp_LIndex[Offset]; + Pjv2 [Offset] = Pjv1[Offset]; +#endif + } + JUDYLCODE(j__udyLFreeJV(Pjv1Raw, Pop1, Pjpm);) + return(Pop1); + } + + +// UNEXPECTED CASES, including JPNULL1, should be handled by caller: + + default: assert(FALSE); break; + + } // switch + + return(0); + +} // j__udyLeaf1ToLeaf2() + + +// ***************************************************************************** +// __ J U D Y L E A F 2 T O L E A F 3 +// +// Copy 2-byte Indexes from a Leaf2 to 3-byte Indexes in a Leaf3. +// Pjp MUST be one of: cJU_JPLEAF2 or cJU_JPIMMED_2_*. +// Return number of Indexes copied. +// +// Note: By the time this function is called to compress a level-3 branch to a +// Leaf3, the branch has no narrow pointers under it, meaning only level-2 +// objects are below it and must be handled here. + +FUNCTION Word_t j__udyLeaf2ToLeaf3( + uint8_t * PLeaf3, // destination "uint24_t *" Index part of leaf. +#ifdef JUDYL + Pjv_t Pjv3, // destination value part of leaf. +#endif + Pjp_t Pjp, // 2-byte-index object from which to copy. + Word_t MSByte, // most-significant byte, prefix to each Index. + Pvoid_t Pjpm) // for global accounting. +{ + Word_t Pop1; // Indexes in leaf. +#if (defined(JUDYL) && defined(JU_64BIT)) + Pjv_t Pjv2Raw; // source object value area. 
+#endif +JUDYLCODE(Pjv_t Pjv2;) + + switch (JU_JPTYPE(Pjp)) + { + + +// JPLEAF2: + + case cJU_JPLEAF2: + { + uint16_t * PLeaf2 = (uint16_t *) P_JLL(Pjp->jp_Addr); + + Pop1 = JU_JPLEAF_POP0(Pjp) + 1; assert(Pop1); + j__udyCopy2to3(PLeaf3, PLeaf2, Pop1, MSByte); +#ifdef JUDYL + Pjv2 = JL_LEAF2VALUEAREA(PLeaf2, Pop1); + JU_COPYMEM(Pjv3, Pjv2, Pop1); +#endif + j__udyFreeJLL2((Pjll_t) (Pjp->jp_Addr), Pop1, Pjpm); + return(Pop1); + } + + +// JPIMMED_2_01: +// +// Note: jp_DcdPopO has 3 [7] bytes of Index (all but most significant byte), +// so the "assignment" to PLeaf3[] is exact [truncates] and MSByte is not +// needed. + + case cJU_JPIMMED_2_01: + { + JU_COPY3_LONG_TO_PINDEX(PLeaf3, JU_JPDCDPOP0(Pjp)); // see above. + JUDYLCODE(Pjv3[0] = Pjp->jp_Addr;) + return(1); + } + + +// JPIMMED_2_0[2+]: + +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_2_02: + case cJU_JPIMMED_2_03: +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_2_04: + case cJ1_JPIMMED_2_05: + case cJ1_JPIMMED_2_06: + case cJ1_JPIMMED_2_07: +#endif +#if (defined(JUDY1) || defined(JU_64BIT)) + { + JUDY1CODE(uint16_t * PLeaf2 = (uint16_t *) (Pjp->jp_1Index);) + JUDYLCODE(uint16_t * PLeaf2 = (uint16_t *) (Pjp->jp_LIndex);) + + Pop1 = JU_JPTYPE(Pjp) - cJU_JPIMMED_2_02 + 2; assert(Pop1); + j__udyCopy2to3(PLeaf3, PLeaf2, Pop1, MSByte); +#ifdef JUDYL + Pjv2Raw = (Pjv_t) (Pjp->jp_Addr); + Pjv2 = P_JV(Pjv2Raw); + JU_COPYMEM(Pjv3, Pjv2, Pop1); + j__udyLFreeJV(Pjv2Raw, Pop1, Pjpm); +#endif + return(Pop1); + } +#endif // (JUDY1 || JU_64BIT) + + +// UNEXPECTED CASES, including JPNULL2, should be handled by caller: + + default: assert(FALSE); break; + + } // switch + + return(0); + +} // j__udyLeaf2ToLeaf3() + + +#ifdef JU_64BIT + +// **************************************************************************** +// __ J U D Y L E A F 3 T O L E A F 4 +// +// Copy 3-byte Indexes from a Leaf3 to 4-byte Indexes in a Leaf4. +// Pjp MUST be one of: cJU_JPLEAF3 or cJU_JPIMMED_3_*. +// Return number of Indexes copied. +// +// Note: By the time this function is called to compress a level-4 branch to a +// Leaf4, the branch has no narrow pointers under it, meaning only level-3 +// objects are below it and must be handled here. + +FUNCTION Word_t j__udyLeaf3ToLeaf4( + uint32_t * PLeaf4, // destination uint32_t * Index part of leaf. +#ifdef JUDYL + Pjv_t Pjv4, // destination value part of leaf. +#endif + Pjp_t Pjp, // 3-byte-index object from which to copy. + Word_t MSByte, // most-significant byte, prefix to each Index. + Pvoid_t Pjpm) // for global accounting. +{ + Word_t Pop1; // Indexes in leaf. +JUDYLCODE(Pjv_t Pjv3Raw;) // source object value area. +JUDYLCODE(Pjv_t Pjv3;) + + switch (JU_JPTYPE(Pjp)) + { + + +// JPLEAF3: + + case cJU_JPLEAF3: + { + uint8_t * PLeaf3 = (uint8_t *) P_JLL(Pjp->jp_Addr); + + Pop1 = JU_JPLEAF_POP0(Pjp) + 1; assert(Pop1); + j__udyCopy3to4(PLeaf4, (uint8_t *) PLeaf3, Pop1, MSByte); +#ifdef JUDYL + Pjv3 = JL_LEAF3VALUEAREA(PLeaf3, Pop1); + JU_COPYMEM(Pjv4, Pjv3, Pop1); +#endif + j__udyFreeJLL3((Pjll_t) (Pjp->jp_Addr), Pop1, Pjpm); + return(Pop1); + } + + +// JPIMMED_3_01: +// +// Note: jp_DcdPopO has 7 bytes of Index (all but most significant byte), so +// the assignment to PLeaf4[] truncates and MSByte is not needed. + + case cJU_JPIMMED_3_01: + { + PLeaf4[0] = JU_JPDCDPOP0(Pjp); // see above. 
+ JUDYLCODE(Pjv4[0] = Pjp->jp_Addr;) + return(1); + } + + +// JPIMMED_3_0[2+]: + + case cJU_JPIMMED_3_02: +#ifdef JUDY1 + case cJ1_JPIMMED_3_03: + case cJ1_JPIMMED_3_04: + case cJ1_JPIMMED_3_05: +#endif + { + JUDY1CODE(uint8_t * PLeaf3 = (uint8_t *) (Pjp->jp_1Index);) + JUDYLCODE(uint8_t * PLeaf3 = (uint8_t *) (Pjp->jp_LIndex);) + + JUDY1CODE(Pop1 = JU_JPTYPE(Pjp) - cJU_JPIMMED_3_02 + 2;) + JUDYLCODE(Pop1 = 2;) + + j__udyCopy3to4(PLeaf4, PLeaf3, Pop1, MSByte); +#ifdef JUDYL + Pjv3Raw = (Pjv_t) (Pjp->jp_Addr); + Pjv3 = P_JV(Pjv3Raw); + JU_COPYMEM(Pjv4, Pjv3, Pop1); + j__udyLFreeJV(Pjv3Raw, Pop1, Pjpm); +#endif + return(Pop1); + } + + +// UNEXPECTED CASES, including JPNULL3, should be handled by caller: + + default: assert(FALSE); break; + + } // switch + + return(0); + +} // j__udyLeaf3ToLeaf4() + + +// Note: In all following j__udyLeaf*ToLeaf*() functions, JPIMMED_*_0[2+] +// cases exist for Judy1 (&& 64-bit) only. JudyL has no equivalent Immeds. + + +// ***************************************************************************** +// __ J U D Y L E A F 4 T O L E A F 5 +// +// Copy 4-byte Indexes from a Leaf4 to 5-byte Indexes in a Leaf5. +// Pjp MUST be one of: cJU_JPLEAF4 or cJU_JPIMMED_4_*. +// Return number of Indexes copied. +// +// Note: By the time this function is called to compress a level-5 branch to a +// Leaf5, the branch has no narrow pointers under it, meaning only level-4 +// objects are below it and must be handled here. + +FUNCTION Word_t j__udyLeaf4ToLeaf5( + uint8_t * PLeaf5, // destination "uint40_t *" Index part of leaf. +#ifdef JUDYL + Pjv_t Pjv5, // destination value part of leaf. +#endif + Pjp_t Pjp, // 4-byte-index object from which to copy. + Word_t MSByte, // most-significant byte, prefix to each Index. + Pvoid_t Pjpm) // for global accounting. +{ + Word_t Pop1; // Indexes in leaf. +JUDYLCODE(Pjv_t Pjv4;) // source object value area. + + switch (JU_JPTYPE(Pjp)) + { + + +// JPLEAF4: + + case cJU_JPLEAF4: + { + uint32_t * PLeaf4 = (uint32_t *) P_JLL(Pjp->jp_Addr); + + Pop1 = JU_JPLEAF_POP0(Pjp) + 1; assert(Pop1); + j__udyCopy4to5(PLeaf5, PLeaf4, Pop1, MSByte); +#ifdef JUDYL + Pjv4 = JL_LEAF4VALUEAREA(PLeaf4, Pop1); + JU_COPYMEM(Pjv5, Pjv4, Pop1); +#endif + j__udyFreeJLL4((Pjll_t) (Pjp->jp_Addr), Pop1, Pjpm); + return(Pop1); + } + + +// JPIMMED_4_01: +// +// Note: jp_DcdPopO has 7 bytes of Index (all but most significant byte), so +// the assignment to PLeaf5[] truncates and MSByte is not needed. + + case cJU_JPIMMED_4_01: + { + JU_COPY5_LONG_TO_PINDEX(PLeaf5, JU_JPDCDPOP0(Pjp)); // see above. + JUDYLCODE(Pjv5[0] = Pjp->jp_Addr;) + return(1); + } + + +#ifdef JUDY1 + +// JPIMMED_4_0[4+]: + + case cJ1_JPIMMED_4_02: + case cJ1_JPIMMED_4_03: + { + uint32_t * PLeaf4 = (uint32_t *) (Pjp->jp_1Index); + + Pop1 = JU_JPTYPE(Pjp) - cJ1_JPIMMED_4_02 + 2; + j__udyCopy4to5(PLeaf5, PLeaf4, Pop1, MSByte); + return(Pop1); + } +#endif // JUDY1 + + +// UNEXPECTED CASES, including JPNULL4, should be handled by caller: + + default: assert(FALSE); break; + + } // switch + + return(0); + +} // j__udyLeaf4ToLeaf5() + + +// **************************************************************************** +// __ J U D Y L E A F 5 T O L E A F 6 +// +// Copy 5-byte Indexes from a Leaf5 to 6-byte Indexes in a Leaf6. +// Pjp MUST be one of: cJU_JPLEAF5 or cJU_JPIMMED_5_*. +// Return number of Indexes copied. 
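+//
+// Example of the widening (hypothetical MSByte value): with
+// MSByte == 0xAB0000000000, the 5-byte index 0x0102030405 is emitted as the
+// 6-byte index 0xAB0102030405.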
+// +// Note: By the time this function is called to compress a level-6 branch to a +// Leaf6, the branch has no narrow pointers under it, meaning only level-5 +// objects are below it and must be handled here. + +FUNCTION Word_t j__udyLeaf5ToLeaf6( + uint8_t * PLeaf6, // destination uint8_t * Index part of leaf. +#ifdef JUDYL + Pjv_t Pjv6, // destination value part of leaf. +#endif + Pjp_t Pjp, // 5-byte-index object from which to copy. + Word_t MSByte, // most-significant byte, prefix to each Index. + Pvoid_t Pjpm) // for global accounting. +{ + Word_t Pop1; // Indexes in leaf. +JUDYLCODE(Pjv_t Pjv5;) // source object value area. + + switch (JU_JPTYPE(Pjp)) + { + + +// JPLEAF5: + + case cJU_JPLEAF5: + { + uint8_t * PLeaf5 = (uint8_t *) P_JLL(Pjp->jp_Addr); + + Pop1 = JU_JPLEAF_POP0(Pjp) + 1; assert(Pop1); + j__udyCopy5to6(PLeaf6, PLeaf5, Pop1, MSByte); +#ifdef JUDYL + Pjv5 = JL_LEAF5VALUEAREA(PLeaf5, Pop1); + JU_COPYMEM(Pjv6, Pjv5, Pop1); +#endif + j__udyFreeJLL5((Pjll_t) (Pjp->jp_Addr), Pop1, Pjpm); + return(Pop1); + } + + +// JPIMMED_5_01: +// +// Note: jp_DcdPopO has 7 bytes of Index (all but most significant byte), so +// the assignment to PLeaf6[] truncates and MSByte is not needed. + + case cJU_JPIMMED_5_01: + { + JU_COPY6_LONG_TO_PINDEX(PLeaf6, JU_JPDCDPOP0(Pjp)); // see above. + JUDYLCODE(Pjv6[0] = Pjp->jp_Addr;) + return(1); + } + + +#ifdef JUDY1 + +// JPIMMED_5_0[2+]: + + case cJ1_JPIMMED_5_02: + case cJ1_JPIMMED_5_03: + { + uint8_t * PLeaf5 = (uint8_t *) (Pjp->jp_1Index); + + Pop1 = JU_JPTYPE(Pjp) - cJ1_JPIMMED_5_02 + 2; + j__udyCopy5to6(PLeaf6, PLeaf5, Pop1, MSByte); + return(Pop1); + } +#endif // JUDY1 + + +// UNEXPECTED CASES, including JPNULL5, should be handled by caller: + + default: assert(FALSE); break; + + } // switch + + return(0); + +} // j__udyLeaf5ToLeaf6() + + +// ***************************************************************************** +// __ J U D Y L E A F 6 T O L E A F 7 +// +// Copy 6-byte Indexes from a Leaf2 to 7-byte Indexes in a Leaf7. +// Pjp MUST be one of: cJU_JPLEAF6 or cJU_JPIMMED_6_*. +// Return number of Indexes copied. +// +// Note: By the time this function is called to compress a level-7 branch to a +// Leaf7, the branch has no narrow pointers under it, meaning only level-6 +// objects are below it and must be handled here. + +FUNCTION Word_t j__udyLeaf6ToLeaf7( + uint8_t * PLeaf7, // destination "uint24_t *" Index part of leaf. +#ifdef JUDYL + Pjv_t Pjv7, // destination value part of leaf. +#endif + Pjp_t Pjp, // 6-byte-index object from which to copy. + Word_t MSByte, // most-significant byte, prefix to each Index. + Pvoid_t Pjpm) // for global accounting. +{ + Word_t Pop1; // Indexes in leaf. +JUDYLCODE(Pjv_t Pjv6;) // source object value area. + + switch (JU_JPTYPE(Pjp)) + { + + +// JPLEAF6: + + case cJU_JPLEAF6: + { + uint8_t * PLeaf6 = (uint8_t *) P_JLL(Pjp->jp_Addr); + + Pop1 = JU_JPLEAF_POP0(Pjp) + 1; + j__udyCopy6to7(PLeaf7, PLeaf6, Pop1, MSByte); +#ifdef JUDYL + Pjv6 = JL_LEAF6VALUEAREA(PLeaf6, Pop1); + JU_COPYMEM(Pjv7, Pjv6, Pop1); +#endif + j__udyFreeJLL6((Pjll_t) (Pjp->jp_Addr), Pop1, Pjpm); + return(Pop1); + } + + +// JPIMMED_6_01: +// +// Note: jp_DcdPopO has 7 bytes of Index (all but most significant byte), so +// the "assignment" to PLeaf7[] is exact and MSByte is not needed. + + case cJU_JPIMMED_6_01: + { + JU_COPY7_LONG_TO_PINDEX(PLeaf7, JU_JPDCDPOP0(Pjp)); // see above. 
+ JUDYLCODE(Pjv7[0] = Pjp->jp_Addr;) + return(1); + } + + +#ifdef JUDY1 + +// JPIMMED_6_02: + + case cJ1_JPIMMED_6_02: + { + uint8_t * PLeaf6 = (uint8_t *) (Pjp->jp_1Index); + + j__udyCopy6to7(PLeaf7, PLeaf6, /* Pop1 = */ 2, MSByte); + return(2); + } +#endif // JUDY1 + + +// UNEXPECTED CASES, including JPNULL6, should be handled by caller: + + default: assert(FALSE); break; + + } // switch + + return(0); + +} // j__udyLeaf6ToLeaf7() + +#endif // JU_64BIT + + +#ifndef JU_64BIT // 32-bit version first + +// **************************************************************************** +// __ J U D Y L E A F 3 T O L E A F W +// +// Copy 3-byte Indexes from a Leaf3 to 4-byte Indexes in a LeafW. Pjp MUST be +// one of: cJU_JPLEAF3 or cJU_JPIMMED_3_*. Return number of Indexes copied. +// +// Note: By the time this function is called to compress a level-L branch to a +// LeafW, the branch has no narrow pointers under it, meaning only level-3 +// objects are below it and must be handled here. + +FUNCTION Word_t j__udyLeaf3ToLeafW( + Pjlw_t Pjlw, // destination Index part of leaf. +#ifdef JUDYL + Pjv_t PjvW, // destination value part of leaf. +#endif + Pjp_t Pjp, // 3-byte-index object from which to copy. + Word_t MSByte, // most-significant byte, prefix to each Index. + Pvoid_t Pjpm) // for global accounting. +{ + Word_t Pop1; // Indexes in leaf. +JUDYLCODE(Pjv_t Pjv3;) // source object value area. + + switch (JU_JPTYPE(Pjp)) + { + + +// JPLEAF3: + + case cJU_JPLEAF3: + { + uint8_t * PLeaf3 = (uint8_t *) P_JLL(Pjp->jp_Addr); + + Pop1 = JU_JPLEAF_POP0(Pjp) + 1; + j__udyCopy3toW((PWord_t) Pjlw, PLeaf3, Pop1, MSByte); +#ifdef JUDYL + Pjv3 = JL_LEAF3VALUEAREA(PLeaf3, Pop1); + JU_COPYMEM(PjvW, Pjv3, Pop1); +#endif + j__udyFreeJLL3((Pjll_t) (Pjp->jp_Addr), Pop1, Pjpm); + return(Pop1); + } + + +// JPIMMED_3_01: +// +// Note: jp_DcdPopO has 3 bytes of Index (all but most significant byte), and +// MSByte must be ord in. + + case cJU_JPIMMED_3_01: + { + Pjlw[0] = MSByte | JU_JPDCDPOP0(Pjp); // see above. + JUDYLCODE(PjvW[0] = Pjp->jp_Addr;) + return(1); + } + + +#ifdef JUDY1 + +// JPIMMED_3_02: + + case cJU_JPIMMED_3_02: + { + uint8_t * PLeaf3 = (uint8_t *) (Pjp->jp_1Index); + + j__udyCopy3toW((PWord_t) Pjlw, PLeaf3, /* Pop1 = */ 2, MSByte); + return(2); + } +#endif // JUDY1 + + +// UNEXPECTED CASES, including JPNULL3, should be handled by caller: + + default: assert(FALSE); break; + + } // switch + + return(0); + +} // j__udyLeaf3ToLeafW() + + +#else // JU_64BIT + + +// **************************************************************************** +// __ J U D Y L E A F 7 T O L E A F W +// +// Copy 7-byte Indexes from a Leaf7 to 8-byte Indexes in a LeafW. +// Pjp MUST be one of: cJU_JPLEAF7 or cJU_JPIMMED_7_*. +// Return number of Indexes copied. +// +// Note: By the time this function is called to compress a level-L branch to a +// LeafW, the branch has no narrow pointers under it, meaning only level-7 +// objects are below it and must be handled here. + +FUNCTION Word_t j__udyLeaf7ToLeafW( + Pjlw_t Pjlw, // destination Index part of leaf. +#ifdef JUDYL + Pjv_t PjvW, // destination value part of leaf. +#endif + Pjp_t Pjp, // 7-byte-index object from which to copy. + Word_t MSByte, // most-significant byte, prefix to each Index. + Pvoid_t Pjpm) // for global accounting. +{ + Word_t Pop1; // Indexes in leaf. +JUDYLCODE(Pjv_t Pjv7;) // source object value area. 
+ + switch (JU_JPTYPE(Pjp)) + { + + +// JPLEAF7: + + case cJU_JPLEAF7: + { + uint8_t * PLeaf7 = (uint8_t *) P_JLL(Pjp->jp_Addr); + + Pop1 = JU_JPLEAF_POP0(Pjp) + 1; + j__udyCopy7toW((PWord_t) Pjlw, PLeaf7, Pop1, MSByte); +#ifdef JUDYL + Pjv7 = JL_LEAF7VALUEAREA(PLeaf7, Pop1); + JU_COPYMEM(PjvW, Pjv7, Pop1); +#endif + j__udyFreeJLL7((Pjll_t) (Pjp->jp_Addr), Pop1, Pjpm); + return(Pop1); + } + + +// JPIMMED_7_01: +// +// Note: jp_DcdPopO has 7 bytes of Index (all but most significant byte), and +// MSByte must be ord in. + + case cJU_JPIMMED_7_01: + { + Pjlw[0] = MSByte | JU_JPDCDPOP0(Pjp); // see above. + JUDYLCODE(PjvW[0] = Pjp->jp_Addr;) + return(1); + } + + +#ifdef JUDY1 + +// JPIMMED_7_02: + + case cJ1_JPIMMED_7_02: + { + uint8_t * PLeaf7 = (uint8_t *) (Pjp->jp_1Index); + + j__udyCopy7toW((PWord_t) Pjlw, PLeaf7, /* Pop1 = */ 2, MSByte); + return(2); + } +#endif + + +// UNEXPECTED CASES, including JPNULL7, should be handled by caller: + + default: assert(FALSE); break; + + } // switch + + return(0); + +} // j__udyLeaf7ToLeafW() + +#endif // JU_64BIT diff --git a/src/libnetdata/libjudy/src/JudyL/JudyLDel.c b/src/libnetdata/libjudy/src/JudyL/JudyLDel.c new file mode 100644 index 00000000..7c3d9108 --- /dev/null +++ b/src/libnetdata/libjudy/src/JudyL/JudyLDel.c @@ -0,0 +1,2147 @@ +// Copyright (C) 2000 - 2002 Hewlett-Packard Company +// +// This program is free software; you can redistribute it and/or modify it +// under the term of the GNU Lesser General Public License as published by the +// Free Software Foundation; either version 2 of the License, or (at your +// option) any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License +// for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this program; if not, write to the Free Software Foundation, +// Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// _________________ + +// @(#) $Revision: 4.68 $ $Source: /judy/src/JudyCommon/JudyDel.c $ +// +// Judy1Unset() and JudyLDel() functions for Judy1 and JudyL. +// Compile with one of -DJUDY1 or -DJUDYL. +// +// About HYSTERESIS: In the Judy code, hysteresis means leaving around a +// nominally suboptimal (not maximally compressed) data structure after a +// deletion. As a result, the shape of the tree for two identical index sets +// can differ depending on the insert/delete path taken to arrive at the index +// sets. The purpose is to minimize worst-case behavior (thrashing) that could +// result from a series of intermixed insertions and deletions. It also makes +// for MUCH simpler code, because instead of performing, "delete and then +// compress," it can say, "compress and then delete," where due to hysteresis, +// compression is not even attempted until the object IS compressible. +// +// In some cases the code has no choice and it must "ungrow" a data structure +// across a "phase transition" boundary without hysteresis. In other cases the +// amount (such as "hysteresis = 1") is indicated by the number of JP deletions +// (in branches) or index deletions (in leaves) that can occur in succession +// before compressing the data structure. (It appears that hysteresis <= 1 in +// all cases.) 
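+//
+// Note, a concrete illustration of "hysteresis = 1": a level-2 branch is only
+// compressed to a Leaf2 when its CURRENT population already fits the leaf,
+// that is, pop1 == cJU_LEAF2_MAXPOP1 BEFORE the delete (see JU_BRANCH_KEEP()
+// below); immediately after the delete the new leaf holds one index fewer
+// than its maximum, and that one-index slack is the hysteresis.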
+// +// In general no hysteresis occurs when the data structure type remains the +// same but the allocated memory chunk for the node must shrink, because the +// relationship is hardwired and theres no way to know how much memory is +// allocated to a given data structure. Hysteresis = 0 in all these cases. +// +// TBD: Could this code be faster if memory chunk hysteresis were supported +// somehow along with data structure type hysteresis? +// +// TBD: Should some of the assertions here be converted to product code that +// returns JU_ERRNO_CORRUPT? +// +// TBD: Dougs code had an odd mix of function-wide and limited-scope +// variables. Should some of the function-wide variables appear only in +// limited scopes, or more likely, vice-versa? + +#if (! (defined(JUDY1) || defined(JUDYL))) +#error: One of -DJUDY1 or -DJUDYL must be specified. +#endif + +#ifdef JUDY1 +#include "Judy1.h" +#else +#include "JudyL.h" +#endif + +#include "JudyPrivate1L.h" + +DBGCODE(extern void JudyCheckPop(Pvoid_t PArray);) +DBGCODE(extern void JudyCheckSorted(Pjll_t Pjll, Word_t Pop1, long IndexSize);) + +#ifdef TRACEJP +#include "JudyPrintJP.c" +#endif + +// These are defined to generic values in JudyCommon/JudyPrivateTypes.h: +// +// TBD: These should be exported from a header file, but perhaps not, as they +// are only used here, and exported from JudyDecascade.c, which is a separate +// file for profiling reasons (to prevent inlining), but which potentially +// could be merged with this file, either in SoftCM or at compile-time: + +#ifdef JUDY1 + +extern int j__udy1BranchBToBranchL(Pjp_t Pjp, Pvoid_t Pjpm); +#ifndef JU_64BIT +extern int j__udy1LeafB1ToLeaf1(Pjp_t, Pvoid_t); +#endif +extern Word_t j__udy1Leaf1ToLeaf2(uint16_t *, Pjp_t, Word_t, Pvoid_t); +extern Word_t j__udy1Leaf2ToLeaf3(uint8_t *, Pjp_t, Word_t, Pvoid_t); +#ifndef JU_64BIT +extern Word_t j__udy1Leaf3ToLeafW(Pjlw_t, Pjp_t, Word_t, Pvoid_t); +#else +extern Word_t j__udy1Leaf3ToLeaf4(uint32_t *, Pjp_t, Word_t, Pvoid_t); +extern Word_t j__udy1Leaf4ToLeaf5(uint8_t *, Pjp_t, Word_t, Pvoid_t); +extern Word_t j__udy1Leaf5ToLeaf6(uint8_t *, Pjp_t, Word_t, Pvoid_t); +extern Word_t j__udy1Leaf6ToLeaf7(uint8_t *, Pjp_t, Word_t, Pvoid_t); +extern Word_t j__udy1Leaf7ToLeafW(Pjlw_t, Pjp_t, Word_t, Pvoid_t); +#endif + +#else // JUDYL + +extern int j__udyLBranchBToBranchL(Pjp_t Pjp, Pvoid_t Pjpm); +extern int j__udyLLeafB1ToLeaf1(Pjp_t, Pvoid_t); +extern Word_t j__udyLLeaf1ToLeaf2(uint16_t *, Pjv_t, Pjp_t, Word_t, Pvoid_t); +extern Word_t j__udyLLeaf2ToLeaf3(uint8_t *, Pjv_t, Pjp_t, Word_t, Pvoid_t); +#ifndef JU_64BIT +extern Word_t j__udyLLeaf3ToLeafW(Pjlw_t, Pjv_t, Pjp_t, Word_t, Pvoid_t); +#else +extern Word_t j__udyLLeaf3ToLeaf4(uint32_t *, Pjv_t, Pjp_t, Word_t, Pvoid_t); +extern Word_t j__udyLLeaf4ToLeaf5(uint8_t *, Pjv_t, Pjp_t, Word_t, Pvoid_t); +extern Word_t j__udyLLeaf5ToLeaf6(uint8_t *, Pjv_t, Pjp_t, Word_t, Pvoid_t); +extern Word_t j__udyLLeaf6ToLeaf7(uint8_t *, Pjv_t, Pjp_t, Word_t, Pvoid_t); +extern Word_t j__udyLLeaf7ToLeafW(Pjlw_t, Pjv_t, Pjp_t, Word_t, Pvoid_t); +#endif + +#endif // JUDYL + +// For convenience in the calling code; "M1" means "minus one": + +#ifndef JU_64BIT +#define j__udyLeafM1ToLeafW j__udyLeaf3ToLeafW +#else +#define j__udyLeafM1ToLeafW j__udyLeaf7ToLeafW +#endif + + +// **************************************************************************** +// __ J U D Y D E L W A L K +// +// Given a pointer to a JP, an Index known to be valid, the number of bytes +// left to decode (== level in the tree), and a pointer to a global 
JPM, walk a +// Judy (sub)tree to do an unset/delete of that index, and possibly modify the +// JPM. This function is only called internally, and recursively. Unlike +// Judy1Test() and JudyLGet(), the extra time required for recursion should be +// negligible compared with the total. +// +// Return values: +// +// -1 error; details in JPM +// +// 0 Index already deleted (should never happen, Index is known to be valid) +// +// 1 previously valid Index deleted +// +// 2 same as 1, but in addition the JP now points to a BranchL containing a +// single JP, which should be compressed into the parent branch (if there +// is one, which is not the case for a top-level branch under a JPM) + +DBGCODE(uint8_t parentJPtype;) // parent branch JP type. + +__attribute__((no_sanitize("shift"))) +FUNCTION static int j__udyDelWalk( + Pjp_t Pjp, // current JP under which to delete. + Word_t Index, // to delete. + Word_t ParentLevel, // of parent branch. + Pjpm_t Pjpm) // for returning info to top level. +{ + Word_t pop1; // of a leaf. + Word_t level; // of a leaf. + uint8_t digit; // from Index, in current branch. + Pjll_t PjllnewRaw; // address of newly allocated leaf. + Pjll_t Pjllnew; + int offset; // within a branch. + int retcode; // return code: -1, 0, 1, 2. +JUDYLCODE(Pjv_t PjvRaw;) // value area. +JUDYLCODE(Pjv_t Pjv;) + + DBGCODE(level = 0;) + +ContinueDelWalk: // for modifying state without recursing. + +#ifdef TRACEJP + JudyPrintJP(Pjp, "d", __LINE__); +#endif + + switch (JU_JPTYPE(Pjp)) // entry: Pjp, Index. + { + + +// **************************************************************************** +// LINEAR BRANCH: +// +// MACROS FOR COMMON CODE: +// +// Check for population too high to compress a branch to a leaf, meaning just +// descend through the branch, with a purposeful off-by-one error that +// constitutes hysteresis = 1. In other words, do not compress until the +// branchs CURRENT population fits in the leaf, even BEFORE deleting one +// index. +// +// Next is a label for branch-type-specific common code. Variables pop1, +// level, digit, and Index are in the context. + +#define JU_BRANCH_KEEP(cLevel,MaxPop1,Next) \ + if (pop1 > (MaxPop1)) /* hysteresis = 1 */ \ + { \ + assert((cLevel) >= 2); \ + level = (cLevel); \ + digit = JU_DIGITATSTATE(Index, cLevel); \ + goto Next; \ + } + +// Support for generic calling of JudyLeaf*ToLeaf*() functions: +// +// Note: Cannot use JUDYLCODE() because this contains a comma. + +#ifdef JUDY1 +#define JU_PVALUEPASS // null. +#else +#define JU_PVALUEPASS Pjv, +#endif + +// During compression to a leaf, check if a JP contains nothing but a +// cJU_JPIMMED_*_01, in which case shortcut calling j__udyLeaf*ToLeaf*(): +// +// Copy the index bytes from the jp_DcdPopO field (with possible truncation), +// and continue the branch-JP-walk loop. Variables Pjp and Pleaf are in the +// context. 
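+//
+// (Worked illustration: for a cJU_JPIMMED_2_01 whose two undecoded bytes are
+// 0x1234, JU_JPDCDPOP0(Pjp) returns the stored word, and the assignment
+// through a uint16_t pointer in JU_BRANCH_COPY_IMMED_EVEN() below keeps
+// exactly 0x1234 -- this is the "possible truncation" mentioned above.)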
+ +#define JU_BRANCH_COPY_IMMED_EVEN(cLevel,Pjp,ignore) \ + if (JU_JPTYPE(Pjp) == cJU_JPIMMED_1_01 + (cLevel) - 2) \ + { \ + *Pleaf++ = JU_JPDCDPOP0(Pjp); \ + JUDYLCODE(*Pjv++ = (Pjp)->jp_Addr;) \ + continue; /* for-loop */ \ + } + +#define JU_BRANCH_COPY_IMMED_ODD(cLevel,Pjp,CopyIndex) \ + if (JU_JPTYPE(Pjp) == cJU_JPIMMED_1_01 + (cLevel) - 2) \ + { \ + CopyIndex(Pleaf, (Word_t) (JU_JPDCDPOP0(Pjp))); \ + Pleaf += (cLevel); /* index size = level */ \ + JUDYLCODE(*Pjv++ = (Pjp)->jp_Addr;) \ + continue; /* for-loop */ \ + } + +// Compress a BranchL into a leaf one index size larger: +// +// Allocate a new leaf, walk the JPs in the old BranchL and pack their contents +// into the new leaf (of type NewJPType), free the old BranchL, and finally +// restart the switch to delete Index from the new leaf. (Note that all +// BranchLs are the same size.) Variables Pjp, Pjpm, Pleaf, digit, and pop1 +// are in the context. + +#define JU_BRANCHL_COMPRESS(cLevel,LeafType,MaxPop1,NewJPType, \ + LeafToLeaf,Alloc,ValueArea, \ + CopyImmed,CopyIndex) \ + { \ + LeafType Pleaf; \ + Pjbl_t PjblRaw; \ + Pjbl_t Pjbl; \ + Word_t numJPs; \ + \ + if ((PjllnewRaw = Alloc(MaxPop1, Pjpm)) == 0) return(-1); \ + Pjllnew = P_JLL(PjllnewRaw); \ + Pleaf = (LeafType) Pjllnew; \ + JUDYLCODE(Pjv = ValueArea(Pleaf, MaxPop1);) \ + \ + PjblRaw = (Pjbl_t) (Pjp->jp_Addr); \ + Pjbl = P_JBL(PjblRaw); \ + numJPs = Pjbl->jbl_NumJPs; \ + \ + for (offset = 0; offset < numJPs; ++offset) \ + { \ + CopyImmed(cLevel, (Pjbl->jbl_jp) + offset, CopyIndex); \ + \ + pop1 = LeafToLeaf(Pleaf, JU_PVALUEPASS \ + (Pjbl->jbl_jp) + offset, \ + JU_DIGITTOSTATE(Pjbl->jbl_Expanse[offset], \ + cLevel), (Pvoid_t) Pjpm); \ + Pleaf = (LeafType) (((Word_t) Pleaf) + ((cLevel) * pop1)); \ + JUDYLCODE(Pjv += pop1;) \ + } \ + assert(((((Word_t) Pleaf) - ((Word_t) Pjllnew)) / (cLevel)) == (MaxPop1)); \ + JUDYLCODE(assert((Pjv - ValueArea(Pjllnew, MaxPop1)) == (MaxPop1));) \ + DBGCODE(JudyCheckSorted(Pjllnew, MaxPop1, cLevel);) \ + \ + j__udyFreeJBL(PjblRaw, Pjpm); \ + \ + Pjp->jp_Type = (NewJPType); \ + Pjp->jp_Addr = (Word_t) PjllnewRaw; \ + goto ContinueDelWalk; /* delete from new leaf */ \ + } + +// Overall common code for initial BranchL deletion handling: +// +// Assert that Index is in the branch, then see if the BranchL should be kept +// or else compressed to a leaf. Variables Index, Pjp, and pop1 are in the +// context. + +#define JU_BRANCHL(cLevel,MaxPop1,LeafType,NewJPType, \ + LeafToLeaf,Alloc,ValueArea,CopyImmed,CopyIndex) \ + \ + assert(! 
JU_DCDNOTMATCHINDEX(Index, Pjp, cLevel)); \ + assert(ParentLevel > (cLevel)); \ + \ + pop1 = JU_JPBRANCH_POP0(Pjp, cLevel) + 1; \ + JU_BRANCH_KEEP(cLevel, MaxPop1, BranchLKeep); \ + assert(pop1 == (MaxPop1)); \ + \ + JU_BRANCHL_COMPRESS(cLevel, LeafType, MaxPop1, NewJPType, \ + LeafToLeaf, Alloc, ValueArea, CopyImmed, CopyIndex) + + +// END OF MACROS, START OF CASES: + + case cJU_JPBRANCH_L2: + + JU_BRANCHL(2, cJU_LEAF2_MAXPOP1, uint16_t *, cJU_JPLEAF2, + j__udyLeaf1ToLeaf2, j__udyAllocJLL2, JL_LEAF2VALUEAREA, + JU_BRANCH_COPY_IMMED_EVEN, ignore); + + case cJU_JPBRANCH_L3: + + JU_BRANCHL(3, cJU_LEAF3_MAXPOP1, uint8_t *, cJU_JPLEAF3, + j__udyLeaf2ToLeaf3, j__udyAllocJLL3, JL_LEAF3VALUEAREA, + JU_BRANCH_COPY_IMMED_ODD, JU_COPY3_LONG_TO_PINDEX); + +#ifdef JU_64BIT + case cJU_JPBRANCH_L4: + + JU_BRANCHL(4, cJU_LEAF4_MAXPOP1, uint32_t *, cJU_JPLEAF4, + j__udyLeaf3ToLeaf4, j__udyAllocJLL4, JL_LEAF4VALUEAREA, + JU_BRANCH_COPY_IMMED_EVEN, ignore); + + case cJU_JPBRANCH_L5: + + JU_BRANCHL(5, cJU_LEAF5_MAXPOP1, uint8_t *, cJU_JPLEAF5, + j__udyLeaf4ToLeaf5, j__udyAllocJLL5, JL_LEAF5VALUEAREA, + JU_BRANCH_COPY_IMMED_ODD, JU_COPY5_LONG_TO_PINDEX); + + case cJU_JPBRANCH_L6: + + JU_BRANCHL(6, cJU_LEAF6_MAXPOP1, uint8_t *, cJU_JPLEAF6, + j__udyLeaf5ToLeaf6, j__udyAllocJLL6, JL_LEAF6VALUEAREA, + JU_BRANCH_COPY_IMMED_ODD, JU_COPY6_LONG_TO_PINDEX); + + case cJU_JPBRANCH_L7: + + JU_BRANCHL(7, cJU_LEAF7_MAXPOP1, uint8_t *, cJU_JPLEAF7, + j__udyLeaf6ToLeaf7, j__udyAllocJLL7, JL_LEAF7VALUEAREA, + JU_BRANCH_COPY_IMMED_ODD, JU_COPY7_LONG_TO_PINDEX); +#endif // JU_64BIT + +// A top-level BranchL is different and cannot use JU_BRANCHL(): Dont try to +// compress to a (LEAFW) leaf yet, but leave this for a later deletion +// (hysteresis > 0); and the next JP type depends on the system word size; so +// dont use JU_BRANCH_KEEP(): + + case cJU_JPBRANCH_L: + { + Pjbl_t Pjbl; + Word_t numJPs; + + level = cJU_ROOTSTATE; + digit = JU_DIGITATSTATE(Index, cJU_ROOTSTATE); + + // fall through: + + +// COMMON CODE FOR KEEPING AND DESCENDING THROUGH A BRANCHL: +// +// Come here with level and digit set. + +BranchLKeep: + Pjbl = P_JBL(Pjp->jp_Addr); + numJPs = Pjbl->jbl_NumJPs; + assert(numJPs > 0); + DBGCODE(parentJPtype = JU_JPTYPE(Pjp);) + +// Search for a match to the digit (valid Index => must find digit): + + for (offset = 0; (Pjbl->jbl_Expanse[offset]) != digit; ++offset) + assert(offset < numJPs - 1); + + Pjp = (Pjbl->jbl_jp) + offset; + +// If not at a (deletable) JPIMMED_*_01, continue the walk (to descend through +// the BranchL): + + assert(level >= 2); + if ((JU_JPTYPE(Pjp)) != cJU_JPIMMED_1_01 + level - 2) break; + +// At JPIMMED_*_01: Ensure the index is in the right expanse, then delete the +// Immed from the BranchL: +// +// Note: A BranchL has a fixed size and format regardless of numJPs. + + assert(JU_JPDCDPOP0(Pjp) == JU_TRIMTODCDSIZE(Index)); + + JU_DELETEINPLACE(Pjbl->jbl_Expanse, numJPs, offset, ignore); + JU_DELETEINPLACE(Pjbl->jbl_jp, numJPs, offset, ignore); + + DBGCODE(JudyCheckSorted((Pjll_t) (Pjbl->jbl_Expanse), + numJPs - 1, 1);) + +// If only one index left in the BranchL, indicate this to the caller: + + return ((--(Pjbl->jbl_NumJPs) <= 1) ? 2 : 1); + + } // case cJU_JPBRANCH_L. + + +// **************************************************************************** +// BITMAP BRANCH: +// +// MACROS FOR COMMON CODE: +// +// Note the reuse of common macros here, defined earlier: JU_BRANCH_KEEP(), +// JU_PVALUE*. 
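+//
+// (Structure reminder, for orientation: a BranchB holds cJU_NUMSUBEXPB
+// bitmaps, each covering cJU_BITSPERSUBEXPB possible digits, plus one packed
+// JP subarray per non-empty subexpanse; a digit's JP is located by counting
+// the set bits below its bit position, as in BranchBKeep further below.)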
+// +// Compress a BranchB into a leaf one index size larger: +// +// Allocate a new leaf, walk the JPs in the old BranchB (one bitmap subexpanse +// at a time) and pack their contents into the new leaf (of type NewJPType), +// free the old BranchB, and finally restart the switch to delete Index from +// the new leaf. Variables Pjp, Pjpm, Pleaf, digit, and pop1 are in the +// context. +// +// Note: Its no accident that the interface to JU_BRANCHB_COMPRESS() is +// identical to JU_BRANCHL_COMPRESS(). Only the details differ in how to +// traverse the branchs JPs. + +#define JU_BRANCHB_COMPRESS(cLevel,LeafType,MaxPop1,NewJPType, \ + LeafToLeaf,Alloc,ValueArea, \ + CopyImmed,CopyIndex) \ + { \ + LeafType Pleaf; \ + Pjbb_t PjbbRaw; /* BranchB to compress */ \ + Pjbb_t Pjbb; \ + Word_t subexp; /* current subexpanse number */ \ + BITMAPB_t bitmap; /* portion for this subexpanse */ \ + Pjp_t Pjp2Raw; /* one subexpanses subarray */ \ + Pjp_t Pjp2; \ + \ + if ((PjllnewRaw = Alloc(MaxPop1, Pjpm)) == 0) return(-1); \ + Pjllnew = P_JLL(PjllnewRaw); \ + Pleaf = (LeafType) Pjllnew; \ + JUDYLCODE(Pjv = ValueArea(Pleaf, MaxPop1);) \ + \ + PjbbRaw = (Pjbb_t) (Pjp->jp_Addr); \ + Pjbb = P_JBB(PjbbRaw); \ + \ + for (subexp = 0; subexp < cJU_NUMSUBEXPB; ++subexp) \ + { \ + if ((bitmap = JU_JBB_BITMAP(Pjbb, subexp)) == 0) \ + continue; /* empty subexpanse */ \ + \ + digit = subexp * cJU_BITSPERSUBEXPB; \ + Pjp2Raw = JU_JBB_PJP(Pjbb, subexp); \ + Pjp2 = P_JP(Pjp2Raw); \ + assert(Pjp2 != (Pjp_t) NULL); \ + \ + for (offset = 0; bitmap != 0; bitmap >>= 1, ++digit) \ + { \ + if (! (bitmap & 1)) \ + continue; /* empty sub-subexpanse */ \ + \ + ++offset; /* before any continue */ \ + \ + CopyImmed(cLevel, Pjp2 + offset - 1, CopyIndex); \ + \ + pop1 = LeafToLeaf(Pleaf, JU_PVALUEPASS \ + Pjp2 + offset - 1, \ + JU_DIGITTOSTATE(digit, cLevel), \ + (Pvoid_t) Pjpm); \ + Pleaf = (LeafType) (((Word_t) Pleaf) + ((cLevel) * pop1)); \ + JUDYLCODE(Pjv += pop1;) \ + } \ + j__udyFreeJBBJP(Pjp2Raw, /* pop1 = */ offset, Pjpm); \ + } \ + assert(((((Word_t) Pleaf) - ((Word_t) Pjllnew)) / (cLevel)) == (MaxPop1)); \ + JUDYLCODE(assert((Pjv - ValueArea(Pjllnew, MaxPop1)) == (MaxPop1));) \ + DBGCODE(JudyCheckSorted(Pjllnew, MaxPop1, cLevel);) \ + \ + j__udyFreeJBB(PjbbRaw, Pjpm); \ + \ + Pjp->jp_Type = (NewJPType); \ + Pjp->jp_Addr = (Word_t) PjllnewRaw; \ + goto ContinueDelWalk; /* delete from new leaf */ \ + } + +// Overall common code for initial BranchB deletion handling: +// +// Assert that Index is in the branch, then see if the BranchB should be kept +// or else compressed to a leaf. Variables Index, Pjp, and pop1 are in the +// context. + +#define JU_BRANCHB(cLevel,MaxPop1,LeafType,NewJPType, \ + LeafToLeaf,Alloc,ValueArea,CopyImmed,CopyIndex) \ + \ + assert(! JU_DCDNOTMATCHINDEX(Index, Pjp, cLevel)); \ + assert(ParentLevel > (cLevel)); \ + \ + pop1 = JU_JPBRANCH_POP0(Pjp, cLevel) + 1; \ + JU_BRANCH_KEEP(cLevel, MaxPop1, BranchBKeep); \ + assert(pop1 == (MaxPop1)); \ + \ + JU_BRANCHB_COMPRESS(cLevel, LeafType, MaxPop1, NewJPType, \ + LeafToLeaf, Alloc, ValueArea, CopyImmed, CopyIndex) + + +// END OF MACROS, START OF CASES: +// +// Note: Its no accident that the macro calls for these cases is nearly +// identical to the code for BranchLs. 
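+
+// For illustration only, the inner per-subexpanse walk above follows a common
+// bitmap idiom -- one packed JP per set bit, with the digit recovered from the
+// bit position.  A self-contained sketch of the same idiom, using hypothetical
+// names (not Judy's):
+//
+//	static void walk_bitmap(uint32_t bitmap, unsigned firstdigit)
+//	{
+//	    unsigned digit, offset = 0;
+//	    for (digit = firstdigit; bitmap != 0; bitmap >>= 1, ++digit)
+//	    {
+//	        if (! (bitmap & 1)) continue;	// no JP for this digit.
+//	        visit_jp(offset, digit);	// offset into packed JP array.
+//	        ++offset;
+//	    }
+//	}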
+ + case cJU_JPBRANCH_B2: + + JU_BRANCHB(2, cJU_LEAF2_MAXPOP1, uint16_t *, cJU_JPLEAF2, + j__udyLeaf1ToLeaf2, j__udyAllocJLL2, JL_LEAF2VALUEAREA, + JU_BRANCH_COPY_IMMED_EVEN, ignore); + + case cJU_JPBRANCH_B3: + + JU_BRANCHB(3, cJU_LEAF3_MAXPOP1, uint8_t *, cJU_JPLEAF3, + j__udyLeaf2ToLeaf3, j__udyAllocJLL3, JL_LEAF3VALUEAREA, + JU_BRANCH_COPY_IMMED_ODD, JU_COPY3_LONG_TO_PINDEX); + +#ifdef JU_64BIT + case cJU_JPBRANCH_B4: + + JU_BRANCHB(4, cJU_LEAF4_MAXPOP1, uint32_t *, cJU_JPLEAF4, + j__udyLeaf3ToLeaf4, j__udyAllocJLL4, JL_LEAF4VALUEAREA, + JU_BRANCH_COPY_IMMED_EVEN, ignore); + + case cJU_JPBRANCH_B5: + + JU_BRANCHB(5, cJU_LEAF5_MAXPOP1, uint8_t *, cJU_JPLEAF5, + j__udyLeaf4ToLeaf5, j__udyAllocJLL5, JL_LEAF5VALUEAREA, + JU_BRANCH_COPY_IMMED_ODD, JU_COPY5_LONG_TO_PINDEX); + + case cJU_JPBRANCH_B6: + + JU_BRANCHB(6, cJU_LEAF6_MAXPOP1, uint8_t *, cJU_JPLEAF6, + j__udyLeaf5ToLeaf6, j__udyAllocJLL6, JL_LEAF6VALUEAREA, + JU_BRANCH_COPY_IMMED_ODD, JU_COPY6_LONG_TO_PINDEX); + + case cJU_JPBRANCH_B7: + + JU_BRANCHB(7, cJU_LEAF7_MAXPOP1, uint8_t *, cJU_JPLEAF7, + j__udyLeaf6ToLeaf7, j__udyAllocJLL7, JL_LEAF7VALUEAREA, + JU_BRANCH_COPY_IMMED_ODD, JU_COPY7_LONG_TO_PINDEX); +#endif // JU_64BIT + +// A top-level BranchB is different and cannot use JU_BRANCHB(): Dont try to +// compress to a (LEAFW) leaf yet, but leave this for a later deletion +// (hysteresis > 0); and the next JP type depends on the system word size; so +// dont use JU_BRANCH_KEEP(): + + case cJU_JPBRANCH_B: + { + Pjbb_t Pjbb; // BranchB to modify. + Word_t subexp; // current subexpanse number. + Word_t subexp2; // in second-level loop. + BITMAPB_t bitmap; // portion for this subexpanse. + BITMAPB_t bitmask; // with digits bit set. + Pjp_t Pjp2Raw; // one subexpanses subarray. + Pjp_t Pjp2; + Word_t numJPs; // in one subexpanse. + + level = cJU_ROOTSTATE; + digit = JU_DIGITATSTATE(Index, cJU_ROOTSTATE); + + // fall through: + + +// COMMON CODE FOR KEEPING AND DESCENDING THROUGH A BRANCHB: +// +// Come here with level and digit set. + +BranchBKeep: + Pjbb = P_JBB(Pjp->jp_Addr); + subexp = digit / cJU_BITSPERSUBEXPB; + bitmap = JU_JBB_BITMAP(Pjbb, subexp); + bitmask = JU_BITPOSMASKB(digit); + assert(bitmap & bitmask); // Index valid => digits bit is set. + DBGCODE(parentJPtype = JU_JPTYPE(Pjp);) + +// Compute digits offset into the bitmap, with a fast method if all bits are +// set: + + offset = ((bitmap == (cJU_FULLBITMAPB)) ? + digit % cJU_BITSPERSUBEXPB : + j__udyCountBitsB(bitmap & JU_MASKLOWEREXC(bitmask))); + + Pjp2Raw = JU_JBB_PJP(Pjbb, subexp); + Pjp2 = P_JP(Pjp2Raw); + assert(Pjp2 != (Pjp_t) NULL); // valid subexpanse pointer. 
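+
+// (Worked example of the offset computation above: if the subexpanse bitmap
+// is binary 10110 and digit's bitmask is 10000, the bits strictly below the
+// mask are 00110, so offset = j__udyCountBitsB(00110) = 2 -- the JP is the
+// third entry in the packed subarray.)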
+ +// If not at a (deletable) JPIMMED_*_01, continue the walk (to descend through +// the BranchB): + + if (JU_JPTYPE(Pjp2 + offset) != cJU_JPIMMED_1_01 + level - 2) + { + Pjp = Pjp2 + offset; + break; + } + +// At JPIMMED_*_01: Ensure the index is in the right expanse, then delete the +// Immed from the BranchB: + + assert(JU_JPDCDPOP0(Pjp2 + offset) + == JU_TRIMTODCDSIZE(Index)); + +// If only one index is left in the subexpanse, free the JP array: + + if ((numJPs = j__udyCountBitsB(bitmap)) == 1) + { + j__udyFreeJBBJP(Pjp2Raw, /* pop1 = */ 1, Pjpm); + JU_JBB_PJP(Pjbb, subexp) = (Pjp_t) NULL; + } + +// Shrink JP array in-place: + + else if (JU_BRANCHBJPGROWINPLACE(numJPs - 1)) + { + assert(numJPs > 0); + JU_DELETEINPLACE(Pjp2, numJPs, offset, ignore); + } + +// JP array would end up too large; compress it to a smaller one: + + else + { + Pjp_t PjpnewRaw; + Pjp_t Pjpnew; + + if ((PjpnewRaw = j__udyAllocJBBJP(numJPs - 1, Pjpm)) + == (Pjp_t) NULL) return(-1); + Pjpnew = P_JP(PjpnewRaw); + + JU_DELETECOPY(Pjpnew, Pjp2, numJPs, offset, ignore); + j__udyFreeJBBJP(Pjp2Raw, numJPs, Pjpm); // old. + + JU_JBB_PJP(Pjbb, subexp) = PjpnewRaw; + } + +// Clear digits bit in the bitmap: + + JU_JBB_BITMAP(Pjbb, subexp) ^= bitmask; + +// If the current subexpanse alone is still too large for a BranchL (with +// hysteresis = 1), the delete is all done: + + if (numJPs > cJU_BRANCHLMAXJPS) return(1); + +// Consider shrinking the current BranchB to a BranchL: +// +// Check the numbers of JPs in other subexpanses in the BranchL. Upon reaching +// the critical number of numJPs (which could be right at the start; again, +// with hysteresis = 1), its faster to just watch for any non-empty subexpanse +// than to count bits in each subexpanse. Upon finding too many JPs, give up +// on shrinking the BranchB. + + for (subexp2 = 0; subexp2 < cJU_NUMSUBEXPB; ++subexp2) + { + if (subexp2 == subexp) continue; // skip current subexpanse. + + if ((numJPs == cJU_BRANCHLMAXJPS) ? + JU_JBB_BITMAP(Pjbb, subexp2) : + ((numJPs += j__udyCountBitsB(JU_JBB_BITMAP(Pjbb, subexp2))) + > cJU_BRANCHLMAXJPS)) + { + return(1); // too many JPs, cannot shrink. + } + } + +// Shrink current BranchB to a BranchL: +// +// Note: In this rare case, ignore the return value, do not pass it to the +// caller, because the deletion is already successfully completed and the +// caller(s) must decrement population counts. The only errors expected from +// this call are JU_ERRNO_NOMEM and JU_ERRNO_OVERRUN, neither of which is worth +// forwarding from this point. See also 4.1, 4.8, and 4.15 of this file. + + (void) j__udyBranchBToBranchL(Pjp, Pjpm); + return(1); + + } // case. + + +// **************************************************************************** +// UNCOMPRESSED BRANCH: +// +// MACROS FOR COMMON CODE: +// +// Note the reuse of common macros here, defined earlier: JU_PVALUE*. +// +// Compress a BranchU into a leaf one index size larger: +// +// Allocate a new leaf, walk the JPs in the old BranchU and pack their contents +// into the new leaf (of type NewJPType), free the old BranchU, and finally +// restart the switch to delete Index from the new leaf. Variables Pjp, Pjpm, +// digit, and pop1 are in the context. +// +// Note: Its no accident that the interface to JU_BRANCHU_COMPRESS() is +// nearly identical to JU_BRANCHL_COMPRESS(); just NullJPType is added. The +// details differ in how to traverse the branchs JPs -- +// +// -- and also, what to do upon encountering a cJU_JPIMMED_*_01 JP. 
In
+// BranchLs and BranchBs the JP must be deleted, but in a BranchU it's merely
+// converted to a null JP, and this is done by other switch cases, so the
+// "keep branch" situation is simpler here and JU_BRANCH_KEEP() is not used.
+// Also, there's no code to convert a BranchU to a BranchB since counting the
+// JPs in a BranchU is (at least presently) expensive, and besides, keeping
+// around a BranchU is a form of hysteresis.
+
+#define JU_BRANCHU_COMPRESS(cLevel,LeafType,MaxPop1,NullJPType,NewJPType, \
+			    LeafToLeaf,Alloc,ValueArea,CopyImmed,CopyIndex) \
+	{ \
+	    LeafType  Pleaf; \
+	    Pjbu_t    PjbuRaw = (Pjbu_t) (Pjp->jp_Addr); \
+	    Pjp_t     Pjp2    = JU_JBU_PJP0(Pjp); \
+	    Word_t    ldigit;		/* larger than uint8_t */ \
+ \
+	    if ((PjllnewRaw = Alloc(MaxPop1, Pjpm)) == 0) return(-1); \
+	    Pjllnew = P_JLL(PjllnewRaw); \
+	    Pleaf   = (LeafType) Pjllnew; \
+	    JUDYLCODE(Pjv = ValueArea(Pleaf, MaxPop1);) \
+ \
+	    for (ldigit = 0; ldigit < cJU_BRANCHUNUMJPS; ++ldigit, ++Pjp2) \
+	    { \
+		/* fast-process common types: */ \
+		if (JU_JPTYPE(Pjp2) == (NullJPType)) continue; \
+		CopyImmed(cLevel, Pjp2, CopyIndex); \
+ \
+		pop1 = LeafToLeaf(Pleaf, JU_PVALUEPASS Pjp2, \
+				  JU_DIGITTOSTATE(ldigit, cLevel), \
+				  (Pvoid_t) Pjpm); \
+		Pleaf = (LeafType) (((Word_t) Pleaf) + ((cLevel) * pop1)); \
+		JUDYLCODE(Pjv += pop1;) \
+	    } \
+	    assert(((((Word_t) Pleaf) - ((Word_t) Pjllnew)) / (cLevel)) == (MaxPop1)); \
+	    JUDYLCODE(assert((Pjv - ValueArea(Pjllnew, MaxPop1)) == (MaxPop1));) \
+	    DBGCODE(JudyCheckSorted(Pjllnew, MaxPop1, cLevel);) \
+ \
+	    j__udyFreeJBU(PjbuRaw, Pjpm); \
+ \
+	    Pjp->jp_Type = (NewJPType); \
+	    Pjp->jp_Addr = (Word_t) PjllnewRaw; \
+	    goto ContinueDelWalk;	/* delete from new leaf */ \
+	}
+
+// Overall common code for initial BranchU deletion handling:
+//
+// Assert that Index is in the branch, then see if a BranchU should be kept or
+// else compressed to a leaf.  Variables level, Index, Pjp, and pop1 are in
+// the context.
+//
+// Note: BranchU handling differs from BranchL and BranchB as described above.
+
+#define JU_BRANCHU(cLevel,MaxPop1,LeafType,NullJPType,NewJPType, \
+		   LeafToLeaf,Alloc,ValueArea,CopyImmed,CopyIndex) \
+ \
+	assert(! JU_DCDNOTMATCHINDEX(Index, Pjp, cLevel)); \
+	assert(ParentLevel > (cLevel)); \
+	DBGCODE(parentJPtype = JU_JPTYPE(Pjp);) \
+ \
+	pop1 = JU_JPBRANCH_POP0(Pjp, cLevel) + 1; \
+ \
+	if (pop1 > (MaxPop1))		/* hysteresis = 1 */ \
+	{ \
+	    level = (cLevel); \
+	    Pjp   = P_JP(Pjp->jp_Addr) + JU_DIGITATSTATE(Index, cLevel); \
+	    break;			/* descend to next level */ \
+	} \
+	assert(pop1 == (MaxPop1)); \
+ \
+	JU_BRANCHU_COMPRESS(cLevel, LeafType, MaxPop1, NullJPType, NewJPType, \
+			    LeafToLeaf, Alloc, ValueArea, CopyImmed, CopyIndex)
+
+
+// END OF MACROS, START OF CASES:
+//
+// Note: It's no accident that the macro calls for these cases are nearly
+// identical to the code for BranchLs, with the addition of cJU_JPNULL*
+// parameters only needed for BranchUs.
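+
+// For illustration only: a BranchU is a flat array of cJU_BRANCHUNUMJPS JPs
+// (one per possible digit), indexed directly by one byte of the Index, so
+// descending is plain pointer arithmetic.  A minimal sketch of the digit
+// extraction, with a hypothetical helper name (not Judy's):
+//
+//	static inline unsigned digit_at_level(Word_t index, int level)
+//	{				// level = number of undecoded bytes.
+//	    return (unsigned) ((index >> (8 * (level - 1))) & 0xff);
+//	}
+//	// descend: Pjp = branch_jp_array + digit_at_level(Index, level);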
+ + case cJU_JPBRANCH_U2: + + JU_BRANCHU(2, cJU_LEAF2_MAXPOP1, uint16_t *, + cJU_JPNULL1, cJU_JPLEAF2, + j__udyLeaf1ToLeaf2, j__udyAllocJLL2, JL_LEAF2VALUEAREA, + JU_BRANCH_COPY_IMMED_EVEN, ignore); + + case cJU_JPBRANCH_U3: + + JU_BRANCHU(3, cJU_LEAF3_MAXPOP1, uint8_t *, + cJU_JPNULL2, cJU_JPLEAF3, + j__udyLeaf2ToLeaf3, j__udyAllocJLL3, JL_LEAF3VALUEAREA, + JU_BRANCH_COPY_IMMED_ODD, JU_COPY3_LONG_TO_PINDEX); + +#ifdef JU_64BIT + case cJU_JPBRANCH_U4: + + JU_BRANCHU(4, cJU_LEAF4_MAXPOP1, uint32_t *, + cJU_JPNULL3, cJU_JPLEAF4, + j__udyLeaf3ToLeaf4, j__udyAllocJLL4, JL_LEAF4VALUEAREA, + JU_BRANCH_COPY_IMMED_EVEN, ignore); + + case cJU_JPBRANCH_U5: + + JU_BRANCHU(5, cJU_LEAF5_MAXPOP1, uint8_t *, + cJU_JPNULL4, cJU_JPLEAF5, + j__udyLeaf4ToLeaf5, j__udyAllocJLL5, JL_LEAF5VALUEAREA, + JU_BRANCH_COPY_IMMED_ODD, JU_COPY5_LONG_TO_PINDEX); + + case cJU_JPBRANCH_U6: + + JU_BRANCHU(6, cJU_LEAF6_MAXPOP1, uint8_t *, + cJU_JPNULL5, cJU_JPLEAF6, + j__udyLeaf5ToLeaf6, j__udyAllocJLL6, JL_LEAF6VALUEAREA, + JU_BRANCH_COPY_IMMED_ODD, JU_COPY6_LONG_TO_PINDEX); + + case cJU_JPBRANCH_U7: + + JU_BRANCHU(7, cJU_LEAF7_MAXPOP1, uint8_t *, + cJU_JPNULL6, cJU_JPLEAF7, + j__udyLeaf6ToLeaf7, j__udyAllocJLL7, JL_LEAF7VALUEAREA, + JU_BRANCH_COPY_IMMED_ODD, JU_COPY7_LONG_TO_PINDEX); +#endif // JU_64BIT + +// A top-level BranchU is different and cannot use JU_BRANCHU(): Dont try to +// compress to a (LEAFW) leaf yet, but leave this for a later deletion +// (hysteresis > 0); just descend through the BranchU: + + case cJU_JPBRANCH_U: + + DBGCODE(parentJPtype = JU_JPTYPE(Pjp);) + + level = cJU_ROOTSTATE; + Pjp = P_JP(Pjp->jp_Addr) + JU_DIGITATSTATE(Index, cJU_ROOTSTATE); + break; + + +// **************************************************************************** +// LINEAR LEAF: +// +// State transitions while deleting an Index, the inverse of the similar table +// that appears in JudyIns.c: +// +// Note: In JudyIns.c this table is not needed and does not appear until the +// Immed handling code; because once a Leaf is reached upon growing the tree, +// the situation remains simpler, but for deleting indexes, the complexity +// arises when leaves must compress to Immeds. +// +// Note: There are other transitions possible too, not shown here, such as to +// a leaf one level higher. +// +// (Yes, this is very terse... Study it and it will make sense.) +// (Note, parts of this diagram are repeated below for quick reference.) +// +// reformat JP here for Judy1 only, from word-1 to word-2 +// | +// JUDY1 && JU_64BIT JUDY1 || JU_64BIT | +// V +// (*) Leaf1 [[ => 1_15..08 ] => 1_07 => ... => 1_04 ] => 1_03 => 1_02 => 1_01 +// Leaf2 [[ => 2_07..04 ] => 2_03 => 2_02 ] => 2_01 +// Leaf3 [[ => 3_05..03 ] => 3_02 ] => 3_01 +// JU_64BIT only: +// Leaf4 [[ => 4_03..02 ]] => 4_01 +// Leaf5 [[ => 5_03..02 ]] => 5_01 +// Leaf6 [[ => 6_02 ]] => 6_01 +// Leaf7 [[ => 7_02 ]] => 7_01 +// +// (*) For Judy1 & 64-bit, go directly from a LeafB1 to cJU_JPIMMED_1_15; skip +// Leaf1, as described in Judy1.h regarding cJ1_JPLEAF1. +// +// MACROS FOR COMMON CODE: +// +// (De)compress a LeafX into a LeafY one index size (cIS) larger (X+1 = Y): +// +// This is only possible when the current leaf is under a narrow pointer +// ((ParentLevel - 1) > cIS) and its population fits in a higher-level leaf. +// Variables ParentLevel, pop1, PjllnewRaw, Pjllnew, Pjpm, and Index are in the +// context. +// +// Note: Doing an "uplevel" doesnt occur until the old leaf can be compressed +// up one level BEFORE deleting an index; that is, hysteresis = 1. 
+// +// Note: LeafType, MaxPop1, NewJPType, and Alloc refer to the up-level leaf, +// not the current leaf. +// +// Note: 010327: Fixed bug where the jp_DcdPopO next-uplevel digit (byte) +// above the current Pop0 value was not being cleared. When upleveling, one +// digit in jp_DcdPopO "moves" from being part of the Dcd subfield to the Pop0 +// subfield, but since a leaf maxpop1 is known to be <= 1 byte in size, the new +// Pop0 byte should always be zero. This is easy to overlook because +// JU_JPLEAF_POP0() "knows" to only use the LSB of Pop0 (for efficiency) and +// ignore the other bytes... Until someone uses cJU_POP0MASK() instead of +// JU_JPLEAF_POP0(), such as in JudyInsertBranch.c. +// +// TBD: Should JudyInsertBranch.c use JU_JPLEAF_POP0() rather than +// cJU_POP0MASK(), for efficiency? Does it know for sure its a narrow pointer +// under the leaf? Not necessarily. + +#define JU_LEAF_UPLEVEL(cIS,LeafType,MaxPop1,NewJPType,LeafToLeaf, \ + Alloc,ValueArea) \ + \ + assert(((ParentLevel - 1) == (cIS)) || (pop1 >= (MaxPop1))); \ + \ + if (((ParentLevel - 1) > (cIS)) /* under narrow pointer */ \ + && (pop1 == (MaxPop1))) /* hysteresis = 1 */ \ + { \ + Word_t D_cdP0; \ + if ((PjllnewRaw = Alloc(MaxPop1, Pjpm)) == 0) return(-1); \ + Pjllnew = P_JLL(PjllnewRaw); \ + JUDYLCODE(Pjv = ValueArea((LeafType) Pjllnew, MaxPop1);) \ + \ + (void) LeafToLeaf((LeafType) Pjllnew, JU_PVALUEPASS Pjp, \ + Index & cJU_DCDMASK(cIS), /* TBD, Doug says */ \ + (Pvoid_t) Pjpm); \ + DBGCODE(JudyCheckSorted(Pjllnew, MaxPop1, cIS + 1);) \ + \ + D_cdP0 = (~cJU_MASKATSTATE((cIS) + 1)) & JU_JPDCDPOP0(Pjp); \ + JU_JPSETADT(Pjp, (Word_t)PjllnewRaw, D_cdP0, NewJPType); \ + goto ContinueDelWalk; /* delete from new leaf */ \ + } + + +// For Leaf3, only support JU_LEAF_UPLEVEL on a 64-bit system, and for Leaf7, +// there is no JU_LEAF_UPLEVEL: +// +// Note: Theres no way here to go from Leaf3 [Leaf7] to LEAFW on a 32-bit +// [64-bit] system. Thats handled in the main code, because its different in +// that a JPM is involved. + +#ifndef JU_64BIT // 32-bit. +#define JU_LEAF_UPLEVEL64(cIS,LeafType,MaxPop1,NewJPType,LeafToLeaf, \ + Alloc,ValueArea) // null. +#else +#define JU_LEAF_UPLEVEL64(cIS,LeafType,MaxPop1,NewJPType,LeafToLeaf, \ + Alloc,ValueArea) \ + JU_LEAF_UPLEVEL (cIS,LeafType,MaxPop1,NewJPType,LeafToLeaf, \ + Alloc,ValueArea) +#define JU_LEAF_UPLEVEL_NONE(cIS,LeafType,MaxPop1,NewJPType,LeafToLeaf, \ + Alloc,ValueArea) // null. +#endif + +// Compress a Leaf* with pop1 = 2, or a JPIMMED_*_02, into a JPIMMED_*_01: +// +// Copy whichever Index is NOT being deleted (and assert that the other one is +// found; Index must be valid). This requires special handling of the Index +// bytes (and value area). Variables Pjp, Index, offset, and Pleaf are in the +// context, offset is modified to the undeleted Index, and Pjp is modified +// including jp_Addr. + + +#define JU_TOIMMED_01_EVEN(cIS,ignore1,ignore2) \ +{ \ + Word_t D_cdP0; \ + Word_t A_ddr = 0; \ + uint8_t T_ype = JU_JPTYPE(Pjp); \ + offset = (Pleaf[0] == JU_LEASTBYTES(Index, cIS)); /* undeleted Ind */ \ + assert(Pleaf[offset ? 
0 : 1] == JU_LEASTBYTES(Index, cIS)); \ + D_cdP0 = (Index & cJU_DCDMASK(cIS)) | Pleaf[offset]; \ +JUDYLCODE(A_ddr = Pjv[offset];) \ + JU_JPSETADT(Pjp, A_ddr, D_cdP0, T_ype); \ +} + +#define JU_TOIMMED_01_ODD(cIS,SearchLeaf,CopyPIndex) \ + { \ + Word_t D_cdP0; \ + Word_t A_ddr = 0; \ + uint8_t T_ype = JU_JPTYPE(Pjp); \ + \ + offset = SearchLeaf(Pleaf, 2, Index); \ + assert(offset >= 0); /* Index must be valid */ \ + CopyPIndex(D_cdP0, & (Pleaf[offset ? 0 : cIS])); \ + D_cdP0 |= Index & cJU_DCDMASK(cIS); \ + JUDYLCODE(A_ddr = Pjv[offset ? 0 : 1];) \ + JU_JPSETADT(Pjp, A_ddr, D_cdP0, T_ype); \ + } + + +// Compress a Leaf* into a JPIMMED_*_0[2+]: +// +// This occurs as soon as its possible, with hysteresis = 0. Variables pop1, +// Pleaf, offset, and Pjpm are in the context. +// +// TBD: Explain why hysteresis = 0 here, rather than > 0. Probably because +// the insert code assumes if the population is small enough, an Immed is used, +// not a leaf. +// +// The differences between Judy1 and JudyL with respect to value area handling +// are just too large for completely common code between them... Oh well, some +// big ifdefs follow. + +#ifdef JUDY1 + +#define JU_LEAF_TOIMMED(cIS,LeafType,MaxPop1,BaseJPType,ignore1,\ + ignore2,ignore3,ignore4, \ + DeleteCopy,FreeLeaf) \ + \ + assert(pop1 > (MaxPop1)); \ + \ + if ((pop1 - 1) == (MaxPop1)) /* hysteresis = 0 */ \ + { \ + Pjll_t PjllRaw = (Pjll_t) (Pjp->jp_Addr); \ + DeleteCopy((LeafType) (Pjp->jp_1Index), Pleaf, pop1, offset, cIS); \ + DBGCODE(JudyCheckSorted((Pjll_t) (Pjp->jp_1Index), pop1-1, cIS);) \ + Pjp->jp_Type = (BaseJPType) - 1 + (MaxPop1) - 1; \ + FreeLeaf(PjllRaw, pop1, Pjpm); \ + return(1); \ + } + +#else // JUDYL + +// Pjv is also in the context. + +#define JU_LEAF_TOIMMED(cIS,LeafType,MaxPop1,BaseJPType,ignore1,\ + ignore2,ignore3,ignore4, \ + DeleteCopy,FreeLeaf) \ + \ + assert(pop1 > (MaxPop1)); \ + \ + if ((pop1 - 1) == (MaxPop1)) /* hysteresis = 0 */ \ + { \ + Pjll_t PjllRaw = (Pjll_t) (Pjp->jp_Addr); \ + Pjv_t PjvnewRaw; \ + Pjv_t Pjvnew; \ + \ + if ((PjvnewRaw = j__udyLAllocJV(pop1 - 1, Pjpm)) \ + == (Pjv_t) NULL) return(-1); \ + JUDYLCODE(Pjvnew = P_JV(PjvnewRaw);) \ + \ + DeleteCopy((LeafType) (Pjp->jp_LIndex), Pleaf, pop1, offset, cIS); \ + JU_DELETECOPY(Pjvnew, Pjv, pop1, offset, cIS); \ + DBGCODE(JudyCheckSorted((Pjll_t) (Pjp->jp_LIndex), pop1-1, cIS);) \ + FreeLeaf(PjllRaw, pop1, Pjpm); \ + Pjp->jp_Addr = (Word_t) PjvnewRaw; \ + Pjp->jp_Type = (BaseJPType) - 2 + (MaxPop1); \ + return(1); \ + } + +// A complicating factor for JudyL & 32-bit is that Leaf2..3, and for JudyL & +// 64-bit Leaf 4..7, go directly to an Immed*_01, where the value is stored in +// jp_Addr and not in a separate LeafV. For efficiency, use the following +// macro in cases where it can apply; it is rigged to do the right thing. +// Unfortunately, this requires the calling code to "know" the transition table +// and call the right macro. +// +// This variant compresses a Leaf* with pop1 = 2 into a JPIMMED_*_01: + +#define JU_LEAF_TOIMMED_01(cIS,LeafType,MaxPop1,ignore,Immed01JPType, \ + ToImmed,SearchLeaf,CopyPIndex, \ + DeleteCopy,FreeLeaf) \ + \ + assert(pop1 > (MaxPop1)); \ + \ + if ((pop1 - 1) == (MaxPop1)) /* hysteresis = 0 */ \ + { \ + Pjll_t PjllRaw = (Pjll_t) (Pjp->jp_Addr); \ + ToImmed(cIS, SearchLeaf, CopyPIndex); \ + FreeLeaf(PjllRaw, pop1, Pjpm); \ + Pjp->jp_Type = (Immed01JPType); \ + return(1); \ + } +#endif // JUDYL + +// See comments above about these: +// +// Note: Here "23" means index size 2 or 3, and "47" means 4..7. 
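+
+// (Worked example of the jp_Type arithmetic in JU_LEAF_TOIMMED() above: with
+// BaseJPType = cJU_JPIMMED_1_02 and a new population of MaxPop1 indexes, the
+// assignment yields cJU_JPIMMED_1_02 + (MaxPop1 - 2), that is, the
+// cJU_JPIMMED_1_<MaxPop1> type, since the *_02.._NN Immed types are
+// consecutive.)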
+ +#if (defined(JUDY1) || defined(JU_64BIT)) +#define JU_LEAF_TOIMMED_23(cIS,LeafType,MaxPop1,BaseJPType,Immed01JPType, \ + ToImmed,SearchLeaf,CopyPIndex, \ + DeleteCopy,FreeLeaf) \ + JU_LEAF_TOIMMED( cIS,LeafType,MaxPop1,BaseJPType,ignore1, \ + ignore2,ignore3,ignore4, \ + DeleteCopy,FreeLeaf) +#else // JUDYL && 32-bit +#define JU_LEAF_TOIMMED_23(cIS,LeafType,MaxPop1,BaseJPType,Immed01JPType, \ + ToImmed,SearchLeaf,CopyPIndex, \ + DeleteCopy,FreeLeaf) \ + JU_LEAF_TOIMMED_01(cIS,LeafType,MaxPop1,ignore,Immed01JPType, \ + ToImmed,SearchLeaf,CopyPIndex, \ + DeleteCopy,FreeLeaf) +#endif + +#ifdef JU_64BIT +#ifdef JUDY1 +#define JU_LEAF_TOIMMED_47(cIS,LeafType,MaxPop1,BaseJPType,Immed01JPType, \ + ToImmed,SearchLeaf,CopyPIndex, \ + DeleteCopy,FreeLeaf) \ + JU_LEAF_TOIMMED( cIS,LeafType,MaxPop1,BaseJPType,ignore1, \ + ignore2,ignore3,ignore4, \ + DeleteCopy,FreeLeaf) +#else // JUDYL && 64-bit +#define JU_LEAF_TOIMMED_47(cIS,LeafType,MaxPop1,BaseJPType,Immed01JPType, \ + ToImmed,SearchLeaf,CopyPIndex, \ + DeleteCopy,FreeLeaf) \ + JU_LEAF_TOIMMED_01(cIS,LeafType,MaxPop1,ignore,Immed01JPType, \ + ToImmed,SearchLeaf,CopyPIndex, \ + DeleteCopy,FreeLeaf) +#endif // JUDYL +#endif // JU_64BIT + +// Compress a Leaf* in place: +// +// Here hysteresis = 0 (no memory is wasted). Variables pop1, Pleaf, and +// offset, and for JudyL, Pjv, are in the context. + +#ifdef JUDY1 +#define JU_LEAF_INPLACE(cIS,GrowInPlace,DeleteInPlace) \ + if (GrowInPlace(pop1 - 1)) /* hysteresis = 0 */ \ + { \ + DeleteInPlace(Pleaf, pop1, offset, cIS); \ + DBGCODE(JudyCheckSorted(Pleaf, pop1 - 1, cIS);) \ + return(1); \ + } +#else +#define JU_LEAF_INPLACE(cIS,GrowInPlace,DeleteInPlace) \ + if (GrowInPlace(pop1 - 1)) /* hysteresis = 0 */ \ + { \ + DeleteInPlace(Pleaf, pop1, offset, cIS); \ +/**/ JU_DELETEINPLACE(Pjv, pop1, offset, ignore); \ + DBGCODE(JudyCheckSorted(Pleaf, pop1 - 1, cIS);) \ + return(1); \ + } +#endif + +// Compress a Leaf* into a smaller memory object of the same JP type: +// +// Variables PjllnewRaw, Pjllnew, Pleafpop1, Pjpm, PleafRaw, Pleaf, and offset +// are in the context. + +#ifdef JUDY1 + +#define JU_LEAF_SHRINK(cIS,LeafType,DeleteCopy,Alloc,FreeLeaf,ValueArea) \ + if ((PjllnewRaw = Alloc(pop1 - 1, Pjpm)) == 0) return(-1); \ + Pjllnew = P_JLL(PjllnewRaw); \ + DeleteCopy((LeafType) Pjllnew, Pleaf, pop1, offset, cIS); \ + DBGCODE(JudyCheckSorted(Pjllnew, pop1 - 1, cIS);) \ + FreeLeaf(PleafRaw, pop1, Pjpm); \ + Pjp->jp_Addr = (Word_t) PjllnewRaw; \ + return(1) + +#else // JUDYL + +#define JU_LEAF_SHRINK(cIS,LeafType,DeleteCopy,Alloc,FreeLeaf,ValueArea) \ + { \ +/**/ Pjv_t Pjvnew; \ + \ + if ((PjllnewRaw = Alloc(pop1 - 1, Pjpm)) == 0) return(-1); \ + Pjllnew = P_JLL(PjllnewRaw); \ +/**/ Pjvnew = ValueArea(Pjllnew, pop1 - 1); \ + DeleteCopy((LeafType) Pjllnew, Pleaf, pop1, offset, cIS); \ +/**/ JU_DELETECOPY(Pjvnew, Pjv, pop1, offset, cIS); \ + DBGCODE(JudyCheckSorted(Pjllnew, pop1 - 1, cIS);) \ + FreeLeaf(PleafRaw, pop1, Pjpm); \ + Pjp->jp_Addr = (Word_t) PjllnewRaw; \ + return(1); \ + } +#endif // JUDYL + +// Overall common code for Leaf* deletion handling: +// +// See if the leaf can be: +// - (de)compressed to one a level higher (JU_LEAF_UPLEVEL()), or if not, +// - compressed to an Immediate JP (JU_LEAF_TOIMMED()), or if not, +// - shrunk in place (JU_LEAF_INPLACE()), or if none of those, then +// - shrink the leaf to a smaller chunk of memory (JU_LEAF_SHRINK()). +// +// Variables Pjp, pop1, Index, and offset are in the context. +// The *Up parameters refer to a leaf one level up, if there is any. 
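+
+// For illustration only, the decision cascade implemented by JU_LEAF() below,
+// in rough pseudo-C (hypothetical helper names, not Judy's):
+//
+//	/* 1 */ if (under_narrow_pointer && pop1 == uplevel_max)
+//	            { uplevel_leaf(); goto retry_delete; }	// hysteresis = 1.
+//	/* 2 */ offset = search_leaf(Pleaf, pop1, Index);
+//	/* 3 */ if (pop1 - 1 == immed_max)   return compress_to_immed();
+//	/* 4 */ if (fits_in_place(pop1 - 1)) return delete_in_place();
+//	/* 5 */ return copy_to_smaller_leaf();			// hysteresis = 0.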
+ +#define JU_LEAF(cIS, \ + UpLevel, \ + LeafTypeUp,MaxPop1Up,LeafJPTypeUp,LeafToLeaf, \ + AllocUp,ValueAreaUp, \ + LeafToImmed,ToImmed,CopyPIndex, \ + LeafType,ImmedMaxPop1,ImmedBaseJPType,Immed01JPType, \ + SearchLeaf,GrowInPlace,DeleteInPlace,DeleteCopy, \ + Alloc,FreeLeaf,ValueArea) \ + { \ + Pjll_t PleafRaw; \ + LeafType Pleaf; \ + \ + assert(! JU_DCDNOTMATCHINDEX(Index, Pjp, cIS)); \ + assert(ParentLevel > (cIS)); \ + \ + PleafRaw = (Pjll_t) (Pjp->jp_Addr); \ + Pleaf = (LeafType) P_JLL(PleafRaw); \ + pop1 = JU_JPLEAF_POP0(Pjp) + 1; \ + \ + UpLevel(cIS, LeafTypeUp, MaxPop1Up, LeafJPTypeUp, \ + LeafToLeaf, AllocUp, ValueAreaUp); \ + \ + offset = SearchLeaf(Pleaf, pop1, Index); \ + assert(offset >= 0); /* Index must be valid */ \ + JUDYLCODE(Pjv = ValueArea(Pleaf, pop1);) \ + \ + LeafToImmed(cIS, LeafType, ImmedMaxPop1, \ + ImmedBaseJPType, Immed01JPType, \ + ToImmed, SearchLeaf, CopyPIndex, \ + DeleteCopy, FreeLeaf); \ + \ + JU_LEAF_INPLACE(cIS, GrowInPlace, DeleteInPlace); \ + \ + JU_LEAF_SHRINK(cIS, LeafType, DeleteCopy, Alloc, FreeLeaf, \ + ValueArea); \ + } + +// END OF MACROS, START OF CASES: +// +// (*) Leaf1 [[ => 1_15..08 ] => 1_07 => ... => 1_04 ] => 1_03 => 1_02 => 1_01 + +#if (defined(JUDYL) || (! defined(JU_64BIT))) + case cJU_JPLEAF1: + + JU_LEAF(1, + JU_LEAF_UPLEVEL, uint16_t *, cJU_LEAF2_MAXPOP1, cJU_JPLEAF2, + j__udyLeaf1ToLeaf2, j__udyAllocJLL2, JL_LEAF2VALUEAREA, + JU_LEAF_TOIMMED, ignore, ignore, + uint8_t *, cJU_IMMED1_MAXPOP1, + cJU_JPIMMED_1_02, cJU_JPIMMED_1_01, j__udySearchLeaf1, + JU_LEAF1GROWINPLACE, JU_DELETEINPLACE, JU_DELETECOPY, + j__udyAllocJLL1, j__udyFreeJLL1, JL_LEAF1VALUEAREA); +#endif + +// A complicating factor is that for JudyL & 32-bit, a Leaf2 must go directly +// to an Immed 2_01 and a Leaf3 must go directly to an Immed 3_01: +// +// Leaf2 [[ => 2_07..04 ] => 2_03 => 2_02 ] => 2_01 +// Leaf3 [[ => 3_05..03 ] => 3_02 ] => 3_01 +// +// Hence use JU_LEAF_TOIMMED_23 instead of JU_LEAF_TOIMMED in the cases below, +// and also the parameters ToImmed and, for odd index sizes, CopyPIndex, are +// required. + + case cJU_JPLEAF2: + + JU_LEAF(2, + JU_LEAF_UPLEVEL, uint8_t *, cJU_LEAF3_MAXPOP1, cJU_JPLEAF3, + j__udyLeaf2ToLeaf3, j__udyAllocJLL3, JL_LEAF3VALUEAREA, + JU_LEAF_TOIMMED_23, JU_TOIMMED_01_EVEN, ignore, + uint16_t *, cJU_IMMED2_MAXPOP1, + cJU_JPIMMED_2_02, cJU_JPIMMED_2_01, j__udySearchLeaf2, + JU_LEAF2GROWINPLACE, JU_DELETEINPLACE, JU_DELETECOPY, + j__udyAllocJLL2, j__udyFreeJLL2, JL_LEAF2VALUEAREA); + +// On 32-bit there is no transition to "uplevel" for a Leaf3, so use +// JU_LEAF_UPLEVEL64 instead of JU_LEAF_UPLEVEL: + + case cJU_JPLEAF3: + + JU_LEAF(3, + JU_LEAF_UPLEVEL64, uint32_t *, cJU_LEAF4_MAXPOP1, + cJU_JPLEAF4, + j__udyLeaf3ToLeaf4, j__udyAllocJLL4, JL_LEAF4VALUEAREA, + JU_LEAF_TOIMMED_23, + JU_TOIMMED_01_ODD, JU_COPY3_PINDEX_TO_LONG, + uint8_t *, cJU_IMMED3_MAXPOP1, + cJU_JPIMMED_3_02, cJU_JPIMMED_3_01, j__udySearchLeaf3, + JU_LEAF3GROWINPLACE, JU_DELETEINPLACE_ODD, + JU_DELETECOPY_ODD, + j__udyAllocJLL3, j__udyFreeJLL3, JL_LEAF3VALUEAREA); + +#ifdef JU_64BIT + +// A complicating factor is that for JudyL & 64-bit, a Leaf[4-7] must go +// directly to an Immed [4-7]_01: +// +// Leaf4 [[ => 4_03..02 ]] => 4_01 +// Leaf5 [[ => 5_03..02 ]] => 5_01 +// Leaf6 [[ => 6_02 ]] => 6_01 +// Leaf7 [[ => 7_02 ]] => 7_01 +// +// Hence use JU_LEAF_TOIMMED_47 instead of JU_LEAF_TOIMMED in the cases below. 
+ + case cJU_JPLEAF4: + + JU_LEAF(4, + JU_LEAF_UPLEVEL, uint8_t *, cJU_LEAF5_MAXPOP1, cJU_JPLEAF5, + j__udyLeaf4ToLeaf5, j__udyAllocJLL5, JL_LEAF5VALUEAREA, + JU_LEAF_TOIMMED_47, JU_TOIMMED_01_EVEN, ignore, + uint32_t *, cJU_IMMED4_MAXPOP1, + cJ1_JPIMMED_4_02, cJU_JPIMMED_4_01, j__udySearchLeaf4, + JU_LEAF4GROWINPLACE, JU_DELETEINPLACE, JU_DELETECOPY, + j__udyAllocJLL4, j__udyFreeJLL4, JL_LEAF4VALUEAREA); + + case cJU_JPLEAF5: + + JU_LEAF(5, + JU_LEAF_UPLEVEL, uint8_t *, cJU_LEAF6_MAXPOP1, cJU_JPLEAF6, + j__udyLeaf5ToLeaf6, j__udyAllocJLL6, JL_LEAF6VALUEAREA, + JU_LEAF_TOIMMED_47, + JU_TOIMMED_01_ODD, JU_COPY5_PINDEX_TO_LONG, + uint8_t *, cJU_IMMED5_MAXPOP1, + cJ1_JPIMMED_5_02, cJU_JPIMMED_5_01, j__udySearchLeaf5, + JU_LEAF5GROWINPLACE, JU_DELETEINPLACE_ODD, + JU_DELETECOPY_ODD, + j__udyAllocJLL5, j__udyFreeJLL5, JL_LEAF5VALUEAREA); + + case cJU_JPLEAF6: + + JU_LEAF(6, + JU_LEAF_UPLEVEL, uint8_t *, cJU_LEAF7_MAXPOP1, cJU_JPLEAF7, + j__udyLeaf6ToLeaf7, j__udyAllocJLL7, JL_LEAF7VALUEAREA, + JU_LEAF_TOIMMED_47, + JU_TOIMMED_01_ODD, JU_COPY6_PINDEX_TO_LONG, + uint8_t *, cJU_IMMED6_MAXPOP1, + cJ1_JPIMMED_6_02, cJU_JPIMMED_6_01, j__udySearchLeaf6, + JU_LEAF6GROWINPLACE, JU_DELETEINPLACE_ODD, + JU_DELETECOPY_ODD, + j__udyAllocJLL6, j__udyFreeJLL6, JL_LEAF6VALUEAREA); + +// There is no transition to "uplevel" for a Leaf7, so use JU_LEAF_UPLEVEL_NONE +// instead of JU_LEAF_UPLEVEL, and ignore all of the parameters to that macro: + + case cJU_JPLEAF7: + + JU_LEAF(7, + JU_LEAF_UPLEVEL_NONE, ignore1, ignore2, ignore3, ignore4, + ignore5, ignore6, + JU_LEAF_TOIMMED_47, + JU_TOIMMED_01_ODD, JU_COPY7_PINDEX_TO_LONG, + uint8_t *, cJU_IMMED7_MAXPOP1, + cJ1_JPIMMED_7_02, cJU_JPIMMED_7_01, j__udySearchLeaf7, + JU_LEAF7GROWINPLACE, JU_DELETEINPLACE_ODD, + JU_DELETECOPY_ODD, + j__udyAllocJLL7, j__udyFreeJLL7, JL_LEAF7VALUEAREA); +#endif // JU_64BIT + + +// **************************************************************************** +// BITMAP LEAF: + + case cJU_JPLEAF_B1: + { +#ifdef JUDYL + Pjv_t PjvnewRaw; // new value area. + Pjv_t Pjvnew; + Word_t subexp; // 1 of 8 subexpanses in bitmap. + Pjlb_t Pjlb; // pointer to bitmap part of the leaf. + BITMAPL_t bitmap; // for one subexpanse. + BITMAPL_t bitmask; // bit set for Indexs digit. +#endif + assert(! JU_DCDNOTMATCHINDEX(Index, Pjp, 1)); + assert(ParentLevel > 1); + // valid Index: + assert(JU_BITMAPTESTL(P_JLB(Pjp->jp_Addr), Index)); + + pop1 = JU_JPLEAF_POP0(Pjp) + 1; + +// Like a Leaf1, see if its under a narrow pointer and can become a Leaf2 +// (hysteresis = 1): + + JU_LEAF_UPLEVEL(1, uint16_t *, cJU_LEAF2_MAXPOP1, cJU_JPLEAF2, + j__udyLeaf1ToLeaf2, j__udyAllocJLL2, + JL_LEAF2VALUEAREA); + +#if (defined(JUDY1) && defined(JU_64BIT)) + +// Handle the unusual special case, on Judy1 64-bit only, where a LeafB1 goes +// directly to a JPIMMED_1_15; as described in comments in Judy1.h and +// JudyIns.c. Copy 1-byte indexes from old LeafB1 to the Immed: + + if ((pop1 - 1) == cJU_IMMED1_MAXPOP1) // hysteresis = 0. + { + Pjlb_t PjlbRaw; // bitmap in old leaf. + Pjlb_t Pjlb; + uint8_t * Pleafnew; // JPIMMED as a pointer. + Word_t ldigit; // larger than uint8_t. + + PjlbRaw = (Pjlb_t) (Pjp->jp_Addr); + Pjlb = P_JLB(PjlbRaw); + Pleafnew = Pjp->jp_1Index; + + JU_BITMAPCLEARL(Pjlb, Index); // unset Indexs bit. 
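+
+// Note, for illustration only (not Judy's code): the scan below could run
+// word-at-a-time, clearing the lowest set bit each step, e.g. with a GCC
+// builtin:
+//
+//	for (subexp = 0; subexp < cJU_NUMSUBEXPL; ++subexp)
+//	    for (BITMAPL_t w = JU_JLB_BITMAP(Pjlb, subexp); w != 0; w &= w - 1)
+//	        *Pleafnew++ = (subexp * cJU_BITSPERSUBEXPL) + __builtin_ctzl(w);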
+ +// TBD: This is very slow, there must be a better way: + + for (ldigit = 0; ldigit < cJU_BRANCHUNUMJPS; ++ldigit) + { + if (JU_BITMAPTESTL(Pjlb, ldigit)) + { + *Pleafnew++ = ldigit; + assert(Pleafnew - (Pjp->jp_1Index) + <= cJU_IMMED1_MAXPOP1); + } + } + + DBGCODE(JudyCheckSorted((Pjll_t) (Pjp->jp_1Index), + cJU_IMMED1_MAXPOP1, 1);) + j__udyFreeJLB1(PjlbRaw, Pjpm); + + Pjp->jp_Type = cJ1_JPIMMED_1_15; + return(1); + } + +#else // (JUDYL || (! JU_64BIT)) + +// Compress LeafB1 to a Leaf1: +// +// Note: 4.37 of this file contained alternate code for Judy1 only that simply +// cleared the bit and allowed the LeafB1 to go below cJU_LEAF1_MAXPOP1. This +// was the ONLY case where a malloc failure was not fatal; however, it violated +// the critical assumption that the tree is always kept in least-compressed +// form. + + if (pop1 == cJU_LEAF1_MAXPOP1) // hysteresis = 1. + { + if (j__udyLeafB1ToLeaf1(Pjp, Pjpm) == -1) return(-1); + goto ContinueDelWalk; // delete Index in new Leaf1. + } +#endif // (JUDYL || (! JU_64BIT)) + +#ifdef JUDY1 + // unset Indexs bit: + + JU_BITMAPCLEARL(P_JLB(Pjp->jp_Addr), Index); +#else // JUDYL + +// This is very different from Judy1 because of the need to manage the value +// area: +// +// Get last byte to decode from Index, and pointer to bitmap leaf: + + digit = JU_DIGITATSTATE(Index, 1); + Pjlb = P_JLB(Pjp->jp_Addr); + +// Prepare additional values: + + subexp = digit / cJU_BITSPERSUBEXPL; // which subexpanse. + bitmap = JU_JLB_BITMAP(Pjlb, subexp); // subexps 32-bit map. + PjvRaw = JL_JLB_PVALUE(Pjlb, subexp); // corresponding values. + Pjv = P_JV(PjvRaw); + bitmask = JU_BITPOSMASKL(digit); // mask for Index. + + assert(bitmap & bitmask); // Index must be valid. + + if (bitmap == cJU_FULLBITMAPL) // full bitmap, take shortcut: + { + pop1 = cJU_BITSPERSUBEXPL; + offset = digit % cJU_BITSPERSUBEXPL; + } + else // compute subexpanse pop1 and value area offset: + { + pop1 = j__udyCountBitsL(bitmap); + offset = j__udyCountBitsL(bitmap & (bitmask - 1)); + } + +// Handle solitary Index remaining in subexpanse: + + if (pop1 == 1) + { + j__udyLFreeJV(PjvRaw, 1, Pjpm); + + JL_JLB_PVALUE(Pjlb, subexp) = (Pjv_t) NULL; + JU_JLB_BITMAP(Pjlb, subexp) = 0; + + return(1); + } + +// Shrink value area in place or move to a smaller value area: + + if (JL_LEAFVGROWINPLACE(pop1 - 1)) // hysteresis = 0. + { + JU_DELETEINPLACE(Pjv, pop1, offset, ignore); + } + else + { + if ((PjvnewRaw = j__udyLAllocJV(pop1 - 1, Pjpm)) + == (Pjv_t) NULL) return(-1); + Pjvnew = P_JV(PjvnewRaw); + + JU_DELETECOPY(Pjvnew, Pjv, pop1, offset, ignore); + j__udyLFreeJV(PjvRaw, pop1, Pjpm); + JL_JLB_PVALUE(Pjlb, subexp) = (Pjv_t) PjvnewRaw; + } + + JU_JLB_BITMAP(Pjlb, subexp) ^= bitmask; // clear Indexs bit. + +#endif // JUDYL + + return(1); + + } // case. + + +#ifdef JUDY1 + +// **************************************************************************** +// FULL POPULATION LEAF: +// +// Convert to a LeafB1 and delete the index. Hysteresis = 0; none is possible. +// +// Note: Earlier the second assertion below said, "== 2", but in fact the +// parent could be at a higher level if a fullpop is under a narrow pointer. + + case cJ1_JPFULLPOPU1: + { + Pjlb_t PjlbRaw; + Pjlb_t Pjlb; + Word_t subexp; + + assert(! JU_DCDNOTMATCHINDEX(Index, Pjp, 2)); + assert(ParentLevel > 1); // see above. 
+ + if ((PjlbRaw = j__udyAllocJLB1(Pjpm)) == (Pjlb_t) NULL) + return(-1); + Pjlb = P_JLB(PjlbRaw); + +// Fully populate the leaf, then unset Indexs bit: + + for (subexp = 0; subexp < cJU_NUMSUBEXPL; ++subexp) + JU_JLB_BITMAP(Pjlb, subexp) = cJU_FULLBITMAPL; + + JU_BITMAPCLEARL(Pjlb, Index); + + Pjp->jp_Addr = (Word_t) PjlbRaw; + Pjp->jp_Type = cJU_JPLEAF_B1; + + return(1); + } +#endif // JUDY1 + + +// **************************************************************************** +// IMMEDIATE JP: +// +// If theres just the one Index in the Immed, convert the JP to a JPNULL* +// (should only happen in a BranchU); otherwise delete the Index from the +// Immed. See the state transitions table elsewhere in this file for a summary +// of which Immed types must be handled. Hysteresis = 0; none is possible with +// Immeds. +// +// MACROS FOR COMMON CODE: +// +// Single Index remains in cJU_JPIMMED_*_01; convert JP to null: +// +// Variables Pjp and parentJPtype are in the context. +// +// Note: cJU_JPIMMED_*_01 should only be encountered in BranchUs, not in +// BranchLs or BranchBs (where its improper to merely modify the JP to be a +// null JP); that is, BranchL and BranchB code should have already handled +// any cJU_JPIMMED_*_01 by different means. + +#define JU_IMMED_01(NewJPType,ParentJPType) \ + \ + assert(parentJPtype == (ParentJPType)); \ + assert(JU_JPDCDPOP0(Pjp) == JU_TRIMTODCDSIZE(Index)); \ + JU_JPSETADT(Pjp, 0, 0, NewJPType); \ + return(1) + +// Convert cJ*_JPIMMED_*_02 to cJU_JPIMMED_*_01: +// +// Move the undeleted Index, whichever does not match the least bytes of Index, +// from undecoded-bytes-only (in jp_1Index or jp_LIndex as appropriate) to +// jp_DcdPopO (full-field). Pjp, Index, and offset are in the context. + +#define JU_IMMED_02(cIS,LeafType,NewJPType) \ + { \ + LeafType Pleaf; \ + \ + assert((ParentLevel - 1) == (cIS)); \ + JUDY1CODE(Pleaf = (LeafType) (Pjp->jp_1Index);) \ + JUDYLCODE(Pleaf = (LeafType) (Pjp->jp_LIndex);) \ + JUDYLCODE(PjvRaw = (Pjv_t) (Pjp->jp_Addr);) \ + JUDYLCODE(Pjv = P_JV(PjvRaw);) \ + JU_TOIMMED_01_EVEN(cIS, ignore, ignore); \ + JUDYLCODE(j__udyLFreeJV(PjvRaw, 2, Pjpm);) \ + Pjp->jp_Type = (NewJPType); \ + return(1); \ + } + +#if (defined(JUDY1) || defined(JU_64BIT)) + +// Variation for "odd" cJ*_JPIMMED_*_02 JP types, which are very different from +// "even" types because they use leaf search code and odd-copy macros: +// +// Note: JudyL 32-bit has no "odd" JPIMMED_*_02 types. + +#define JU_IMMED_02_ODD(cIS,NewJPType,SearchLeaf,CopyPIndex) \ + { \ + uint8_t * Pleaf; \ + \ + assert((ParentLevel - 1) == (cIS)); \ + JUDY1CODE(Pleaf = (uint8_t *) (Pjp->jp_1Index);) \ + JUDYLCODE(Pleaf = (uint8_t *) (Pjp->jp_LIndex);) \ + JUDYLCODE(PjvRaw = (Pjv_t) (Pjp->jp_Addr);) \ + JUDYLCODE(Pjv = P_JV(PjvRaw);) \ + JU_TOIMMED_01_ODD(cIS, SearchLeaf, CopyPIndex); \ + JUDYLCODE(j__udyLFreeJV(PjvRaw, 2, Pjpm);) \ + Pjp->jp_Type = (NewJPType); \ + return(1); \ + } +#endif // (JUDY1 || JU_64BIT) + +// Core code for deleting one Index (and for JudyL, its value area) from a +// larger Immed: +// +// Variables Pleaf, pop1, and offset are in the context. 
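+
+// (For illustration only: the DeleteInPlace step amounts to closing the gap
+// over the deleted slot; roughly, for cIS-byte indexes and a byte-typed
+// Pleaf -- a sketch, not the real macro:
+//
+//	memmove(Pleaf + (offset * cIS),
+//	        Pleaf + ((offset + 1) * cIS),
+//	        (pop1 - offset - 1) * cIS);
+// )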
+ +#ifdef JUDY1 +#define JU_IMMED_DEL(cIS,DeleteInPlace) \ + DeleteInPlace(Pleaf, pop1, offset, cIS); \ + DBGCODE(JudyCheckSorted(Pleaf, pop1 - 1, cIS);) + +#else // JUDYL + +// For JudyL the value area might need to be shrunk: + +#define JU_IMMED_DEL(cIS,DeleteInPlace) \ + \ + if (JL_LEAFVGROWINPLACE(pop1 - 1)) /* hysteresis = 0 */ \ + { \ + DeleteInPlace( Pleaf, pop1, offset, cIS); \ + JU_DELETEINPLACE(Pjv, pop1, offset, ignore); \ + DBGCODE(JudyCheckSorted(Pleaf, pop1 - 1, cIS);) \ + } \ + else \ + { \ + Pjv_t PjvnewRaw; \ + Pjv_t Pjvnew; \ + \ + if ((PjvnewRaw = j__udyLAllocJV(pop1 - 1, Pjpm)) \ + == (Pjv_t) NULL) return(-1); \ + Pjvnew = P_JV(PjvnewRaw); \ + \ + DeleteInPlace(Pleaf, pop1, offset, cIS); \ + JU_DELETECOPY(Pjvnew, Pjv, pop1, offset, ignore); \ + DBGCODE(JudyCheckSorted(Pleaf, pop1 - 1, cIS);) \ + j__udyLFreeJV(PjvRaw, pop1, Pjpm); \ + \ + (Pjp->jp_Addr) = (Word_t) PjvnewRaw; \ + } +#endif // JUDYL + +// Delete one Index from a larger Immed where no restructuring is required: +// +// Variables pop1, Pjp, offset, and Index are in the context. + +#define JU_IMMED(cIS,LeafType,BaseJPType,SearchLeaf,DeleteInPlace) \ + { \ + LeafType Pleaf; \ + \ + assert((ParentLevel - 1) == (cIS)); \ + JUDY1CODE(Pleaf = (LeafType) (Pjp->jp_1Index);) \ + JUDYLCODE(Pleaf = (LeafType) (Pjp->jp_LIndex);) \ + JUDYLCODE(PjvRaw = (Pjv_t) (Pjp->jp_Addr);) \ + JUDYLCODE(Pjv = P_JV(PjvRaw);) \ + pop1 = (JU_JPTYPE(Pjp)) - (BaseJPType) + 2; \ + offset = SearchLeaf(Pleaf, pop1, Index); \ + assert(offset >= 0); /* Index must be valid */ \ + \ + JU_IMMED_DEL(cIS, DeleteInPlace); \ + --(Pjp->jp_Type); \ + return(1); \ + } + + +// END OF MACROS, START OF CASES: + +// Single Index remains in Immed; convert JP to null: + + case cJU_JPIMMED_1_01: JU_IMMED_01(cJU_JPNULL1, cJU_JPBRANCH_U2); + case cJU_JPIMMED_2_01: JU_IMMED_01(cJU_JPNULL2, cJU_JPBRANCH_U3); +#ifndef JU_64BIT + case cJU_JPIMMED_3_01: JU_IMMED_01(cJU_JPNULL3, cJU_JPBRANCH_U); +#else + case cJU_JPIMMED_3_01: JU_IMMED_01(cJU_JPNULL3, cJU_JPBRANCH_U4); + case cJU_JPIMMED_4_01: JU_IMMED_01(cJU_JPNULL4, cJU_JPBRANCH_U5); + case cJU_JPIMMED_5_01: JU_IMMED_01(cJU_JPNULL5, cJU_JPBRANCH_U6); + case cJU_JPIMMED_6_01: JU_IMMED_01(cJU_JPNULL6, cJU_JPBRANCH_U7); + case cJU_JPIMMED_7_01: JU_IMMED_01(cJU_JPNULL7, cJU_JPBRANCH_U); +#endif + +// Multiple Indexes remain in the Immed JP; delete the specified Index: + + case cJU_JPIMMED_1_02: + + JU_IMMED_02(1, uint8_t *, cJU_JPIMMED_1_01); + + case cJU_JPIMMED_1_03: +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_1_04: + case cJU_JPIMMED_1_05: + case cJU_JPIMMED_1_06: + case cJU_JPIMMED_1_07: +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_1_08: + case cJ1_JPIMMED_1_09: + case cJ1_JPIMMED_1_10: + case cJ1_JPIMMED_1_11: + case cJ1_JPIMMED_1_12: + case cJ1_JPIMMED_1_13: + case cJ1_JPIMMED_1_14: + case cJ1_JPIMMED_1_15: +#endif + JU_IMMED(1, uint8_t *, cJU_JPIMMED_1_02, + j__udySearchLeaf1, JU_DELETEINPLACE); + +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_2_02: + + JU_IMMED_02(2, uint16_t *, cJU_JPIMMED_2_01); + + case cJU_JPIMMED_2_03: +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_2_04: + case cJ1_JPIMMED_2_05: + case cJ1_JPIMMED_2_06: + case cJ1_JPIMMED_2_07: +#endif +#if (defined(JUDY1) || defined(JU_64BIT)) + JU_IMMED(2, uint16_t *, cJU_JPIMMED_2_02, + j__udySearchLeaf2, JU_DELETEINPLACE); + + case cJU_JPIMMED_3_02: + + JU_IMMED_02_ODD(3, cJU_JPIMMED_3_01, + j__udySearchLeaf3, JU_COPY3_PINDEX_TO_LONG); + +#endif + +#if (defined(JUDY1) && 
defined(JU_64BIT)) + case cJ1_JPIMMED_3_03: + case cJ1_JPIMMED_3_04: + case cJ1_JPIMMED_3_05: + + JU_IMMED(3, uint8_t *, cJU_JPIMMED_3_02, + j__udySearchLeaf3, JU_DELETEINPLACE_ODD); + + case cJ1_JPIMMED_4_02: + + JU_IMMED_02(4, uint32_t *, cJU_JPIMMED_4_01); + + case cJ1_JPIMMED_4_03: + + JU_IMMED(4, uint32_t *, cJ1_JPIMMED_4_02, + j__udySearchLeaf4, JU_DELETEINPLACE); + + case cJ1_JPIMMED_5_02: + + JU_IMMED_02_ODD(5, cJU_JPIMMED_5_01, + j__udySearchLeaf5, JU_COPY5_PINDEX_TO_LONG); + + case cJ1_JPIMMED_5_03: + + JU_IMMED(5, uint8_t *, cJ1_JPIMMED_5_02, + j__udySearchLeaf5, JU_DELETEINPLACE_ODD); + + case cJ1_JPIMMED_6_02: + + JU_IMMED_02_ODD(6, cJU_JPIMMED_6_01, + j__udySearchLeaf6, JU_COPY6_PINDEX_TO_LONG); + + case cJ1_JPIMMED_7_02: + + JU_IMMED_02_ODD(7, cJU_JPIMMED_7_01, + j__udySearchLeaf7, JU_COPY7_PINDEX_TO_LONG); + +#endif // (JUDY1 && JU_64BIT) + + +// **************************************************************************** +// INVALID JP TYPE: + + default: JU_SET_ERRNO_NONNULL(Pjpm, JU_ERRNO_CORRUPT); return(-1); + + } // switch + + +// PROCESS JP -- RECURSIVELY: +// +// For non-Immed JP types, if successful, post-decrement the population count +// at this level, or collapse a BranchL if necessary by copying the remaining +// JP in the BranchL to the parent (hysteresis = 0), which implicitly creates a +// narrow pointer if there was not already one in the hierarchy. + + assert(level); + retcode = j__udyDelWalk(Pjp, Index, level, Pjpm); + assert(retcode != 0); // should never happen. + + if ((JU_JPTYPE(Pjp)) < cJU_JPIMMED_1_01) // not an Immed. + { + switch (retcode) + { + case 1: + { + jp_t JP = *Pjp; + Word_t DcdP0; + + DcdP0 = JU_JPDCDPOP0(Pjp) - 1; // decrement count. + JU_JPSETADT(Pjp, JP.jp_Addr, DcdP0, JU_JPTYPE(&JP)); + break; + } + case 2: // collapse BranchL to single JP; see above: + { + Pjbl_t PjblRaw = (Pjbl_t) (Pjp->jp_Addr); + Pjbl_t Pjbl = P_JBL(PjblRaw); + + *Pjp = Pjbl->jbl_jp[0]; + j__udyFreeJBL(PjblRaw, Pjpm); + retcode = 1; + } + } + } + + return(retcode); + +} // j__udyDelWalk() + + +// **************************************************************************** +// J U D Y 1 U N S E T +// J U D Y L D E L +// +// Main entry point. See the manual entry for details. + +#ifdef JUDY1 +FUNCTION int Judy1Unset +#else +FUNCTION int JudyLDel +#endif + ( + PPvoid_t PPArray, // in which to delete. + Word_t Index, // to delete. + PJError_t PJError // optional, for returning error info. + ) +{ + Word_t pop1; // population of leaf. + int offset; // at which to delete Index. + JUDY1CODE(int retcode;) // return code from Judy1Test(). +JUDYLCODE(PPvoid_t PPvalue;) // pointer from JudyLGet(). + + +// CHECK FOR NULL ARRAY POINTER (error by caller): + + if (PPArray == (PPvoid_t) NULL) + { + JU_SET_ERRNO(PJError, JU_ERRNO_NULLPPARRAY); + return(JERRI); + } + + +// CHECK IF INDEX IS INVALID: +// +// If so, theres nothing to do. This saves a lot of time. Pass through +// PJError, if any, from the "get" function. + +#ifdef JUDY1 + if ((retcode = Judy1Test(*PPArray, Index, PJError)) == JERRI) + return (JERRI); + + if (retcode == 0) return(0); +#else + if ((PPvalue = JudyLGet(*PPArray, Index, PJError)) == PPJERR) + return (JERRI); + + if (PPvalue == (PPvoid_t) NULL) return(0); +#endif + + +// **************************************************************************** +// PROCESS TOP LEVEL (LEAFW) BRANCHES AND LEAVES: + +// **************************************************************************** +// LEAFW LEAF, OTHER SIZE: +// +// Shrink or convert the leaf as necessary. 
Hysteresis = 0; none is possible. + + if (JU_LEAFW_POP0(*PPArray) < cJU_LEAFW_MAXPOP1) // must be a LEAFW + { + JUDYLCODE(Pjv_t Pjv;) // current value area. + JUDYLCODE(Pjv_t Pjvnew;) // value area in new leaf. + Pjlw_t Pjlw = P_JLW(*PPArray); // first word of leaf. + Pjlw_t Pjlwnew; // replacement leaf. + pop1 = Pjlw[0] + 1; // first word of leaf is pop0. + +// Delete single (last) Index from array: + + if (pop1 == 1) + { + j__udyFreeJLW(Pjlw, /* pop1 = */ 1, (Pjpm_t) NULL); + *PPArray = (Pvoid_t) NULL; + return(1); + } + +// Locate Index in compressible leaf: + + offset = j__udySearchLeafW(Pjlw + 1, pop1, Index); + assert(offset >= 0); // Index must be valid. + + JUDYLCODE(Pjv = JL_LEAFWVALUEAREA(Pjlw, pop1);) + +// Delete Index in-place: +// +// Note: "Grow in place from pop1 - 1" is the logical inverse of, "shrink in +// place from pop1." Also, Pjlw points to the count word, so skip that for +// doing the deletion. + + if (JU_LEAFWGROWINPLACE(pop1 - 1)) + { + JU_DELETEINPLACE(Pjlw + 1, pop1, offset, ignore); +#ifdef JUDYL // also delete from value area: + JU_DELETEINPLACE(Pjv, pop1, offset, ignore); +#endif + DBGCODE(JudyCheckSorted((Pjll_t) (Pjlw + 1), pop1 - 1, + cJU_ROOTSTATE);) + --(Pjlw[0]); // decrement population. + DBGCODE(JudyCheckPop(*PPArray);) + return(1); + } + +// Allocate new leaf for use in either case below: + + Pjlwnew = j__udyAllocJLW(pop1 - 1); + JU_CHECKALLOC(Pjlw_t, Pjlwnew, JERRI); + +// Shrink to smaller LEAFW: +// +// Note: Skip the first word = pop0 in each leaf. + + Pjlwnew[0] = (pop1 - 1) - 1; + JU_DELETECOPY(Pjlwnew + 1, Pjlw + 1, pop1, offset, ignore); + +#ifdef JUDYL // also delete from value area: + Pjvnew = JL_LEAFWVALUEAREA(Pjlwnew, pop1 - 1); + JU_DELETECOPY(Pjvnew, Pjv, pop1, offset, ignore); +#endif + DBGCODE(JudyCheckSorted(Pjlwnew + 1, pop1 - 1, cJU_ROOTSTATE);) + + j__udyFreeJLW(Pjlw, pop1, (Pjpm_t) NULL); + +//// *PPArray = (Pvoid_t) Pjlwnew | cJU_LEAFW); + *PPArray = (Pvoid_t) Pjlwnew; + DBGCODE(JudyCheckPop(*PPArray);) + return(1); + + } + else + + +// **************************************************************************** +// JRP BRANCH: +// +// Traverse through the JPM to do the deletion unless the population is small +// enough to convert immediately to a LEAFW. + + { + Pjpm_t Pjpm; + Pjp_t Pjp; // top-level JP to process. + Word_t digit; // in a branch. + JUDYLCODE(Pjv_t Pjv;) // to value area. + Pjlw_t Pjlwnew; // replacement leaf. + DBGCODE(Pjlw_t Pjlwnew_orig;) + + Pjpm = P_JPM(*PPArray); // top object in array (tree). + Pjp = &(Pjpm->jpm_JP); // next object (first branch or leaf). + + assert(((Pjpm->jpm_JP.jp_Type) == cJU_JPBRANCH_L) + || ((Pjpm->jpm_JP.jp_Type) == cJU_JPBRANCH_B) + || ((Pjpm->jpm_JP.jp_Type) == cJU_JPBRANCH_U)); + +// WALK THE TREE +// +// Note: Recursive code in j__udyDelWalk() knows how to collapse a lower-level +// BranchL containing a single JP into the parent JP as a narrow pointer, but +// the code here cant do that for a top-level BranchL. The result can be +// PArray -> JPM -> BranchL containing a single JP. This situation is +// unavoidable because a JPM cannot contain a narrow pointer; the BranchL is +// required in order to hold the top digit decoded, and it does not collapse to +// a LEAFW until the population is low enough. +// +// TBD: Should we add a topdigit field to JPMs so they can hold narrow +// pointers? + + if (j__udyDelWalk(Pjp, Index, cJU_ROOTSTATE, Pjpm) == -1) + { + JU_COPY_ERRNO(PJError, Pjpm); + return(JERRI); + } + + --(Pjpm->jpm_Pop0); // success; decrement total population. 
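+
+// Note: The test below succeeds on all but one deletion: only when the total
+// population has dropped exactly to cJU_LEAFW_MAXPOP1 (the most indexes a
+// root-level LEAFW can hold) does the walk continue on to collapse the
+// branch tree back into a single LEAFW.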
+ + if ((Pjpm->jpm_Pop0 + 1) != cJU_LEAFW_MAXPOP1) + { + DBGCODE(JudyCheckPop(*PPArray);) + return(1); + } + +// COMPRESS A BRANCH[LBU] TO A LEAFW: +// + Pjlwnew = j__udyAllocJLW(cJU_LEAFW_MAXPOP1); + JU_CHECKALLOC(Pjlw_t, Pjlwnew, JERRI); + +// Plug leaf into root pointer and set population count: + +//// *PPArray = (Pvoid_t) ((Word_t) Pjlwnew | cJU_LEAFW); + *PPArray = (Pvoid_t) Pjlwnew; +#ifdef JUDYL // prepare value area: + Pjv = JL_LEAFWVALUEAREA(Pjlwnew, cJU_LEAFW_MAXPOP1); +#endif + *Pjlwnew++ = cJU_LEAFW_MAXPOP1 - 1; // set pop0. + DBGCODE(Pjlwnew_orig = Pjlwnew;) + + switch (JU_JPTYPE(Pjp)) + { + +// JPBRANCH_L: Copy each JPs indexes to the new LEAFW and free the old +// branch: + + case cJU_JPBRANCH_L: + { + Pjbl_t PjblRaw = (Pjbl_t) (Pjp->jp_Addr); + Pjbl_t Pjbl = P_JBL(PjblRaw); + + for (offset = 0; offset < Pjbl->jbl_NumJPs; ++offset) + { + pop1 = j__udyLeafM1ToLeafW(Pjlwnew, JU_PVALUEPASS + (Pjbl->jbl_jp) + offset, + JU_DIGITTOSTATE(Pjbl->jbl_Expanse[offset], + cJU_BYTESPERWORD), + (Pvoid_t) Pjpm); + Pjlwnew += pop1; // advance through indexes. + JUDYLCODE(Pjv += pop1;) // advance through values. + } + j__udyFreeJBL(PjblRaw, Pjpm); + + assert(Pjlwnew == Pjlwnew_orig + cJU_LEAFW_MAXPOP1); + break; // delete Index from new LEAFW. + } + +// JPBRANCH_B: Copy each JPs indexes to the new LEAFW and free the old +// branch, including each JP subarray: + + case cJU_JPBRANCH_B: + { + Pjbb_t PjbbRaw = (Pjbb_t) (Pjp->jp_Addr); + Pjbb_t Pjbb = P_JBB(PjbbRaw); + Word_t subexp; // current subexpanse number. + BITMAPB_t bitmap; // portion for this subexpanse. + Pjp_t Pjp2Raw; // one subexpanses subarray. + Pjp_t Pjp2; + + for (subexp = 0; subexp < cJU_NUMSUBEXPB; ++subexp) + { + if ((bitmap = JU_JBB_BITMAP(Pjbb, subexp)) == 0) + continue; // skip empty subexpanse. + + digit = subexp * cJU_BITSPERSUBEXPB; + Pjp2Raw = JU_JBB_PJP(Pjbb, subexp); + Pjp2 = P_JP(Pjp2Raw); + assert(Pjp2 != (Pjp_t) NULL); + +// Walk through bits for all possible sub-subexpanses (digits); increment +// offset for each populated subexpanse; until no more set bits: + + for (offset = 0; bitmap != 0; bitmap >>= 1, ++digit) + { + if (! (bitmap & 1)) // skip empty sub-subexpanse. + continue; + + pop1 = j__udyLeafM1ToLeafW(Pjlwnew, JU_PVALUEPASS + Pjp2 + offset, + JU_DIGITTOSTATE(digit, cJU_BYTESPERWORD), + (Pvoid_t) Pjpm); + Pjlwnew += pop1; // advance through indexes. + JUDYLCODE(Pjv += pop1;) // advance through values. + ++offset; + } + j__udyFreeJBBJP(Pjp2Raw, /* pop1 = */ offset, Pjpm); + } + j__udyFreeJBB(PjbbRaw, Pjpm); + + assert(Pjlwnew == Pjlwnew_orig + cJU_LEAFW_MAXPOP1); + break; // delete Index from new LEAFW. + + } // case cJU_JPBRANCH_B. + + +// JPBRANCH_U: Copy each JPs indexes to the new LEAFW and free the old +// branch: + + case cJU_JPBRANCH_U: + { + Pjbu_t PjbuRaw = (Pjbu_t) (Pjp->jp_Addr); + Pjbu_t Pjbu = P_JBU(PjbuRaw); + Word_t ldigit; // larger than uint8_t. + + for (Pjp = Pjbu->jbu_jp, ldigit = 0; + ldigit < cJU_BRANCHUNUMJPS; + ++Pjp, ++ldigit) + { + +// Shortcuts, to save a little time for possibly big branches: + + if ((JU_JPTYPE(Pjp)) == cJU_JPNULLMAX) // skip null JP. + continue; + +// TBD: Should the following shortcut also be used in BranchL and BranchB +// code? + +#ifndef JU_64BIT + if ((JU_JPTYPE(Pjp)) == cJU_JPIMMED_3_01) +#else + if ((JU_JPTYPE(Pjp)) == cJU_JPIMMED_7_01) +#endif + { // single Immed: + *Pjlwnew++ = JU_DIGITTOSTATE(ldigit, cJU_BYTESPERWORD) + | JU_JPDCDPOP0(Pjp); // rebuild Index. +#ifdef JUDYL + *Pjv++ = Pjp->jp_Addr; // copy value area. 
+#endif + continue; + } + + pop1 = j__udyLeafM1ToLeafW(Pjlwnew, JU_PVALUEPASS + Pjp, JU_DIGITTOSTATE(ldigit, cJU_BYTESPERWORD), + (Pvoid_t) Pjpm); + Pjlwnew += pop1; // advance through indexes. + JUDYLCODE(Pjv += pop1;) // advance through values. + } + j__udyFreeJBU(PjbuRaw, Pjpm); + + assert(Pjlwnew == Pjlwnew_orig + cJU_LEAFW_MAXPOP1); + break; // delete Index from new LEAFW. + + } // case cJU_JPBRANCH_U. + + +// INVALID JP TYPE in jpm_t struct + + default: JU_SET_ERRNO_NONNULL(Pjpm, JU_ERRNO_CORRUPT); + return(JERRI); + + } // end switch on sub-JP type. + + DBGCODE(JudyCheckSorted((Pjll_t) Pjlwnew_orig, cJU_LEAFW_MAXPOP1, + cJU_ROOTSTATE);) + +// FREE JPM (no longer needed): + + j__udyFreeJPM(Pjpm, (Pjpm_t) NULL); + DBGCODE(JudyCheckPop(*PPArray);) + return(1); + + } + /*NOTREACHED*/ + +} // Judy1Unset() / JudyLDel() diff --git a/src/libnetdata/libjudy/src/JudyL/JudyLFirst.c b/src/libnetdata/libjudy/src/JudyL/JudyLFirst.c new file mode 100644 index 00000000..aaf6639c --- /dev/null +++ b/src/libnetdata/libjudy/src/JudyL/JudyLFirst.c @@ -0,0 +1,213 @@ +// Copyright (C) 2000 - 2002 Hewlett-Packard Company +// +// This program is free software; you can redistribute it and/or modify it +// under the term of the GNU Lesser General Public License as published by the +// Free Software Foundation; either version 2 of the License, or (at your +// option) any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License +// for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this program; if not, write to the Free Software Foundation, +// Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// _________________ + +// @(#) $Revision: 4.12 $ $Source: /judy/src/JudyCommon/JudyFirst.c $ +// +// Judy*First[Empty]() and Judy*Last[Empty]() routines for Judy1 and JudyL. +// Compile with one of -DJUDY1 or -DJUDYL. +// +// These are inclusive versions of Judy*Next[Empty]() and Judy*Prev[Empty](). + +#if (! (defined(JUDY1) || defined(JUDYL))) +#error: One of -DJUDY1 or -DJUDYL must be specified. +#endif + +#ifdef JUDY1 +#include "Judy1.h" +#else +#include "JudyL.h" +#endif + + +// **************************************************************************** +// J U D Y 1 F I R S T +// J U D Y L F I R S T +// +// See the manual entry for details. + +#ifdef JUDY1 +FUNCTION int Judy1First +#else +FUNCTION PPvoid_t JudyLFirst +#endif + ( + Pcvoid_t PArray, // Judy array to search. + Word_t * PIndex, // starting point and result. + PJError_t PJError // optional, for returning error info. + ) +{ + if (PIndex == (PWord_t) NULL) // caller error: + { + JU_SET_ERRNO(PJError, JU_ERRNO_NULLPINDEX); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + +#ifdef JUDY1 + switch (Judy1Test(PArray, *PIndex, PJError)) + { + case 1: return(1); // found *PIndex itself. + case 0: return(Judy1Next(PArray, PIndex, PJError)); + default: return(JERRI); + } +#else + { + PPvoid_t PValue; + + if ((PValue = JudyLGet(PArray, *PIndex, PJError)) == PPJERR) + return(PPJERR); + + if (PValue != (PPvoid_t) NULL) return(PValue); // found *PIndex. 
+ + return(JudyLNext(PArray, PIndex, PJError)); + } +#endif + +} // Judy1First() / JudyLFirst() + + +// **************************************************************************** +// J U D Y 1 L A S T +// J U D Y L L A S T +// +// See the manual entry for details. + +#ifdef JUDY1 +FUNCTION int Judy1Last( +#else +FUNCTION PPvoid_t JudyLLast( +#endif + Pcvoid_t PArray, // Judy array to search. + Word_t * PIndex, // starting point and result. + PJError_t PJError) // optional, for returning error info. +{ + if (PIndex == (PWord_t) NULL) + { + JU_SET_ERRNO(PJError, JU_ERRNO_NULLPINDEX); // caller error. + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + +#ifdef JUDY1 + switch (Judy1Test(PArray, *PIndex, PJError)) + { + case 1: return(1); // found *PIndex itself. + case 0: return(Judy1Prev(PArray, PIndex, PJError)); + default: return(JERRI); + } +#else + { + PPvoid_t PValue; + + if ((PValue = JudyLGet(PArray, *PIndex, PJError)) == PPJERR) + return(PPJERR); + + if (PValue != (PPvoid_t) NULL) return(PValue); // found *PIndex. + + return(JudyLPrev(PArray, PIndex, PJError)); + } +#endif + +} // Judy1Last() / JudyLLast() + + +// **************************************************************************** +// J U D Y 1 F I R S T E M P T Y +// J U D Y L F I R S T E M P T Y +// +// See the manual entry for details. + +#ifdef JUDY1 +FUNCTION int Judy1FirstEmpty( +#else +FUNCTION int JudyLFirstEmpty( +#endif + Pcvoid_t PArray, // Judy array to search. + Word_t * PIndex, // starting point and result. + PJError_t PJError) // optional, for returning error info. +{ + if (PIndex == (PWord_t) NULL) // caller error: + { + JU_SET_ERRNO(PJError, JU_ERRNO_NULLPINDEX); + return(JERRI); + } + +#ifdef JUDY1 + switch (Judy1Test(PArray, *PIndex, PJError)) + { + case 0: return(1); // found *PIndex itself. + case 1: return(Judy1NextEmpty(PArray, PIndex, PJError)); + default: return(JERRI); + } +#else + { + PPvoid_t PValue; + + if ((PValue = JudyLGet(PArray, *PIndex, PJError)) == PPJERR) + return(JERRI); + + if (PValue == (PPvoid_t) NULL) return(1); // found *PIndex. + + return(JudyLNextEmpty(PArray, PIndex, PJError)); + } +#endif + +} // Judy1FirstEmpty() / JudyLFirstEmpty() + + +// **************************************************************************** +// J U D Y 1 L A S T E M P T Y +// J U D Y L L A S T E M P T Y +// +// See the manual entry for details. + +#ifdef JUDY1 +FUNCTION int Judy1LastEmpty( +#else +FUNCTION int JudyLLastEmpty( +#endif + Pcvoid_t PArray, // Judy array to search. + Word_t * PIndex, // starting point and result. + PJError_t PJError) // optional, for returning error info. +{ + if (PIndex == (PWord_t) NULL) + { + JU_SET_ERRNO(PJError, JU_ERRNO_NULLPINDEX); // caller error. + return(JERRI); + } + +#ifdef JUDY1 + switch (Judy1Test(PArray, *PIndex, PJError)) + { + case 0: return(1); // found *PIndex itself. + case 1: return(Judy1PrevEmpty(PArray, PIndex, PJError)); + default: return(JERRI); + } +#else + { + PPvoid_t PValue; + + if ((PValue = JudyLGet(PArray, *PIndex, PJError)) == PPJERR) + return(JERRI); + + if (PValue == (PPvoid_t) NULL) return(1); // found *PIndex. 
+
+ return(JudyLPrevEmpty(PArray, PIndex, PJError));
+ }
+#endif
+
+} // Judy1LastEmpty() / JudyLLastEmpty()
diff --git a/src/libnetdata/libjudy/src/JudyL/JudyLFreeArray.c b/src/libnetdata/libjudy/src/JudyL/JudyLFreeArray.c
new file mode 100644
index 00000000..34fac509
--- /dev/null
+++ b/src/libnetdata/libjudy/src/JudyL/JudyLFreeArray.c
@@ -0,0 +1,363 @@
+// Copyright (C) 2000 - 2002 Hewlett-Packard Company
+//
+// This program is free software; you can redistribute it and/or modify it
+// under the term of the GNU Lesser General Public License as published by the
+// Free Software Foundation; either version 2 of the License, or (at your
+// option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+// for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with this program; if not, write to the Free Software Foundation,
+// Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+// _________________
+
+// @(#) $Revision: 4.51 $ $Source: /judy/src/JudyCommon/JudyFreeArray.c $
+//
+// Judy1FreeArray() and JudyLFreeArray() functions for Judy1 and JudyL.
+// Compile with one of -DJUDY1 or -DJUDYL.
+// Return the number of bytes freed from the array.
+
+#if (! (defined(JUDY1) || defined(JUDYL)))
+#error: One of -DJUDY1 or -DJUDYL must be specified.
+#endif
+
+#ifdef JUDY1
+#include "Judy1.h"
+#else
+#include "JudyL.h"
+#endif
+
+#include "JudyPrivate1L.h"
+
+DBGCODE(extern void JudyCheckPop(Pvoid_t PArray);)
+
+
+// ****************************************************************************
+// J U D Y 1 F R E E A R R A Y
+// J U D Y L F R E E A R R A Y
+//
+// See the Judy*(3C) manual entry for details.
+//
+// This code is written recursively, at least at first, because thats much
+// simpler. Hope its fast enough.
+
+#ifdef JUDY1
+FUNCTION Word_t Judy1FreeArray
+#else
+FUNCTION Word_t JudyLFreeArray
+#endif
+ (
+ PPvoid_t PPArray, // array to free.
+ PJError_t PJError // optional, for returning error info.
+ )
+{
+ jpm_t jpm; // local to accumulate free statistics.
+
+// CHECK FOR NULL POINTER (error by caller):
+
+ if (PPArray == (PPvoid_t) NULL)
+ {
+ JU_SET_ERRNO(PJError, JU_ERRNO_NULLPPARRAY);
+ return(JERR);
+ }
+
+ DBGCODE(JudyCheckPop(*PPArray);)
+
+// Zero jpm.jpm_Pop0 (meaning the array will be empty in a moment) for accurate
+// logging in TRACEMI2.
+
+ jpm.jpm_Pop0 = 0; // see above.
+ jpm.jpm_TotalMemWords = 0; // initialize memory freed.
+
+// Empty array:
+
+ if (P_JLW(*PPArray) == (Pjlw_t) NULL) return(0);
+
+// PROCESS TOP LEVEL "JRP" BRANCHES AND LEAF:
+
+ if (JU_LEAFW_POP0(*PPArray) < cJU_LEAFW_MAXPOP1) // must be a LEAFW
+ {
+ Pjlw_t Pjlw = P_JLW(*PPArray); // first word of leaf.
+
+ j__udyFreeJLW(Pjlw, Pjlw[0] + 1, &jpm);
+ *PPArray = (Pvoid_t) NULL; // make an empty array.
+ return (-(jpm.jpm_TotalMemWords * cJU_BYTESPERWORD)); // see above.
+ }
+ else
+
+// Rootstate leaves: just free the leaf:
+
+// Common code for returning the amount of memory freed.
+//
+// Note: In an ordinary LEAFW, pop0 = *PPArray[0].
+//
+// Accumulate (negative) words freed, while freeing objects.
+// Return the positive bytes freed.
+
+ {
+ Pjpm_t Pjpm = P_JPM(*PPArray);
+ Word_t TotalMem = Pjpm->jpm_TotalMemWords;
+
+ j__udyFreeSM(&(Pjpm->jpm_JP), &jpm); // recurse through tree.
+
+ j__udyFreeJPM(Pjpm, &jpm);
+
+// Verify the array was not corrupt. This means that amount of memory freed
+// (which is negative) is equal to the initial amount:
+
+ if (TotalMem + jpm.jpm_TotalMemWords)
+ {
+ JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT);
+ return(JERR);
+ }
+
+ *PPArray = (Pvoid_t) NULL; // make an empty array.
+ return (TotalMem * cJU_BYTESPERWORD);
+ }
+
+} // Judy1FreeArray() / JudyLFreeArray()
+
+
+// ****************************************************************************
+// __ J U D Y F R E E S M
+//
+// Given a pointer to a JP, recursively visit and free (depth first) all nodes
+// in a Judy array BELOW the JP, but not the JP itself. Accumulate in *Pjpm
+// the total words freed (as a negative value). "SM" = State Machine.
+//
+// Note: Corruption is not detected at this level because during a FreeArray,
+// if the code hasnt already core dumped, its better to remain silent, even
+// if some memory has not been freed, than to bother the caller about the
+// corruption. TBD: Is this true? If not, must list all legitimate JPNULL
+// and JPIMMED above first, and revert to returning bool_t (see 4.34).
+
+FUNCTION void j__udyFreeSM(
+ Pjp_t Pjp, // top of Judy (top-state).
+ Pjpm_t Pjpm) // to return words freed.
+{
+ Word_t Pop1;
+
+ switch (JU_JPTYPE(Pjp))
+ {
+
+#ifdef JUDY1
+
+// FULL EXPANSE -- nothing to free for this jp_Type.
+
+ case cJ1_JPFULLPOPU1:
+ break;
+#endif
+
+// JUDY BRANCH -- free the sub-tree depth first:
+
+// LINEAR BRANCH -- visit each JP in the JBLs list, then free the JBL:
+//
+// Note: There are no null JPs in a JBL.
+
+ case cJU_JPBRANCH_L:
+ case cJU_JPBRANCH_L2:
+ case cJU_JPBRANCH_L3:
+#ifdef JU_64BIT
+ case cJU_JPBRANCH_L4:
+ case cJU_JPBRANCH_L5:
+ case cJU_JPBRANCH_L6:
+ case cJU_JPBRANCH_L7:
+#endif // JU_64BIT
+ {
+ Pjbl_t Pjbl = P_JBL(Pjp->jp_Addr);
+ Word_t offset;
+
+ for (offset = 0; offset < Pjbl->jbl_NumJPs; ++offset)
+ j__udyFreeSM((Pjbl->jbl_jp) + offset, Pjpm);
+
+ j__udyFreeJBL((Pjbl_t) (Pjp->jp_Addr), Pjpm);
+ break;
+ }
+
+
+// BITMAP BRANCH -- visit each JP in the JBBs list based on the bitmap, also
+// free each subexpanse subarray of JPs, then free the JBB itself:
+//
+// Note: There are no null JPs in a JBB.
+
+ case cJU_JPBRANCH_B:
+ case cJU_JPBRANCH_B2:
+ case cJU_JPBRANCH_B3:
+#ifdef JU_64BIT
+ case cJU_JPBRANCH_B4:
+ case cJU_JPBRANCH_B5:
+ case cJU_JPBRANCH_B6:
+ case cJU_JPBRANCH_B7:
+#endif // JU_64BIT
+ {
+ Word_t subexp;
+ Word_t offset;
+ Word_t jpcount;
+
+ Pjbb_t Pjbb = P_JBB(Pjp->jp_Addr);
+
+ for (subexp = 0; subexp < cJU_NUMSUBEXPB; ++subexp)
+ {
+ jpcount = j__udyCountBitsB(JU_JBB_BITMAP(Pjbb, subexp));
+
+ if (jpcount)
+ {
+ for (offset = 0; offset < jpcount; ++offset)
+ {
+ j__udyFreeSM(P_JP(JU_JBB_PJP(Pjbb, subexp)) + offset,
+ Pjpm);
+ }
+ j__udyFreeJBBJP(JU_JBB_PJP(Pjbb, subexp), jpcount, Pjpm);
+ }
+ }
+ j__udyFreeJBB((Pjbb_t) (Pjp->jp_Addr), Pjpm);
+
+ break;
+ }
+
+
+// UNCOMPRESSED BRANCH -- visit each JP in the JBU array, then free the JBU
+// itself:
+//
+// Note: Null JPs are handled during recursion at a lower state.
+
+ case cJU_JPBRANCH_U:
+ case cJU_JPBRANCH_U2:
+ case cJU_JPBRANCH_U3:
+#ifdef JU_64BIT
+ case cJU_JPBRANCH_U4:
+ case cJU_JPBRANCH_U5:
+ case cJU_JPBRANCH_U6:
+ case cJU_JPBRANCH_U7:
+#endif // JU_64BIT
+ {
+ Word_t offset;
+ Pjbu_t Pjbu = P_JBU(Pjp->jp_Addr);
+
+ for (offset = 0; offset < cJU_BRANCHUNUMJPS; ++offset)
+ j__udyFreeSM((Pjbu->jbu_jp) + offset, Pjpm);
+
+ j__udyFreeJBU((Pjbu_t) (Pjp->jp_Addr), Pjpm);
+ break;
+ }
+
+
+// -- Cases below here terminate and do not recurse.
-- + + +// LINEAR LEAF -- just free the leaf; size is computed from jp_Type: +// +// Note: cJU_JPLEAF1 is a special case, see discussion in ../Judy1/Judy1.h + +#if (defined(JUDYL) || (! defined(JU_64BIT))) + case cJU_JPLEAF1: + Pop1 = JU_JPLEAF_POP0(Pjp) + 1; + j__udyFreeJLL1((Pjll_t) (Pjp->jp_Addr), Pop1, Pjpm); + break; +#endif + + case cJU_JPLEAF2: + Pop1 = JU_JPLEAF_POP0(Pjp) + 1; + j__udyFreeJLL2((Pjll_t) (Pjp->jp_Addr), Pop1, Pjpm); + break; + + case cJU_JPLEAF3: + Pop1 = JU_JPLEAF_POP0(Pjp) + 1; + j__udyFreeJLL3((Pjll_t) (Pjp->jp_Addr), Pop1, Pjpm); + break; + +#ifdef JU_64BIT + case cJU_JPLEAF4: + Pop1 = JU_JPLEAF_POP0(Pjp) + 1; + j__udyFreeJLL4((Pjll_t) (Pjp->jp_Addr), Pop1, Pjpm); + break; + + case cJU_JPLEAF5: + Pop1 = JU_JPLEAF_POP0(Pjp) + 1; + j__udyFreeJLL5((Pjll_t) (Pjp->jp_Addr), Pop1, Pjpm); + break; + + case cJU_JPLEAF6: + Pop1 = JU_JPLEAF_POP0(Pjp) + 1; + j__udyFreeJLL6((Pjll_t) (Pjp->jp_Addr), Pop1, Pjpm); + break; + + case cJU_JPLEAF7: + Pop1 = JU_JPLEAF_POP0(Pjp) + 1; + j__udyFreeJLL7((Pjll_t) (Pjp->jp_Addr), Pop1, Pjpm); + break; +#endif // JU_64BIT + + +// BITMAP LEAF -- free sub-expanse arrays of JPs, then free the JBB. + + case cJU_JPLEAF_B1: + { +#ifdef JUDYL + Word_t subexp; + Word_t jpcount; + Pjlb_t Pjlb = P_JLB(Pjp->jp_Addr); + +// Free the value areas in the bitmap leaf: + + for (subexp = 0; subexp < cJU_NUMSUBEXPL; ++subexp) + { + jpcount = j__udyCountBitsL(JU_JLB_BITMAP(Pjlb, subexp)); + + if (jpcount) + j__udyLFreeJV(JL_JLB_PVALUE(Pjlb, subexp), jpcount, Pjpm); + } +#endif // JUDYL + + j__udyFreeJLB1((Pjlb_t) (Pjp->jp_Addr), Pjpm); + break; + + } // case cJU_JPLEAF_B1 + +#ifdef JUDYL + + +// IMMED*: +// +// For JUDYL, all non JPIMMED_*_01s have a LeafV which must be freed: + + case cJU_JPIMMED_1_02: + case cJU_JPIMMED_1_03: +#ifdef JU_64BIT + case cJU_JPIMMED_1_04: + case cJU_JPIMMED_1_05: + case cJU_JPIMMED_1_06: + case cJU_JPIMMED_1_07: +#endif + Pop1 = JU_JPTYPE(Pjp) - cJU_JPIMMED_1_02 + 2; + j__udyLFreeJV((Pjv_t) (Pjp->jp_Addr), Pop1, Pjpm); + break; + +#ifdef JU_64BIT + case cJU_JPIMMED_2_02: + case cJU_JPIMMED_2_03: + + Pop1 = JU_JPTYPE(Pjp) - cJU_JPIMMED_2_02 + 2; + j__udyLFreeJV((Pjv_t) (Pjp->jp_Addr), Pop1, Pjpm); + break; + + case cJU_JPIMMED_3_02: + j__udyLFreeJV((Pjv_t) (Pjp->jp_Addr), 2, Pjpm); + break; + +#endif // JU_64BIT +#endif // JUDYL + + +// OTHER JPNULL, JPIMMED, OR UNEXPECTED TYPE -- nothing to free for this type: +// +// Note: Lump together no-op and invalid JP types; see function header +// comments. + + default: break; + + } // switch (JU_JPTYPE(Pjp)) + +} // j__udyFreeSM() diff --git a/src/libnetdata/libjudy/src/JudyL/JudyLGet.c b/src/libnetdata/libjudy/src/JudyL/JudyLGet.c new file mode 100644 index 00000000..e6853939 --- /dev/null +++ b/src/libnetdata/libjudy/src/JudyL/JudyLGet.c @@ -0,0 +1,1096 @@ +// Copyright (C) 2000 - 2002 Hewlett-Packard Company +// +// This program is free software; you can redistribute it and/or modify it +// under the term of the GNU Lesser General Public License as published by the +// Free Software Foundation; either version 2 of the License, or (at your +// option) any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License +// for more details. 
+// +// You should have received a copy of the GNU Lesser General Public License +// along with this program; if not, write to the Free Software Foundation, +// Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// _________________ + +// @(#) $Revision: 4.43 $ $Source: /judy/src/JudyCommon/JudyGet.c $ +// +// Judy1Test() and JudyLGet() functions for Judy1 and JudyL. +// Compile with one of -DJUDY1 or -DJUDYL. + +#if (! (defined(JUDY1) || defined(JUDYL))) +#error: One of -DJUDY1 or -DJUDYL must be specified. +#endif + +#ifdef JUDY1 +#include "Judy1.h" +#else +#include "JudyL.h" +#endif + +#include "JudyPrivate1L.h" + +#ifdef TRACEJPR // different macro name, for "retrieval" only. +#include "JudyPrintJP.c" +#endif + + +// **************************************************************************** +// J U D Y 1 T E S T +// J U D Y L G E T +// +// See the manual entry for details. Note support for "shortcut" entries to +// trees known to start with a JPM. + +__attribute__((no_sanitize("shift"))) + +#ifdef JUDY1 + +#ifdef JUDYGETINLINE +FUNCTION int j__udy1Test +#else +FUNCTION int Judy1Test +#endif + +#else // JUDYL + +#ifdef JUDYGETINLINE +FUNCTION PPvoid_t j__udyLGet +#else +FUNCTION PPvoid_t JudyLGet +#endif + +#endif // JUDYL + ( +#ifdef JUDYGETINLINE + Pvoid_t PArray, // from which to retrieve. + Word_t Index // to retrieve. +#else + Pcvoid_t PArray, // from which to retrieve. + Word_t Index, // to retrieve. + PJError_t PJError // optional, for returning error info. +#endif + ) +{ + Pjp_t Pjp; // current JP while walking the tree. + Pjpm_t Pjpm; // for global accounting. + uint8_t Digit; // byte just decoded from Index. + Word_t Pop1; // leaf population (number of indexes). + Pjll_t Pjll; // pointer to LeafL. + DBGCODE(uint8_t ParentJPType;) + +#ifndef JUDYGETINLINE + + if (PArray == (Pcvoid_t) NULL) // empty array. + { + JUDY1CODE(return(0);) + JUDYLCODE(return((PPvoid_t) NULL);) + } + +// **************************************************************************** +// PROCESS TOP LEVEL BRANCHES AND LEAF: + + if (JU_LEAFW_POP0(PArray) < cJU_LEAFW_MAXPOP1) // must be a LEAFW + { + Pjlw_t Pjlw = P_JLW(PArray); // first word of leaf. + int posidx; // signed offset in leaf. + + Pop1 = Pjlw[0] + 1; + posidx = j__udySearchLeafW(Pjlw + 1, Pop1, Index); + + if (posidx >= 0) + { + JUDY1CODE(return(1);) + JUDYLCODE(return((PPvoid_t) (JL_LEAFWVALUEAREA(Pjlw, Pop1) + posidx));) + } + JUDY1CODE(return(0);) + JUDYLCODE(return((PPvoid_t) NULL);) + } + +#endif // ! JUDYGETINLINE + + Pjpm = P_JPM(PArray); + Pjp = &(Pjpm->jpm_JP); // top branch is below JPM. + +// **************************************************************************** +// WALK THE JUDY TREE USING A STATE MACHINE: + +ContinueWalk: // for going down one level; come here with Pjp set. + +#ifdef TRACEJPR + JudyPrintJP(Pjp, "g", __LINE__); +#endif + switch (JU_JPTYPE(Pjp)) + { + +// Ensure the switch table starts at 0 for speed; otherwise more code is +// executed: + + case 0: goto ReturnCorrupt; // save a little code. + + +// **************************************************************************** +// JPNULL*: +// +// Note: These are legitimate in a BranchU (only) and do not constitute a +// fault. 
+ + case cJU_JPNULL1: + case cJU_JPNULL2: + case cJU_JPNULL3: +#ifdef JU_64BIT + case cJU_JPNULL4: + case cJU_JPNULL5: + case cJU_JPNULL6: + case cJU_JPNULL7: +#endif + assert(ParentJPType >= cJU_JPBRANCH_U2); + assert(ParentJPType <= cJU_JPBRANCH_U); + JUDY1CODE(return(0);) + JUDYLCODE(return((PPvoid_t) NULL);) + + +// **************************************************************************** +// JPBRANCH_L*: +// +// Note: The use of JU_DCDNOTMATCHINDEX() in branches is not strictly +// required,since this can be done at leaf level, but it costs nothing to do it +// sooner, and it aborts an unnecessary traversal sooner. + + case cJU_JPBRANCH_L2: + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 2)) break; + Digit = JU_DIGITATSTATE(Index, 2); + goto JudyBranchL; + + case cJU_JPBRANCH_L3: + +#ifdef JU_64BIT // otherwise its a no-op: + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 3)) break; +#endif + Digit = JU_DIGITATSTATE(Index, 3); + goto JudyBranchL; + +#ifdef JU_64BIT + case cJU_JPBRANCH_L4: + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 4)) break; + Digit = JU_DIGITATSTATE(Index, 4); + goto JudyBranchL; + + case cJU_JPBRANCH_L5: + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 5)) break; + Digit = JU_DIGITATSTATE(Index, 5); + goto JudyBranchL; + + case cJU_JPBRANCH_L6: + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 6)) break; + Digit = JU_DIGITATSTATE(Index, 6); + goto JudyBranchL; + + case cJU_JPBRANCH_L7: + + // JU_DCDNOTMATCHINDEX() would be a no-op. + Digit = JU_DIGITATSTATE(Index, 7); + goto JudyBranchL; + +#endif // JU_64BIT + + case cJU_JPBRANCH_L: + { + Pjbl_t Pjbl; + int posidx; + + Digit = JU_DIGITATSTATE(Index, cJU_ROOTSTATE); + +// Common code for all BranchLs; come here with Digit set: + +JudyBranchL: + Pjbl = P_JBL(Pjp->jp_Addr); + + posidx = 0; + + do { + if (Pjbl->jbl_Expanse[posidx] == Digit) + { // found Digit; continue traversal: + DBGCODE(ParentJPType = JU_JPTYPE(Pjp);) + Pjp = Pjbl->jbl_jp + posidx; + goto ContinueWalk; + } + } while (++posidx != Pjbl->jbl_NumJPs); + + break; + } + + +// **************************************************************************** +// JPBRANCH_B*: + + case cJU_JPBRANCH_B2: + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 2)) break; + Digit = JU_DIGITATSTATE(Index, 2); + goto JudyBranchB; + + case cJU_JPBRANCH_B3: + +#ifdef JU_64BIT // otherwise its a no-op: + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 3)) break; +#endif + Digit = JU_DIGITATSTATE(Index, 3); + goto JudyBranchB; + + +#ifdef JU_64BIT + case cJU_JPBRANCH_B4: + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 4)) break; + Digit = JU_DIGITATSTATE(Index, 4); + goto JudyBranchB; + + case cJU_JPBRANCH_B5: + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 5)) break; + Digit = JU_DIGITATSTATE(Index, 5); + goto JudyBranchB; + + case cJU_JPBRANCH_B6: + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 6)) break; + Digit = JU_DIGITATSTATE(Index, 6); + goto JudyBranchB; + + case cJU_JPBRANCH_B7: + + // JU_DCDNOTMATCHINDEX() would be a no-op. + Digit = JU_DIGITATSTATE(Index, 7); + goto JudyBranchB; + +#endif // JU_64BIT + + case cJU_JPBRANCH_B: + { + Pjbb_t Pjbb; + Word_t subexp; // in bitmap, 0..7. + BITMAPB_t BitMap; // for one subexpanse. + BITMAPB_t BitMask; // bit in BitMap for Indexs Digit. 
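+
+// (Compressed-bitmap lookup: the JPs of a subexpanse are stored packed, so
+// the JP for Digit is preceded by one slot for each set bit below BitMask,
+// that is, by j__udyCountBitsB(BitMap & (BitMask - 1)) slots; see below.)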
+ + Digit = JU_DIGITATSTATE(Index, cJU_ROOTSTATE); + +// Common code for all BranchBs; come here with Digit set: + +JudyBranchB: + DBGCODE(ParentJPType = JU_JPTYPE(Pjp);) + Pjbb = P_JBB(Pjp->jp_Addr); + subexp = Digit / cJU_BITSPERSUBEXPB; + + BitMap = JU_JBB_BITMAP(Pjbb, subexp); + Pjp = P_JP(JU_JBB_PJP(Pjbb, subexp)); + + BitMask = JU_BITPOSMASKB(Digit); + +// No JP in subexpanse for Index => Index not found: + + if (! (BitMap & BitMask)) break; + +// Count JPs in the subexpanse below the one for Index: + + Pjp += j__udyCountBitsB(BitMap & (BitMask - 1)); + + goto ContinueWalk; + + } // case cJU_JPBRANCH_B* + + +// **************************************************************************** +// JPBRANCH_U*: +// +// Notice the reverse order of the cases, and falling through to the next case, +// for performance. + + case cJU_JPBRANCH_U: + + DBGCODE(ParentJPType = JU_JPTYPE(Pjp);) + Pjp = JU_JBU_PJP(Pjp, Index, cJU_ROOTSTATE); + +// If not a BranchU, traverse; otherwise fall into the next case, which makes +// this very fast code for a large Judy array (mainly BranchUs), especially +// when branches are already in the cache, such as for prev/next: + +#ifndef JU_64BIT + if (JU_JPTYPE(Pjp) != cJU_JPBRANCH_U3) goto ContinueWalk; +#else + if (JU_JPTYPE(Pjp) != cJU_JPBRANCH_U7) goto ContinueWalk; +#endif + +#ifdef JU_64BIT + case cJU_JPBRANCH_U7: + + // JU_DCDNOTMATCHINDEX() would be a no-op. + DBGCODE(ParentJPType = JU_JPTYPE(Pjp);) + Pjp = JU_JBU_PJP(Pjp, Index, 7); + + if (JU_JPTYPE(Pjp) != cJU_JPBRANCH_U6) goto ContinueWalk; + // and fall through. + + case cJU_JPBRANCH_U6: + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 6)) break; + DBGCODE(ParentJPType = JU_JPTYPE(Pjp);) + Pjp = JU_JBU_PJP(Pjp, Index, 6); + + if (JU_JPTYPE(Pjp) != cJU_JPBRANCH_U5) goto ContinueWalk; + // and fall through. + + case cJU_JPBRANCH_U5: + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 5)) break; + DBGCODE(ParentJPType = JU_JPTYPE(Pjp);) + Pjp = JU_JBU_PJP(Pjp, Index, 5); + + if (JU_JPTYPE(Pjp) != cJU_JPBRANCH_U4) goto ContinueWalk; + // and fall through. + + case cJU_JPBRANCH_U4: + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 4)) break; + DBGCODE(ParentJPType = JU_JPTYPE(Pjp);) + Pjp = JU_JBU_PJP(Pjp, Index, 4); + + if (JU_JPTYPE(Pjp) != cJU_JPBRANCH_U3) goto ContinueWalk; + // and fall through. + +#endif // JU_64BIT + + case cJU_JPBRANCH_U3: + +#ifdef JU_64BIT // otherwise its a no-op: + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 3)) break; +#endif + DBGCODE(ParentJPType = JU_JPTYPE(Pjp);) + Pjp = JU_JBU_PJP(Pjp, Index, 3); + + if (JU_JPTYPE(Pjp) != cJU_JPBRANCH_U2) goto ContinueWalk; + // and fall through. + + case cJU_JPBRANCH_U2: + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 2)) break; + DBGCODE(ParentJPType = JU_JPTYPE(Pjp);) + Pjp = JU_JBU_PJP(Pjp, Index, 2); + +// Note: BranchU2 is a special case that must continue traversal to a leaf, +// immed, full, or null type: + + goto ContinueWalk; + + +// **************************************************************************** +// JPLEAF*: +// +// Note: Here the calls of JU_DCDNOTMATCHINDEX() are necessary and check +// whether Index is out of the expanse of a narrow pointer. + +#if (defined(JUDYL) || (! defined(JU_64BIT))) + + case cJU_JPLEAF1: + { + int posidx; // signed offset in leaf. 
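+
+// (The same three steps repeat for each linear-leaf size up to cJU_JPLEAF7:
+// reject Index if the narrow pointers Dcd bytes do not match, search the
+// leaf for Index, and, for JudyL only, return the parallel value-area slot
+// at posidx.)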
+ + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 1)) break; + + Pop1 = JU_JPLEAF_POP0(Pjp) + 1; + Pjll = P_JLL(Pjp->jp_Addr); + + if ((posidx = j__udySearchLeaf1(Pjll, Pop1, Index)) < 0) break; + + JUDY1CODE(return(1);) + JUDYLCODE(return((PPvoid_t) (JL_LEAF1VALUEAREA(Pjll, Pop1) + posidx));) + } + +#endif // (JUDYL || (! JU_64BIT)) + + case cJU_JPLEAF2: + { + int posidx; // signed offset in leaf. + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 2)) break; + + Pop1 = JU_JPLEAF_POP0(Pjp) + 1; + Pjll = P_JLL(Pjp->jp_Addr); + + if ((posidx = j__udySearchLeaf2(Pjll, Pop1, Index)) < 0) break; + + JUDY1CODE(return(1);) + JUDYLCODE(return((PPvoid_t) (JL_LEAF2VALUEAREA(Pjll, Pop1) + posidx));) + } + case cJU_JPLEAF3: + { + int posidx; // signed offset in leaf. + +#ifdef JU_64BIT // otherwise its a no-op: + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 3)) break; +#endif + + Pop1 = JU_JPLEAF_POP0(Pjp) + 1; + Pjll = P_JLL(Pjp->jp_Addr); + + if ((posidx = j__udySearchLeaf3(Pjll, Pop1, Index)) < 0) break; + + JUDY1CODE(return(1);) + JUDYLCODE(return((PPvoid_t) (JL_LEAF3VALUEAREA(Pjll, Pop1) + posidx));) + } +#ifdef JU_64BIT + case cJU_JPLEAF4: + { + int posidx; // signed offset in leaf. + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 4)) break; + + Pop1 = JU_JPLEAF_POP0(Pjp) + 1; + Pjll = P_JLL(Pjp->jp_Addr); + + if ((posidx = j__udySearchLeaf4(Pjll, Pop1, Index)) < 0) break; + + JUDY1CODE(return(1);) + JUDYLCODE(return((PPvoid_t) (JL_LEAF4VALUEAREA(Pjll, Pop1) + posidx));) + } + case cJU_JPLEAF5: + { + int posidx; // signed offset in leaf. + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 5)) break; + + Pop1 = JU_JPLEAF_POP0(Pjp) + 1; + Pjll = P_JLL(Pjp->jp_Addr); + + if ((posidx = j__udySearchLeaf5(Pjll, Pop1, Index)) < 0) break; + + JUDY1CODE(return(1);) + JUDYLCODE(return((PPvoid_t) (JL_LEAF5VALUEAREA(Pjll, Pop1) + posidx));) + } + + case cJU_JPLEAF6: + { + int posidx; // signed offset in leaf. + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 6)) break; + + Pop1 = JU_JPLEAF_POP0(Pjp) + 1; + Pjll = P_JLL(Pjp->jp_Addr); + + if ((posidx = j__udySearchLeaf6(Pjll, Pop1, Index)) < 0) break; + + JUDY1CODE(return(1);) + JUDYLCODE(return((PPvoid_t) (JL_LEAF6VALUEAREA(Pjll, Pop1) + posidx));) + } + case cJU_JPLEAF7: + { + int posidx; // signed offset in leaf. + + // JU_DCDNOTMATCHINDEX() would be a no-op. + Pop1 = JU_JPLEAF_POP0(Pjp) + 1; + Pjll = P_JLL(Pjp->jp_Addr); + + if ((posidx = j__udySearchLeaf7(Pjll, Pop1, Index)) < 0) break; + + JUDY1CODE(return(1);) + JUDYLCODE(return((PPvoid_t) (JL_LEAF7VALUEAREA(Pjll, Pop1) + posidx));) + } +#endif // JU_64BIT + + +// **************************************************************************** +// JPLEAF_B1: + + case cJU_JPLEAF_B1: + { + Pjlb_t Pjlb; +#ifdef JUDYL + int posidx; + Word_t subexp; // in bitmap, 0..7. + BITMAPL_t BitMap; // for one subexpanse. + BITMAPL_t BitMask; // bit in BitMap for Indexs Digit. + Pjv_t Pjv; +#endif + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 1)) break; + + Pjlb = P_JLB(Pjp->jp_Addr); + +#ifdef JUDY1 + +// Simply check if Indexs bit is set in the bitmap: + + if (JU_BITMAPTESTL(Pjlb, Index)) return(1); + break; + +#else // JUDYL + +// JudyL is much more complicated because of value area subarrays: + + Digit = JU_DIGITATSTATE(Index, 1); + subexp = Digit / cJU_BITSPERSUBEXPL; + BitMap = JU_JLB_BITMAP(Pjlb, subexp); + BitMask = JU_BITPOSMASKL(Digit); + +// No value in subexpanse for Index => Index not found: + + if (! 
(BitMap & BitMask)) break; + +// Count value areas in the subexpanse below the one for Index: + + Pjv = P_JV(JL_JLB_PVALUE(Pjlb, subexp)); + assert(Pjv != (Pjv_t) NULL); + posidx = j__udyCountBitsL(BitMap & (BitMask - 1)); + + return((PPvoid_t) (Pjv + posidx)); + +#endif // JUDYL + + } // case cJU_JPLEAF_B1 + +#ifdef JUDY1 + +// **************************************************************************** +// JPFULLPOPU1: +// +// If the Index is in the expanse, it is necessarily valid (found). + + case cJ1_JPFULLPOPU1: + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 1)) break; + return(1); + +#ifdef notdef // for future enhancements +#ifdef JU_64BIT + +// Note: Need ? if (JU_DCDNOTMATCHINDEX(Index, Pjp, 1)) break; + + case cJ1_JPFULLPOPU1m15: + if (Pjp->jp_1Index[14] == (uint8_t)Index) break; + case cJ1_JPFULLPOPU1m14: + if (Pjp->jp_1Index[13] == (uint8_t)Index) break; + case cJ1_JPFULLPOPU1m13: + if (Pjp->jp_1Index[12] == (uint8_t)Index) break; + case cJ1_JPFULLPOPU1m12: + if (Pjp->jp_1Index[11] == (uint8_t)Index) break; + case cJ1_JPFULLPOPU1m11: + if (Pjp->jp_1Index[10] == (uint8_t)Index) break; + case cJ1_JPFULLPOPU1m10: + if (Pjp->jp_1Index[9] == (uint8_t)Index) break; + case cJ1_JPFULLPOPU1m9: + if (Pjp->jp_1Index[8] == (uint8_t)Index) break; + case cJ1_JPFULLPOPU1m8: + if (Pjp->jp_1Index[7] == (uint8_t)Index) break; +#endif + case cJ1_JPFULLPOPU1m7: + if (Pjp->jp_1Index[6] == (uint8_t)Index) break; + case cJ1_JPFULLPOPU1m6: + if (Pjp->jp_1Index[5] == (uint8_t)Index) break; + case cJ1_JPFULLPOPU1m5: + if (Pjp->jp_1Index[4] == (uint8_t)Index) break; + case cJ1_JPFULLPOPU1m4: + if (Pjp->jp_1Index[3] == (uint8_t)Index) break; + case cJ1_JPFULLPOPU1m3: + if (Pjp->jp_1Index[2] == (uint8_t)Index) break; + case cJ1_JPFULLPOPU1m2: + if (Pjp->jp_1Index[1] == (uint8_t)Index) break; + case cJ1_JPFULLPOPU1m1: + if (Pjp->jp_1Index[0] == (uint8_t)Index) break; + + return(1); // found, not in exclusion list + +#endif // JUDY1 +#endif // notdef + +// **************************************************************************** +// JPIMMED*: +// +// Note that the contents of jp_DcdPopO are different for cJU_JPIMMED_*_01: + + case cJU_JPIMMED_1_01: + case cJU_JPIMMED_2_01: + case cJU_JPIMMED_3_01: +#ifdef JU_64BIT + case cJU_JPIMMED_4_01: + case cJU_JPIMMED_5_01: + case cJU_JPIMMED_6_01: + case cJU_JPIMMED_7_01: +#endif + if (JU_JPDCDPOP0(Pjp) != JU_TRIMTODCDSIZE(Index)) break; + + JUDY1CODE(return(1);) + JUDYLCODE(return((PPvoid_t) &(Pjp->jp_Addr));) // immediate value area. 
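+
+// Aside: seen from the caller, this whole switch implements the ordinary
+// access idiom (sketch; PJE0 is the standard "no error detail" argument):
+//
+//     Word_t *PValue = (Word_t *) JudyLGet(PArray, Index, PJE0);  // JudyL
+//     if (PValue != (Word_t *) NULL) { /* *PValue is Indexs value */ }
+//
+//     if (Judy1Test(PArray, Index, PJE0) == 1) { /* Index is set */ }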
+ + +// Macros to make code more readable and avoid dup errors + +#ifdef JUDY1 + +#define CHECKINDEXNATIVE(LEAF_T, PJP, IDX, INDEX) \ +if (((LEAF_T *)((PJP)->jp_1Index))[(IDX) - 1] == (LEAF_T)(INDEX)) \ + return(1) + +#define CHECKLEAFNONNAT(LFBTS, PJP, INDEX, IDX, COPY) \ +{ \ + Word_t i_ndex; \ + uint8_t *a_ddr; \ + a_ddr = (PJP)->jp_1Index + (((IDX) - 1) * (LFBTS)); \ + COPY(i_ndex, a_ddr); \ + if (i_ndex == JU_LEASTBYTES((INDEX), (LFBTS))) \ + return(1); \ +} +#endif + +#ifdef JUDYL + +#define CHECKINDEXNATIVE(LEAF_T, PJP, IDX, INDEX) \ +if (((LEAF_T *)((PJP)->jp_LIndex))[(IDX) - 1] == (LEAF_T)(INDEX)) \ + return((PPvoid_t)(P_JV((PJP)->jp_Addr) + (IDX) - 1)) + +#define CHECKLEAFNONNAT(LFBTS, PJP, INDEX, IDX, COPY) \ +{ \ + Word_t i_ndex; \ + uint8_t *a_ddr; \ + a_ddr = (PJP)->jp_LIndex + (((IDX) - 1) * (LFBTS)); \ + COPY(i_ndex, a_ddr); \ + if (i_ndex == JU_LEASTBYTES((INDEX), (LFBTS))) \ + return((PPvoid_t)(P_JV((PJP)->jp_Addr) + (IDX) - 1)); \ +} +#endif + +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_1_15: CHECKINDEXNATIVE(uint8_t, Pjp, 15, Index); + case cJ1_JPIMMED_1_14: CHECKINDEXNATIVE(uint8_t, Pjp, 14, Index); + case cJ1_JPIMMED_1_13: CHECKINDEXNATIVE(uint8_t, Pjp, 13, Index); + case cJ1_JPIMMED_1_12: CHECKINDEXNATIVE(uint8_t, Pjp, 12, Index); + case cJ1_JPIMMED_1_11: CHECKINDEXNATIVE(uint8_t, Pjp, 11, Index); + case cJ1_JPIMMED_1_10: CHECKINDEXNATIVE(uint8_t, Pjp, 10, Index); + case cJ1_JPIMMED_1_09: CHECKINDEXNATIVE(uint8_t, Pjp, 9, Index); + case cJ1_JPIMMED_1_08: CHECKINDEXNATIVE(uint8_t, Pjp, 8, Index); +#endif +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_1_07: CHECKINDEXNATIVE(uint8_t, Pjp, 7, Index); + case cJU_JPIMMED_1_06: CHECKINDEXNATIVE(uint8_t, Pjp, 6, Index); + case cJU_JPIMMED_1_05: CHECKINDEXNATIVE(uint8_t, Pjp, 5, Index); + case cJU_JPIMMED_1_04: CHECKINDEXNATIVE(uint8_t, Pjp, 4, Index); +#endif + case cJU_JPIMMED_1_03: CHECKINDEXNATIVE(uint8_t, Pjp, 3, Index); + case cJU_JPIMMED_1_02: CHECKINDEXNATIVE(uint8_t, Pjp, 2, Index); + CHECKINDEXNATIVE(uint8_t, Pjp, 1, Index); + break; + +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_2_07: CHECKINDEXNATIVE(uint16_t, Pjp, 7, Index); + case cJ1_JPIMMED_2_06: CHECKINDEXNATIVE(uint16_t, Pjp, 6, Index); + case cJ1_JPIMMED_2_05: CHECKINDEXNATIVE(uint16_t, Pjp, 5, Index); + case cJ1_JPIMMED_2_04: CHECKINDEXNATIVE(uint16_t, Pjp, 4, Index); +#endif +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_2_03: CHECKINDEXNATIVE(uint16_t, Pjp, 3, Index); + case cJU_JPIMMED_2_02: CHECKINDEXNATIVE(uint16_t, Pjp, 2, Index); + CHECKINDEXNATIVE(uint16_t, Pjp, 1, Index); + break; +#endif + +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_3_05: + CHECKLEAFNONNAT(3, Pjp, Index, 5, JU_COPY3_PINDEX_TO_LONG); + case cJ1_JPIMMED_3_04: + CHECKLEAFNONNAT(3, Pjp, Index, 4, JU_COPY3_PINDEX_TO_LONG); + case cJ1_JPIMMED_3_03: + CHECKLEAFNONNAT(3, Pjp, Index, 3, JU_COPY3_PINDEX_TO_LONG); +#endif +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_3_02: + CHECKLEAFNONNAT(3, Pjp, Index, 2, JU_COPY3_PINDEX_TO_LONG); + CHECKLEAFNONNAT(3, Pjp, Index, 1, JU_COPY3_PINDEX_TO_LONG); + break; +#endif + +#if (defined(JUDY1) && defined(JU_64BIT)) + + case cJ1_JPIMMED_4_03: CHECKINDEXNATIVE(uint32_t, Pjp, 3, Index); + case cJ1_JPIMMED_4_02: CHECKINDEXNATIVE(uint32_t, Pjp, 2, Index); + CHECKINDEXNATIVE(uint32_t, Pjp, 1, Index); + break; + + case cJ1_JPIMMED_5_03: + CHECKLEAFNONNAT(5, Pjp, Index, 3, JU_COPY5_PINDEX_TO_LONG); + case cJ1_JPIMMED_5_02: + CHECKLEAFNONNAT(5, Pjp, Index, 2, 
JU_COPY5_PINDEX_TO_LONG); + CHECKLEAFNONNAT(5, Pjp, Index, 1, JU_COPY5_PINDEX_TO_LONG); + break; + + case cJ1_JPIMMED_6_02: + CHECKLEAFNONNAT(6, Pjp, Index, 2, JU_COPY6_PINDEX_TO_LONG); + CHECKLEAFNONNAT(6, Pjp, Index, 1, JU_COPY6_PINDEX_TO_LONG); + break; + + case cJ1_JPIMMED_7_02: + CHECKLEAFNONNAT(7, Pjp, Index, 2, JU_COPY7_PINDEX_TO_LONG); + CHECKLEAFNONNAT(7, Pjp, Index, 1, JU_COPY7_PINDEX_TO_LONG); + break; + +#endif // (JUDY1 && JU_64BIT) + + +// **************************************************************************** +// INVALID JP TYPE: + + default: + +ReturnCorrupt: + +#ifdef JUDYGETINLINE // Pjpm is known to be non-null: + JU_SET_ERRNO_NONNULL(Pjpm, JU_ERRNO_CORRUPT); +#else + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); +#endif + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + + } // switch on JP type + +JUDY1CODE(return(0);) +JUDYLCODE(return((PPvoid_t) NULL);) + +} // Judy1Test() / JudyLGet() + + +#ifndef JUDYGETINLINE // only compile the following function once: +#ifdef DEBUG + +// **************************************************************************** +// J U D Y C H E C K P O P +// +// Given a pointer to a Judy array, traverse the entire array to ensure +// population counts add up correctly. This can catch various coding errors. +// +// Since walking the entire tree is probably time-consuming, enable this +// function by setting env parameter $CHECKPOP to first call at which to start +// checking. Note: This function is called both from insert and delete code. +// +// Note: Even though this function does nothing useful for LEAFW leaves, its +// good practice to call it anyway, and cheap too. +// +// TBD: This is a debug-only check function similar to JudyCheckSorted(), but +// since it walks the tree it is Judy1/JudyL-specific and must live in a source +// file that is built both ways. +// +// TBD: As feared, enabling this code for every insert/delete makes Judy +// deathly slow, even for a small tree (10K indexes). Its not so bad if +// present but disabled (<1% slowdown measured). Still, should it be ifdefd +// other than DEBUG and/or called less often? +// +// TBD: Should this "population checker" be expanded to a comprehensive tree +// checker? It currently detects invalid LEAFW/JP types as well as inconsistent +// pop1s. Other possible checks, all based on essentially redundant data in +// the Judy tree, include: +// +// - Zero LS bits in jp_Addr field. +// +// - Correct Dcd bits. +// +// - Consistent JP types (always descending down the tree). +// +// - Sorted linear lists in BranchLs and leaves (using JudyCheckSorted(), but +// ideally that function is already called wherever appropriate after any +// linear list is modified). +// +// - Any others possible? + +#include <stdlib.h> // for getenv() and atol(). + +static Word_t JudyCheckPopSM(Pjp_t Pjp, Word_t RootPop1); + +FUNCTION void JudyCheckPop( + Pvoid_t PArray) +{ +static bool_t checked = FALSE; // already checked env parameter. +static bool_t enabled = FALSE; // env parameter set. +static bool_t active = FALSE; // calls >= callsmin. +static Word_t callsmin; // start point from $CHECKPOP. +static Word_t calls = 0; // times called so far. + + +// CHECK FOR EXTERNAL ENABLING: + + if (! checked) // only check once. + { + char * value; // for getenv(). 
+ + checked = TRUE; + + if ((value = getenv("CHECKPOP")) == (char *) NULL) + { +#ifdef notdef +// Take this out because nightly tests want to be flavor-independent; its not +// OK to emit special non-error output from the debug flavor: + + (void) puts("JudyCheckPop() present but not enabled by " + "$CHECKPOP env parameter; set it to the number of " + "calls at which to begin checking"); +#endif + return; + } + + callsmin = atol(value); // note: non-number evaluates to 0. + enabled = TRUE; + + (void) printf("JudyCheckPop() present and enabled; callsmin = " + "%lu\n", callsmin); + } + else if (! enabled) return; + +// Previously or just now enabled; check if non-active or newly active: + + if (! active) + { + if (++calls < callsmin) return; + + (void) printf("JudyCheckPop() activated at call %lu\n", calls); + active = TRUE; + } + +// IGNORE LEAFW AT TOP OF TREE: + + if (JU_LEAFW_POP0(PArray) < cJU_LEAFW_MAXPOP1) // must be a LEAFW + return; + +// Check JPM pop0 against tree, recursively: +// +// Note: The traversal code in JudyCheckPopSM() is simplest when the case +// statement for each JP type compares the pop1 for that JP to its subtree (if +// any) after traversing the subtree (thats the hard part) and adding up +// actual pop1s. A top branchs JP in the JPM does not have room for a +// full-word pop1, so pass it in as a special case. + + { + Pjpm_t Pjpm = P_JPM(PArray); + (void) JudyCheckPopSM(&(Pjpm->jpm_JP), Pjpm->jpm_Pop0 + 1); + return; + } + +} // JudyCheckPop() + + +// **************************************************************************** +// J U D Y C H E C K P O P S M +// +// Recursive state machine (subroutine) for JudyCheckPop(): Given a Pjp (other +// than JPNULL*; caller should shortcut) and the root population for top-level +// branches, check the subtrees actual pop1 against its nominal value, and +// return the total pop1 for the subtree. +// +// Note: Expect RootPop1 to be ignored at lower levels, so pass down 0, which +// should pop an assertion if this expectation is violated. + +FUNCTION static Word_t JudyCheckPopSM( + Pjp_t Pjp, // top of subtree. + Word_t RootPop1) // whole array, for top-level branches only. +{ + Word_t pop1_jp; // nominal population from the JP. + Word_t pop1 = 0; // actual population at this level. + Word_t offset; // in a branch. 
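+
+// PREPBRANCH records the pop1 this branch JP claims to carry (pop1_jp), then
+// jumps to the shared Branch[LBU] walker, which re-counts the subtree and
+// asserts that the claimed and actual populations agree: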
+ +#define PREPBRANCH(cPopBytes,Next) \ + pop1_jp = JU_JPBRANCH_POP0(Pjp, cPopBytes) + 1; goto Next + +assert((((Word_t) (Pjp->jp_Addr)) & 7) == 3); + switch (JU_JPTYPE(Pjp)) + { + + case cJU_JPBRANCH_L2: PREPBRANCH(2, BranchL); + case cJU_JPBRANCH_L3: PREPBRANCH(3, BranchL); +#ifdef JU_64BIT + case cJU_JPBRANCH_L4: PREPBRANCH(4, BranchL); + case cJU_JPBRANCH_L5: PREPBRANCH(5, BranchL); + case cJU_JPBRANCH_L6: PREPBRANCH(6, BranchL); + case cJU_JPBRANCH_L7: PREPBRANCH(7, BranchL); +#endif + case cJU_JPBRANCH_L: pop1_jp = RootPop1; + { + Pjbl_t Pjbl; +BranchL: + Pjbl = P_JBL(Pjp->jp_Addr); + + for (offset = 0; offset < (Pjbl->jbl_NumJPs); ++offset) + pop1 += JudyCheckPopSM((Pjbl->jbl_jp) + offset, 0); + + assert(pop1_jp == pop1); + return(pop1); + } + + case cJU_JPBRANCH_B2: PREPBRANCH(2, BranchB); + case cJU_JPBRANCH_B3: PREPBRANCH(3, BranchB); +#ifdef JU_64BIT + case cJU_JPBRANCH_B4: PREPBRANCH(4, BranchB); + case cJU_JPBRANCH_B5: PREPBRANCH(5, BranchB); + case cJU_JPBRANCH_B6: PREPBRANCH(6, BranchB); + case cJU_JPBRANCH_B7: PREPBRANCH(7, BranchB); +#endif + case cJU_JPBRANCH_B: pop1_jp = RootPop1; + { + Word_t subexp; + Word_t jpcount; + Pjbb_t Pjbb; +BranchB: + Pjbb = P_JBB(Pjp->jp_Addr); + + for (subexp = 0; subexp < cJU_NUMSUBEXPB; ++subexp) + { + jpcount = j__udyCountBitsB(JU_JBB_BITMAP(Pjbb, subexp)); + + for (offset = 0; offset < jpcount; ++offset) + { + pop1 += JudyCheckPopSM(P_JP(JU_JBB_PJP(Pjbb, subexp)) + + offset, 0); + } + } + + assert(pop1_jp == pop1); + return(pop1); + } + + case cJU_JPBRANCH_U2: PREPBRANCH(2, BranchU); + case cJU_JPBRANCH_U3: PREPBRANCH(3, BranchU); +#ifdef JU_64BIT + case cJU_JPBRANCH_U4: PREPBRANCH(4, BranchU); + case cJU_JPBRANCH_U5: PREPBRANCH(5, BranchU); + case cJU_JPBRANCH_U6: PREPBRANCH(6, BranchU); + case cJU_JPBRANCH_U7: PREPBRANCH(7, BranchU); +#endif + case cJU_JPBRANCH_U: pop1_jp = RootPop1; + { + Pjbu_t Pjbu; +BranchU: + Pjbu = P_JBU(Pjp->jp_Addr); + + for (offset = 0; offset < cJU_BRANCHUNUMJPS; ++offset) + { + if (((Pjbu->jbu_jp[offset].jp_Type) >= cJU_JPNULL1) + && ((Pjbu->jbu_jp[offset].jp_Type) <= cJU_JPNULLMAX)) + { + continue; // skip null JP to save time. + } + + pop1 += JudyCheckPopSM((Pjbu->jbu_jp) + offset, 0); + } + + assert(pop1_jp == pop1); + return(pop1); + } + + +// -- Cases below here terminate and do not recurse. -- +// +// For all of these cases except JPLEAF_B1, there is no way to check the JPs +// pop1 against the object itself; just return the pop1; but for linear leaves, +// a bounds check is possible. + +#define CHECKLEAF(MaxPop1) \ + pop1 = JU_JPLEAF_POP0(Pjp) + 1; \ + assert(pop1 >= 1); \ + assert(pop1 <= (MaxPop1)); \ + return(pop1) + +#if (defined(JUDYL) || (! 
defined(JU_64BIT))) + case cJU_JPLEAF1: CHECKLEAF(cJU_LEAF1_MAXPOP1); +#endif + case cJU_JPLEAF2: CHECKLEAF(cJU_LEAF2_MAXPOP1); + case cJU_JPLEAF3: CHECKLEAF(cJU_LEAF3_MAXPOP1); +#ifdef JU_64BIT + case cJU_JPLEAF4: CHECKLEAF(cJU_LEAF4_MAXPOP1); + case cJU_JPLEAF5: CHECKLEAF(cJU_LEAF5_MAXPOP1); + case cJU_JPLEAF6: CHECKLEAF(cJU_LEAF6_MAXPOP1); + case cJU_JPLEAF7: CHECKLEAF(cJU_LEAF7_MAXPOP1); +#endif + + case cJU_JPLEAF_B1: + { + Word_t subexp; + Pjlb_t Pjlb; + + pop1_jp = JU_JPLEAF_POP0(Pjp) + 1; + + Pjlb = P_JLB(Pjp->jp_Addr); + + for (subexp = 0; subexp < cJU_NUMSUBEXPL; ++subexp) + pop1 += j__udyCountBitsL(JU_JLB_BITMAP(Pjlb, subexp)); + + assert(pop1_jp == pop1); + return(pop1); + } + + JUDY1CODE(case cJ1_JPFULLPOPU1: return(cJU_JPFULLPOPU1_POP0);) + + case cJU_JPIMMED_1_01: return(1); + case cJU_JPIMMED_2_01: return(1); + case cJU_JPIMMED_3_01: return(1); +#ifdef JU_64BIT + case cJU_JPIMMED_4_01: return(1); + case cJU_JPIMMED_5_01: return(1); + case cJU_JPIMMED_6_01: return(1); + case cJU_JPIMMED_7_01: return(1); +#endif + + case cJU_JPIMMED_1_02: return(2); + case cJU_JPIMMED_1_03: return(3); +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_1_04: return(4); + case cJU_JPIMMED_1_05: return(5); + case cJU_JPIMMED_1_06: return(6); + case cJU_JPIMMED_1_07: return(7); +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_1_08: return(8); + case cJ1_JPIMMED_1_09: return(9); + case cJ1_JPIMMED_1_10: return(10); + case cJ1_JPIMMED_1_11: return(11); + case cJ1_JPIMMED_1_12: return(12); + case cJ1_JPIMMED_1_13: return(13); + case cJ1_JPIMMED_1_14: return(14); + case cJ1_JPIMMED_1_15: return(15); +#endif + +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_2_02: return(2); + case cJU_JPIMMED_2_03: return(3); +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_2_04: return(4); + case cJ1_JPIMMED_2_05: return(5); + case cJ1_JPIMMED_2_06: return(6); + case cJ1_JPIMMED_2_07: return(7); +#endif + +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_3_02: return(2); +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_3_03: return(3); + case cJ1_JPIMMED_3_04: return(4); + case cJ1_JPIMMED_3_05: return(5); + + case cJ1_JPIMMED_4_02: return(2); + case cJ1_JPIMMED_4_03: return(3); + case cJ1_JPIMMED_5_02: return(2); + case cJ1_JPIMMED_5_03: return(3); + case cJ1_JPIMMED_6_02: return(2); + case cJ1_JPIMMED_7_02: return(2); +#endif + + } // switch (JU_JPTYPE(Pjp)) + + assert(FALSE); // unrecognized JP type => corruption. + return(0); // to make some compilers happy. + +} // JudyCheckPopSM() + +#endif // DEBUG +#endif // ! JUDYGETINLINE diff --git a/src/libnetdata/libjudy/src/JudyL/JudyLIns.c b/src/libnetdata/libjudy/src/JudyL/JudyLIns.c new file mode 100644 index 00000000..256a1ef3 --- /dev/null +++ b/src/libnetdata/libjudy/src/JudyL/JudyLIns.c @@ -0,0 +1,1874 @@ +// Copyright (C) 2000 - 2002 Hewlett-Packard Company +// +// This program is free software; you can redistribute it and/or modify it +// under the term of the GNU Lesser General Public License as published by the +// Free Software Foundation; either version 2 of the License, or (at your +// option) any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License +// for more details. 
+// +// You should have received a copy of the GNU Lesser General Public License +// along with this program; if not, write to the Free Software Foundation, +// Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// _________________ + +// @(#) $Revision: 4.116 $ $Source: /judy/src/JudyCommon/JudyIns.c $ +// +// Judy1Set() and JudyLIns() functions for Judy1 and JudyL. +// Compile with one of -DJUDY1 or -DJUDYL. +// +// TBD: Should some of the assertions here be converted to product code that +// returns JU_ERRNO_CORRUPT? + +#if (! (defined(JUDY1) || defined(JUDYL))) +#error: One of -DJUDY1 or -DJUDYL must be specified. +#endif + +#ifdef JUDY1 +#include "Judy1.h" +#else +#include "JudyL.h" +#endif + +#include "JudyPrivate1L.h" + +// Note: Call JudyCheckPop() even before "already inserted" returns, to catch +// population errors; see fix in 4.84: + +DBGCODE(extern void JudyCheckPop(Pvoid_t PArray);) +DBGCODE(extern void JudyCheckSorted(Pjll_t Pjll, Word_t Pop1, long IndexSize);) + +#ifdef TRACEJP +#include "JudyPrintJP.c" +#endif + + +// These are defined to generic values in JudyCommon/JudyPrivateTypes.h: +// +// TBD: These should be exported from a header file, but perhaps not, as they +// are only used here, and exported from Judy*Decascade, which is a separate +// file for profiling reasons (to prevent inlining), but which potentially +// could be merged with this file, either in SoftCM or at compile-time. + +#ifdef JUDY1 +extern int j__udy1CreateBranchB(Pjp_t, Pjp_t, uint8_t *, Word_t, Pvoid_t); +extern int j__udy1CreateBranchU(Pjp_t, Pvoid_t); + +#ifndef JU_64BIT +extern int j__udy1Cascade1(Pjp_t, Pvoid_t); +#endif +extern int j__udy1Cascade2(Pjp_t, Pvoid_t); +extern int j__udy1Cascade3(Pjp_t, Pvoid_t); +#ifdef JU_64BIT +extern int j__udy1Cascade4(Pjp_t, Pvoid_t); +extern int j__udy1Cascade5(Pjp_t, Pvoid_t); +extern int j__udy1Cascade6(Pjp_t, Pvoid_t); +extern int j__udy1Cascade7(Pjp_t, Pvoid_t); +#endif +extern int j__udy1CascadeL(Pjp_t, Pvoid_t); + +extern int j__udy1InsertBranch(Pjp_t Pjp, Word_t Index, Word_t Btype, Pjpm_t); + +#else // JUDYL + +extern int j__udyLCreateBranchB(Pjp_t, Pjp_t, uint8_t *, Word_t, Pvoid_t); +extern int j__udyLCreateBranchU(Pjp_t, Pvoid_t); + +extern int j__udyLCascade1(Pjp_t, Pvoid_t); +extern int j__udyLCascade2(Pjp_t, Pvoid_t); +extern int j__udyLCascade3(Pjp_t, Pvoid_t); +#ifdef JU_64BIT +extern int j__udyLCascade4(Pjp_t, Pvoid_t); +extern int j__udyLCascade5(Pjp_t, Pvoid_t); +extern int j__udyLCascade6(Pjp_t, Pvoid_t); +extern int j__udyLCascade7(Pjp_t, Pvoid_t); +#endif +extern int j__udyLCascadeL(Pjp_t, Pvoid_t); + +extern int j__udyLInsertBranch(Pjp_t Pjp, Word_t Index, Word_t Btype, Pjpm_t); +#endif + + +// **************************************************************************** +// MACROS FOR COMMON CODE: +// +// Check if Index is an outlier to (that is, not a member of) this expanse: +// +// An outlier is an Index in-the-expanse of the slot containing the pointer, +// but not-in-the-expanse of the "narrow" pointer in that slot. (This means +// the Dcd part of the Index differs from the equivalent part of jp_DcdPopO.) +// Therefore, the remedy is to put a cJU_JPBRANCH_L* between the narrow pointer +// and the object to which it points, and add the outlier Index as an Immediate +// in the cJU_JPBRANCH_L*. The "trick" is placing the cJU_JPBRANCH_L* at a +// Level that is as low as possible. 
This is determined by counting the digits +// in the existing narrow pointer that are the same as the digits in the new +// Index (see j__udyInsertBranch()). +// +// Note: At some high Levels, cJU_DCDMASK() is all zeros => dead code; assume +// the compiler optimizes this out. + +#define JU_CHECK_IF_OUTLIER(Pjp, Index, cLevel, Pjpm) \ + if (JU_DCDNOTMATCHINDEX(Index, Pjp, cLevel)) \ + return(j__udyInsertBranch(Pjp, Index, cLevel, Pjpm)) + +// Check if an Index is already in a leaf or immediate, after calling +// j__udySearchLeaf*() to set Offset: +// +// A non-negative Offset means the Index already exists, so return 0; otherwise +// complement Offset to proceed. + +#ifdef JUDY1 +#define Pjv ignore // placeholder. +#define JU_CHECK_IF_EXISTS(Offset,ignore,Pjpm) \ + { \ + if ((Offset) >= 0) return(0); \ + (Offset) = ~(Offset); \ + } +#else +// For JudyL, also set the value area pointer in the Pjpm: + +#define JU_CHECK_IF_EXISTS(Offset,Pjv,Pjpm) \ + { \ + if ((Offset) >= 0) \ + { \ + (Pjpm)->jpm_PValue = (Pjv) + (Offset); \ + return(0); \ + } \ + (Offset) = ~(Offset); \ + } +#endif + + +// **************************************************************************** +// __ J U D Y I N S W A L K +// +// Walk the Judy tree to do a set/insert. This is only called internally, and +// recursively. Unlike Judy1Test() and JudyLGet(), the extra time required for +// recursion should be negligible compared with the total. +// +// Return -1 for error (details in JPM), 0 for Index already inserted, 1 for +// new Index inserted. + +__attribute__((no_sanitize("shift"))) +FUNCTION static int j__udyInsWalk( + Pjp_t Pjp, // current JP to descend. + Word_t Index, // to insert. + Pjpm_t Pjpm) // for returning info to top Level. +{ + uint8_t digit; // from Index, current offset into a branch. + jp_t newJP; // for creating a new Immed JP. + Word_t exppop1; // expanse (leaf) population. + int retcode; // return codes: -1, 0, 1. + +#ifdef SUBEXPCOUNTS +// Pointer to BranchB/U subexpanse counter: +// +// Note: Very important for performance reasons (avoids cache fills). + + PWord_t PSubExp = (PWord_t) NULL; +#endif + +ContinueInsWalk: // for modifying state without recursing. + +#ifdef TRACEJP + JudyPrintJP(Pjp, "i", __LINE__); +#endif + + switch (JU_JPTYPE(Pjp)) // entry: Pjp, Index. + { + + +// **************************************************************************** +// JPNULL*: +// +// Convert JP in place from current null type to cJU_JPIMMED_*_01 by +// calculating new JP type. + + case cJU_JPNULL1: + case cJU_JPNULL2: + case cJU_JPNULL3: +#ifdef JU_64BIT + case cJU_JPNULL4: + case cJU_JPNULL5: + case cJU_JPNULL6: + case cJU_JPNULL7: +#endif + assert((Pjp->jp_Addr) == 0); + JU_JPSETADT(Pjp, 0, Index, JU_JPTYPE(Pjp) + cJU_JPIMMED_1_01 - cJU_JPNULL1); +#ifdef JUDYL + // value area is first word of new Immed_01 JP: + Pjpm->jpm_PValue = (Pjv_t) (&(Pjp->jp_Addr)); +#endif + return(1); + + +// **************************************************************************** +// JPBRANCH_L*: +// +// If the new Index is not an outlier to the branchs expanse, and the branch +// should not be converted to uncompressed, extract the digit and record the +// Immediate type to create for a new Immed JP, before going to common code. +// +// Note: JU_CHECK_IF_OUTLIER() is a no-op for BranchB3[7] on 32[64]-bit. 
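+
+// Editor's note (illustrative example, not upstream Judy code): assuming
+// 32-bit words, suppose a narrow level-2 pointer covers expanse 0x1234****,
+// so its Dcd bytes are 0x12,0x34. Index 0x12FF0001 reaches the same branch
+// slot (top digit 0x12) but fails JU_DCDNOTMATCHINDEX() on the 0xFF vs 0x34
+// byte, so it is an outlier, and j__udyInsertBranch() interposes a linear
+// branch at the level of the first differing digit.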
+ +#define JU_BRANCH_OUTLIER(DIGIT,POP1,cLEVEL,PJP,INDEX,PJPM) \ + JU_CHECK_IF_OUTLIER(PJP, INDEX, cLEVEL, PJPM); \ + (DIGIT) = JU_DIGITATSTATE(INDEX, cLEVEL); \ + (POP1) = JU_JPBRANCH_POP0(PJP, cLEVEL) + + case cJU_JPBRANCH_L2: + JU_BRANCH_OUTLIER(digit, exppop1, 2, Pjp, Index, Pjpm); + goto JudyBranchL; + + case cJU_JPBRANCH_L3: + JU_BRANCH_OUTLIER(digit, exppop1, 3, Pjp, Index, Pjpm); + goto JudyBranchL; + +#ifdef JU_64BIT + case cJU_JPBRANCH_L4: + JU_BRANCH_OUTLIER(digit, exppop1, 4, Pjp, Index, Pjpm); + goto JudyBranchL; + + case cJU_JPBRANCH_L5: + JU_BRANCH_OUTLIER(digit, exppop1, 5, Pjp, Index, Pjpm); + goto JudyBranchL; + + case cJU_JPBRANCH_L6: + JU_BRANCH_OUTLIER(digit, exppop1, 6, Pjp, Index, Pjpm); + goto JudyBranchL; + + case cJU_JPBRANCH_L7: + JU_BRANCH_OUTLIER(digit, exppop1, 7, Pjp, Index, Pjpm); + goto JudyBranchL; +#endif + +// Similar to common code above, but no outlier check is needed, and the Immed +// type depends on the word size: + + case cJU_JPBRANCH_L: + { + Pjbl_t PjblRaw; // pointer to old linear branch. + Pjbl_t Pjbl; + Pjbu_t PjbuRaw; // pointer to new uncompressed branch. + Pjbu_t Pjbu; + Word_t numJPs; // number of JPs = populated expanses. + int offset; // in branch. + + digit = JU_DIGITATSTATE(Index, cJU_ROOTSTATE); + exppop1 = Pjpm->jpm_Pop0; + + // fall through: + +// COMMON CODE FOR LINEAR BRANCHES: +// +// Come here with digit and exppop1 already set. + +JudyBranchL: + PjblRaw = (Pjbl_t) (Pjp->jp_Addr); + Pjbl = P_JBL(PjblRaw); + +// If population under this branch greater than: + + if (exppop1 > JU_BRANCHL_MAX_POP) + goto ConvertBranchLtoU; + + numJPs = Pjbl->jbl_NumJPs; + + if ((numJPs == 0) || (numJPs > cJU_BRANCHLMAXJPS)) + { + JU_SET_ERRNO_NONNULL(Pjpm, JU_ERRNO_CORRUPT); + return(-1); + } + +// Search for a match to the digit: + + offset = j__udySearchLeaf1((Pjll_t) (Pjbl->jbl_Expanse), numJPs, + digit); + +// If Index is found, offset is into an array of 1..cJU_BRANCHLMAXJPS JPs: + + if (offset >= 0) + { + Pjp = (Pjbl->jbl_jp) + offset; // address of next JP. + break; // continue walk. + } + +// Expanse is missing (not populated) for the passed Index, so insert an Immed +// -- if theres room: + + if (numJPs < cJU_BRANCHLMAXJPS) + { + offset = ~offset; // insertion offset. + + JU_JPSETADT(&newJP, 0, Index, + JU_JPTYPE(Pjp) + cJU_JPIMMED_1_01-cJU_JPBRANCH_L2); + + JU_INSERTINPLACE(Pjbl->jbl_Expanse, numJPs, offset, digit); + JU_INSERTINPLACE(Pjbl->jbl_jp, numJPs, offset, newJP); + + DBGCODE(JudyCheckSorted((Pjll_t) (Pjbl->jbl_Expanse), + numJPs + 1, /* IndexSize = */ 1);) + ++(Pjbl->jbl_NumJPs); +#ifdef JUDYL + // value area is first word of new Immed 01 JP: + Pjpm->jpm_PValue = (Pjv_t) ((Pjbl->jbl_jp) + offset); +#endif + return(1); + } + + +// MAXED OUT LINEAR BRANCH, CONVERT TO A BITMAP BRANCH, THEN INSERT: +// +// Copy the linear branch to a bitmap branch. +// +// TBD: Consider renaming j__udyCreateBranchB() to j__udyConvertBranchLtoB(). + + assert((numJPs) <= cJU_BRANCHLMAXJPS); + + if (j__udyCreateBranchB(Pjp, Pjbl->jbl_jp, Pjbl->jbl_Expanse, + numJPs, Pjpm) == -1) + { + return(-1); + } + +// Convert jp_Type from linear branch to equivalent bitmap branch: + + Pjp->jp_Type += cJU_JPBRANCH_B - cJU_JPBRANCH_L; + + j__udyFreeJBL(PjblRaw, Pjpm); // free old BranchL. + +// Having changed branch types, now do the insert in the new branch type: + + goto ContinueInsWalk; + + +// OPPORTUNISTICALLY CONVERT FROM BRANCHL TO BRANCHU: +// +// Memory efficiency is no object because the branchs pop1 is large enough, so +// speed up array access. 
Come here with PjblRaw set. Note: This is goto +// code because the previous block used to fall through into it as well, but no +// longer. + +ConvertBranchLtoU: + +// Allocate memory for an uncompressed branch: + + if ((PjbuRaw = j__udyAllocJBU(Pjpm)) == (Pjbu_t) NULL) + return(-1); + Pjbu = P_JBU(PjbuRaw); + +// Set the proper NULL type for most of the uncompressed branchs JPs: + + JU_JPSETADT(&newJP, 0, 0, + JU_JPTYPE(Pjp) - cJU_JPBRANCH_L2 + cJU_JPNULL1); + +// Initialize: Pre-set uncompressed branch to mostly JPNULL*s: + + for (numJPs = 0; numJPs < cJU_BRANCHUNUMJPS; ++numJPs) + Pjbu->jbu_jp[numJPs] = newJP; + +// Copy JPs from linear branch to uncompressed branch: + + { +#ifdef SUBEXPCOUNTS + Word_t popmask = cJU_POP0MASK(JU_JPTYPE(Pjp)) + - cJU_JPBRANCH_L2 - 2; + + for (numJPs = 0; numJPs < cJU_NUMSUBEXPU; ++numJPs) + Pjbu->jbu_subPop1[numJPs] = 0; +#endif + for (numJPs = 0; numJPs < Pjbl->jbl_NumJPs; ++numJPs) + { + Pjp_t Pjp1 = &(Pjbl->jbl_jp[numJPs]); + offset = Pjbl->jbl_Expanse[numJPs]; + Pjbu->jbu_jp[offset] = *Pjp1; +#ifdef SUBEXPCOUNTS + Pjbu->jbu_subPop1[offset/cJU_NUMSUBEXPU] += + JU_JPDCDPOP0(Pjp1) & popmask + 1; +#endif + } + } + j__udyFreeJBL(PjblRaw, Pjpm); // free old BranchL. + +// Plug new values into parent JP: + + Pjp->jp_Addr = (Word_t) PjbuRaw; + Pjp->jp_Type += cJU_JPBRANCH_U - cJU_JPBRANCH_L; // to BranchU. + +// Save global population of last BranchU conversion: + + Pjpm->jpm_LastUPop0 = Pjpm->jpm_Pop0; + goto ContinueInsWalk; + + } // case cJU_JPBRANCH_L. + + +// **************************************************************************** +// JPBRANCH_B*: +// +// If the new Index is not an outlier to the branchs expanse, extract the +// digit and record the Immediate type to create for a new Immed JP, before +// going to common code. +// +// Note: JU_CHECK_IF_OUTLIER() is a no-op for BranchB3[7] on 32[64]-bit. + + case cJU_JPBRANCH_B2: + JU_BRANCH_OUTLIER(digit, exppop1, 2, Pjp, Index, Pjpm); + goto JudyBranchB; + + case cJU_JPBRANCH_B3: + JU_BRANCH_OUTLIER(digit, exppop1, 3, Pjp, Index, Pjpm); + goto JudyBranchB; + +#ifdef JU_64BIT + case cJU_JPBRANCH_B4: + JU_BRANCH_OUTLIER(digit, exppop1, 4, Pjp, Index, Pjpm); + goto JudyBranchB; + + case cJU_JPBRANCH_B5: + JU_BRANCH_OUTLIER(digit, exppop1, 5, Pjp, Index, Pjpm); + goto JudyBranchB; + + case cJU_JPBRANCH_B6: + JU_BRANCH_OUTLIER(digit, exppop1, 6, Pjp, Index, Pjpm); + goto JudyBranchB; + + case cJU_JPBRANCH_B7: + JU_BRANCH_OUTLIER(digit, exppop1, 7, Pjp, Index, Pjpm); + goto JudyBranchB; +#endif + + case cJU_JPBRANCH_B: + { + Pjbb_t Pjbb; // pointer to bitmap branch. + Pjbb_t PjbbRaw; // pointer to bitmap branch. + Pjp_t Pjp2Raw; // 1 of N arrays of JPs. + Pjp_t Pjp2; // 1 of N arrays of JPs. + Word_t subexp; // 1 of N subexpanses in bitmap. + BITMAPB_t bitmap; // for one subexpanse. + BITMAPB_t bitmask; // bit set for Indexs digit. + Word_t numJPs; // number of JPs = populated expanses. + int offset; // in bitmap branch. + +// Similar to common code above, but no outlier check is needed, and the Immed +// type depends on the word size: + + digit = JU_DIGITATSTATE(Index, cJU_ROOTSTATE); + exppop1 = Pjpm->jpm_Pop0; + + // fall through: + + +// COMMON CODE FOR BITMAP BRANCHES: +// +// Come here with digit and exppop1 already set. + +JudyBranchB: + +// If population increment is greater than.. (300): + + if ((Pjpm->jpm_Pop0 - Pjpm->jpm_LastUPop0) > JU_BTOU_POP_INCREMENT) + { + +// If total population of array is greater than.. 
(750):
+
+ if (Pjpm->jpm_Pop0 > JU_BRANCHB_MAX_POP)
+ {
+
+// If population under the branch is greater than.. (135):
+
+ if (exppop1 > JU_BRANCHB_MIN_POP)
+ {
+ if (j__udyCreateBranchU(Pjp, Pjpm) == -1) return(-1);
+
+// Save global population of last BranchU conversion:
+
+ Pjpm->jpm_LastUPop0 = Pjpm->jpm_Pop0;
+
+ goto ContinueInsWalk;
+ }
+ }
+ }
+
+// CONTINUE TO USE BRANCHB:
+//
+// Get pointer to bitmap branch (JBB):
+
+ PjbbRaw = (Pjbb_t) (Pjp->jp_Addr);
+ Pjbb = P_JBB(PjbbRaw);
+
+// Form the Int32 offset, and Bit offset values:
+//
+// 8 bit Decode | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 |
+// |SubExpanse | Bit offset |
+//
+// Get the 1 of 8 expanses from digit, Bits 5..7 = 1 of 8, and get the 32-bit
+// word that may have a bit set:
+
+ subexp = digit / cJU_BITSPERSUBEXPB;
+ bitmap = JU_JBB_BITMAP(Pjbb, subexp);
+
+ Pjp2Raw = JU_JBB_PJP(Pjbb, subexp);
+ Pjp2 = P_JP(Pjp2Raw);
+
+// Get the bit position that represents the desired expanse, and get the offset
+// into the array of JPs for the JP that matches the bit.
+
+ bitmask = JU_BITPOSMASKB(digit);
+ offset = j__udyCountBitsB(bitmap & (bitmask - 1));
+
+// If JP is already in this expanse, get Pjp and continue the walk:
+
+ if (bitmap & bitmask)
+ {
+#ifdef SUBEXPCOUNTS
+ PSubExp = &(Pjbb->jbb_Counts[subexp]); // ptr to subexp counts.
+#endif
+ Pjp = Pjp2 + offset;
+ break; // continue walk.
+ }
+
+
+// ADD NEW EXPANSE FOR NEW INDEX:
+//
+// The new expanse is always a cJU_JPIMMED_*_01 containing just the new Index,
+// so finish setting up an Immed JP.
+
+ JU_JPSETADT(&newJP, 0, Index,
+ JU_JPTYPE(Pjp) + cJU_JPIMMED_1_01-cJU_JPBRANCH_B2);
+
+// Get 1 of the 8 JP arrays and calculate number of JPs in subexpanse array:
+
+ Pjp2Raw = JU_JBB_PJP(Pjbb, subexp);
+ Pjp2 = P_JP(Pjp2Raw);
+ numJPs = j__udyCountBitsB(bitmap);
+
+// Expand branch JP subarray in-place:
+
+ if (JU_BRANCHBJPGROWINPLACE(numJPs))
+ {
+ assert(numJPs > 0);
+ JU_INSERTINPLACE(Pjp2, numJPs, offset, newJP);
+#ifdef JUDYL
+ // value area is first word of new Immed 01 JP:
+ Pjpm->jpm_PValue = (Pjv_t) (Pjp2 + offset);
+#endif
+ }
+
+// No room, allocate a bigger bitmap branch JP subarray:
+
+ else
+ {
+ Pjp_t PjpnewRaw;
+ Pjp_t Pjpnew;
+
+ if ((PjpnewRaw = j__udyAllocJBBJP(numJPs + 1, Pjpm)) == 0)
+ return(-1);
+ Pjpnew = P_JP(PjpnewRaw);
+
+// If there was an old JP array, then copy it, insert the new Immed JP, and
+// free the old array:
+
+ if (numJPs)
+ {
+ JU_INSERTCOPY(Pjpnew, Pjp2, numJPs, offset, newJP);
+ j__udyFreeJBBJP(Pjp2Raw, numJPs, Pjpm);
+#ifdef JUDYL
+ // value area is first word of new Immed 01 JP:
+ Pjpm->jpm_PValue = (Pjv_t) (Pjpnew + offset);
+#endif
+ }
+
+// New JP subarray; point to cJU_JPIMMED_*_01 and place it:
+
+ else
+ {
+ assert(JU_JBB_PJP(Pjbb, subexp) == (Pjp_t) NULL);
+ Pjp = Pjpnew;
+ *Pjp = newJP; // copy to new memory.
+#ifdef JUDYL
+ // value area is first word of new Immed 01 JP:
+ Pjpm->jpm_PValue = (Pjv_t) (&(Pjp->jp_Addr));
+#endif
+ }
+
+// Place new JP subarray in BranchB:
+
+ JU_JBB_PJP(Pjbb, subexp) = PjpnewRaw;
+
+ } // else
+
+// Set the new Indexs bit:
+
+ JU_JBB_BITMAP(Pjbb, subexp) |= bitmask;
+
+ return(1);
+
+ } // case
+
+
+// ****************************************************************************
+// JPBRANCH_U*:
+//
+// Just drop through the JP for the correct digit. If the JP turns out to be a
+// JPNULL*, thats OK, the memory is already allocated, and the next walk
+// simply places an Immed in it.
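+
+// Editor's note (added commentary, not upstream Judy code): a BranchU is a
+// flat array of cJU_BRANCHUNUMJPS (256) JPs, so the walk below is a plain
+// subscript by the next digit -- no linear search as in a BranchL and no
+// bitmap popcount as in a BranchB; this is the speed-for-memory trade that
+// the conversion heuristics above aim for.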
+// +#ifdef SUBEXPCOUNTS +#define JU_GETSUBEXP(PSubExp,Pjbu,Digit) \ + (PSubExp) = &((Pjbu)->jbu_subPop1[(Digit) / cJU_NUMSUBEXPU]) +#else +#define JU_GETSUBEXP(PSubExp,Pjbu,Digit) // null. +#endif + +#define JU_JBU_PJP_SUBEXP(Pjp,PSubExp,Index,Level) \ + { \ + uint8_t digit = JU_DIGITATSTATE(Index, Level); \ + Pjbu_t P_jbu = P_JBU((Pjp)->jp_Addr); \ + (Pjp) = &(P_jbu->jbu_jp[digit]); \ + JU_GETSUBEXP(PSubExp, P_jbu, digit); \ + } + + case cJU_JPBRANCH_U2: + JU_CHECK_IF_OUTLIER(Pjp, Index, 2, Pjpm); + JU_JBU_PJP_SUBEXP(Pjp, PSubExp, Index, 2); + break; + +#ifdef JU_64BIT + case cJU_JPBRANCH_U3: + JU_CHECK_IF_OUTLIER(Pjp, Index, 3, Pjpm); + JU_JBU_PJP_SUBEXP(Pjp, PSubExp, Index, 3); + break; + + case cJU_JPBRANCH_U4: + JU_CHECK_IF_OUTLIER(Pjp, Index, 4, Pjpm); + JU_JBU_PJP_SUBEXP(Pjp, PSubExp, Index, 4); + break; + + case cJU_JPBRANCH_U5: + JU_CHECK_IF_OUTLIER(Pjp, Index, 5, Pjpm); + JU_JBU_PJP_SUBEXP(Pjp, PSubExp, Index, 5); + break; + + case cJU_JPBRANCH_U6: + JU_CHECK_IF_OUTLIER(Pjp, Index, 6, Pjpm); + JU_JBU_PJP_SUBEXP(Pjp, PSubExp, Index, 6); + break; + + case cJU_JPBRANCH_U7: + JU_JBU_PJP_SUBEXP(Pjp, PSubExp, Index, 7); +#else + case cJU_JPBRANCH_U3: + JU_JBU_PJP_SUBEXP(Pjp, PSubExp, Index, 3); +#endif + break; + + case cJU_JPBRANCH_U: + JU_JBU_PJP_SUBEXP(Pjp, PSubExp, Index, cJU_ROOTSTATE); + break; + + +// **************************************************************************** +// JPLEAF*: +// +// COMMON CODE FRAGMENTS TO MINIMIZE REDUNDANCY BELOW: +// +// These are necessary to support performance by function and loop unrolling +// while avoiding huge amounts of nearly identical code. +// +// Prepare to handle a linear leaf: Check for an outlier; set pop1 and pointer +// to leaf: + +#ifdef JUDY1 +#define JU_LEAFVALUE(Pjv) // null. +#define JU_LEAFPREPVALUE(Pjv, ValueArea) // null. +#else +#define JU_LEAFVALUE(Pjv) Pjv_t Pjv +#define JU_LEAFPREPVALUE(Pjv, ValueArea) (Pjv) = ValueArea(Pleaf, exppop1) +#endif + +#define JU_LEAFPREP(cIS,Type,MaxPop1,ValueArea) \ + Pjll_t PjllRaw; \ + Type Pleaf; /* specific type */ \ + int offset; \ + JU_LEAFVALUE(Pjv); \ + \ + JU_CHECK_IF_OUTLIER(Pjp, Index, cIS, Pjpm); \ + \ + exppop1 = JU_JPLEAF_POP0(Pjp) + 1; \ + assert(exppop1 <= (MaxPop1)); \ + PjllRaw = (Pjll_t) (Pjp->jp_Addr); \ + Pleaf = (Type) P_JLL(PjllRaw); \ + JU_LEAFPREPVALUE(Pjv, ValueArea) + +// Add to, or grow, a linear leaf: Find Index position; if the Index is +// absent, if theres room in the leaf, insert the Index [and value of 0] in +// place, otherwise grow the leaf: +// +// Note: These insertions always take place with whole words, using +// JU_INSERTINPLACE() or JU_INSERTCOPY(). + +#ifdef JUDY1 +#define JU_LEAFGROWVALUEADD(Pjv,ExpPop1,Offset) // null. +#else +#define JU_LEAFGROWVALUEADD(Pjv,ExpPop1,Offset) \ + JU_INSERTINPLACE(Pjv, ExpPop1, Offset, 0); \ + Pjpm->jpm_PValue = (Pjv) + (Offset) +#endif + +#ifdef JUDY1 +#define JU_LEAFGROWVALUENEW(ValueArea,Pjv,ExpPop1,Offset) // null. 
+#else +#define JU_LEAFGROWVALUENEW(ValueArea,Pjv,ExpPop1,Offset) \ + { \ + Pjv_t Pjvnew = ValueArea(Pleafnew, (ExpPop1) + 1); \ + JU_INSERTCOPY(Pjvnew, Pjv, ExpPop1, Offset, 0); \ + Pjpm->jpm_PValue = (Pjvnew) + (Offset); \ + } +#endif + +#define JU_LEAFGROW(cIS,Type,MaxPop1,Search,ValueArea,GrowInPlace, \ + InsertInPlace,InsertCopy,Alloc,Free) \ + \ + offset = Search(Pleaf, exppop1, Index); \ + JU_CHECK_IF_EXISTS(offset, Pjv, Pjpm); \ + \ + if (GrowInPlace(exppop1)) /* add to current leaf */ \ + { \ + InsertInPlace(Pleaf, exppop1, offset, Index); \ + JU_LEAFGROWVALUEADD(Pjv, exppop1, offset); \ + DBGCODE(JudyCheckSorted((Pjll_t) Pleaf, exppop1 + 1, cIS);) \ + return(1); \ + } \ + \ + if (exppop1 < (MaxPop1)) /* grow to new leaf */ \ + { \ + Pjll_t PjllnewRaw; \ + Type Pleafnew; \ + if ((PjllnewRaw = Alloc(exppop1 + 1, Pjpm)) == 0) return(-1); \ + Pleafnew = (Type) P_JLL(PjllnewRaw); \ + InsertCopy(Pleafnew, Pleaf, exppop1, offset, Index); \ + JU_LEAFGROWVALUENEW(ValueArea, Pjv, exppop1, offset); \ + DBGCODE(JudyCheckSorted((Pjll_t) Pleafnew, exppop1 + 1, cIS);) \ + Free(PjllRaw, exppop1, Pjpm); \ + (Pjp->jp_Addr) = (Word_t) PjllnewRaw; \ + return(1); \ + } \ + assert(exppop1 == (MaxPop1)) + +// Handle linear leaf overflow (cascade): Splay or compress into smaller +// leaves: + +#define JU_LEAFCASCADE(MaxPop1,Cascade,Free) \ + if (Cascade(Pjp, Pjpm) == -1) return(-1); \ + Free(PjllRaw, MaxPop1, Pjpm); \ + goto ContinueInsWalk + +// Wrapper around all of the above: + +#define JU_LEAFSET(cIS,Type,MaxPop1,Search,GrowInPlace,InsertInPlace, \ + InsertCopy,Cascade,Alloc,Free,ValueArea) \ + { \ + JU_LEAFPREP(cIS,Type,MaxPop1,ValueArea); \ + JU_LEAFGROW(cIS,Type,MaxPop1,Search,ValueArea,GrowInPlace, \ + InsertInPlace,InsertCopy,Alloc,Free); \ + JU_LEAFCASCADE(MaxPop1,Cascade,Free); \ + } + +// END OF MACROS; LEAFL CASES START HERE: +// +// 64-bit Judy1 does not have 1-byte leaves: + +#if (defined(JUDYL) || (! defined(JU_64BIT))) + + case cJU_JPLEAF1: + + JU_LEAFSET(1, uint8_t *, cJU_LEAF1_MAXPOP1, j__udySearchLeaf1, + JU_LEAF1GROWINPLACE, JU_INSERTINPLACE, JU_INSERTCOPY, + j__udyCascade1, j__udyAllocJLL1, j__udyFreeJLL1, + JL_LEAF1VALUEAREA); + +#endif // (JUDYL || ! 
JU_64BIT) + + case cJU_JPLEAF2: + + JU_LEAFSET(2, uint16_t *, cJU_LEAF2_MAXPOP1, j__udySearchLeaf2, + JU_LEAF2GROWINPLACE, JU_INSERTINPLACE, JU_INSERTCOPY, + j__udyCascade2, j__udyAllocJLL2, j__udyFreeJLL2, + JL_LEAF2VALUEAREA); + + case cJU_JPLEAF3: + + JU_LEAFSET(3, uint8_t *, cJU_LEAF3_MAXPOP1, j__udySearchLeaf3, + JU_LEAF3GROWINPLACE, JU_INSERTINPLACE3, JU_INSERTCOPY3, + j__udyCascade3, j__udyAllocJLL3, j__udyFreeJLL3, + JL_LEAF3VALUEAREA); + +#ifdef JU_64BIT + case cJU_JPLEAF4: + + JU_LEAFSET(4, uint32_t *, cJU_LEAF4_MAXPOP1, j__udySearchLeaf4, + JU_LEAF4GROWINPLACE, JU_INSERTINPLACE, JU_INSERTCOPY, + j__udyCascade4, j__udyAllocJLL4, j__udyFreeJLL4, + JL_LEAF4VALUEAREA); + + case cJU_JPLEAF5: + + JU_LEAFSET(5, uint8_t *, cJU_LEAF5_MAXPOP1, j__udySearchLeaf5, + JU_LEAF5GROWINPLACE, JU_INSERTINPLACE5, JU_INSERTCOPY5, + j__udyCascade5, j__udyAllocJLL5, j__udyFreeJLL5, + JL_LEAF5VALUEAREA); + + case cJU_JPLEAF6: + + JU_LEAFSET(6, uint8_t *, cJU_LEAF6_MAXPOP1, j__udySearchLeaf6, + JU_LEAF6GROWINPLACE, JU_INSERTINPLACE6, JU_INSERTCOPY6, + j__udyCascade6, j__udyAllocJLL6, j__udyFreeJLL6, + JL_LEAF6VALUEAREA); + + case cJU_JPLEAF7: + + JU_LEAFSET(7, uint8_t *, cJU_LEAF7_MAXPOP1, j__udySearchLeaf7, + JU_LEAF7GROWINPLACE, JU_INSERTINPLACE7, JU_INSERTCOPY7, + j__udyCascade7, j__udyAllocJLL7, j__udyFreeJLL7, + JL_LEAF7VALUEAREA); +#endif // JU_64BIT + + +// **************************************************************************** +// JPLEAF_B1: +// +// 8 bit Decode | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | +// |SubExpanse | Bit offset | +// +// Note: For JudyL, values are stored in 8 subexpanses, each a linear word +// array of up to 32 values each. + + case cJU_JPLEAF_B1: + { +#ifdef JUDYL + Pjv_t PjvRaw; // pointer to value part of the leaf. + Pjv_t Pjv; // pointer to value part of the leaf. + Pjv_t PjvnewRaw; // new value area. + Pjv_t Pjvnew; // new value area. + Word_t subexp; // 1 of 8 subexpanses in bitmap. + Pjlb_t Pjlb; // pointer to bitmap part of the leaf. + BITMAPL_t bitmap; // for one subexpanse. + BITMAPL_t bitmask; // bit set for Indexs digit. + int offset; // of index in value area. +#endif + + JU_CHECK_IF_OUTLIER(Pjp, Index, 1, Pjpm); + +#ifdef JUDY1 + +// If Index (bit) is already set, return now: + + if (JU_BITMAPTESTL(P_JLB(Pjp->jp_Addr), Index)) return(0); + +// If bitmap is not full, set the new Indexs bit; otherwise convert to a Full: + + if ((exppop1 = JU_JPLEAF_POP0(Pjp) + 1) + < cJU_JPFULLPOPU1_POP0) + { + JU_BITMAPSETL(P_JLB(Pjp->jp_Addr), Index); + } + else + { + j__udyFreeJLB1((Pjlb_t) (Pjp->jp_Addr), Pjpm); // free LeafB1. + Pjp->jp_Type = cJ1_JPFULLPOPU1; + Pjp->jp_Addr = 0; + } + +#else // JUDYL + +// This is very different from Judy1 because of the need to return a value area +// even for an existing Index, or manage the value area for a new Index, and +// because JudyL has no Full type: + +// Get last byte to decode from Index, and pointer to bitmap leaf: + + digit = JU_DIGITATSTATE(Index, 1); + Pjlb = P_JLB(Pjp->jp_Addr); + +// Prepare additional values: + + subexp = digit / cJU_BITSPERSUBEXPL; // which subexpanse. + bitmap = JU_JLB_BITMAP(Pjlb, subexp); // subexps 32-bit map. + PjvRaw = JL_JLB_PVALUE(Pjlb, subexp); // corresponding values. + Pjv = P_JV(PjvRaw); // corresponding values. + bitmask = JU_BITPOSMASKL(digit); // mask for Index. + offset = j__udyCountBitsL(bitmap & (bitmask - 1)); // of Index. + +// If Index already exists, get value pointer and exit: + + if (bitmap & bitmask) + { + assert(Pjv); + Pjpm->jpm_PValue = Pjv + offset; // existing value. 
+ return(0); + } + +// Get the total bits set = expanse population of Value area: + + exppop1 = j__udyCountBitsL(bitmap); + +// If the value area can grow in place, do it: + + if (JL_LEAFVGROWINPLACE(exppop1)) + { + JU_INSERTINPLACE(Pjv, exppop1, offset, 0); + JU_JLB_BITMAP(Pjlb, subexp) |= bitmask; // set Indexs bit. + Pjpm->jpm_PValue = Pjv + offset; // new value area. + return(1); + } + +// Increase size of value area: + + if ((PjvnewRaw = j__udyLAllocJV(exppop1 + 1, Pjpm)) + == (Pjv_t) NULL) return(-1); + Pjvnew = P_JV(PjvnewRaw); + + if (exppop1) // have existing value area. + { + assert(Pjv); + JU_INSERTCOPY(Pjvnew, Pjv, exppop1, offset, 0); + Pjpm->jpm_PValue = Pjvnew + offset; + j__udyLFreeJV(PjvRaw, exppop1, Pjpm); // free old values. + } + else // first index, new value area: + { + Pjpm->jpm_PValue = Pjvnew; + *(Pjpm->jpm_PValue) = 0; + } + +// Set bit for new Index and place new leaf value area in bitmap: + + JU_JLB_BITMAP(Pjlb, subexp) |= bitmask; + JL_JLB_PVALUE(Pjlb, subexp) = PjvnewRaw; + +#endif // JUDYL + + return(1); + + } // case + + +#ifdef JUDY1 +// **************************************************************************** +// JPFULLPOPU1: +// +// If Index is not an outlier, then by definition its already set. + + case cJ1_JPFULLPOPU1: + + JU_CHECK_IF_OUTLIER(Pjp, Index, 1, Pjpm); + return(0); +#endif + + +// **************************************************************************** +// JPIMMED*: +// +// This is some of the most complex code in Judy considering Judy1 versus JudyL +// and 32-bit versus 64-bit variations. The following comments attempt to make +// this clearer. +// +// Of the 2 words in a JP, for immediate indexes Judy1 can use 2 words - 1 byte +// = 7 [15] bytes, but JudyL can only use 1 word - 1 byte = 3 [7] bytes because +// the other word is needed for a value area or a pointer to a value area. +// +// For both Judy1 and JudyL, cJU_JPIMMED_*_01 indexes are in word 2; otherwise +// for Judy1 only, a list of 2 or more indexes starts in word 1. JudyL keeps +// the list in word 2 because word 1 is a pointer (to a LeafV, that is, a leaf +// containing only values). Furthermore, cJU_JPIMMED_*_01 indexes are stored +// all-but-first-byte in jp_DcdPopO, not just the Index Sizes bytes. +// +// TBD: This can be confusing because Doug didnt use data structures for it. +// Instead he often directly accesses Pjp for the first word and jp_DcdPopO for +// the second word. It would be nice to use data structs, starting with +// jp_1Index and jp_LIndex where possible. +// +// Maximum Immed JP types for Judy1/JudyL, depending on Index Size (cIS): +// +// 32-bit 64-bit +// +// bytes: 7/ 3 15/ 7 (Judy1/JudyL) +// +// cIS +// 1_ 07/03 15/07 (as in: cJ1_JPIMMED_1_07) +// 2_ 03/01 07/03 +// 3_ 02/01 05/02 +// 4_ 03/01 +// 5_ 03/01 +// 6_ 02/01 +// 7_ 02/01 +// +// State transitions while inserting an Index, matching the above table: +// (Yes, this is very terse... Study it and it will make sense.) +// (Note, parts of this diagram are repeated below for quick reference.) +// +// +-- reformat JP here for Judy1 only, from word-2 to word-1 +// | +// | JUDY1 || JU_64BIT JUDY1 && JU_64BIT +// V +// 1_01 => 1_02 => 1_03 => [ 1_04 => ... 
=> 1_07 => [ 1_08..15 => ]] Leaf1 (*) +// 2_01 => [ 2_02 => 2_03 => [ 2_04..07 => ]] Leaf2 +// 3_01 => [ 3_02 => [ 3_03..05 => ]] Leaf3 +// JU_64BIT only: +// 4_01 => [[ 4_02..03 => ]] Leaf4 +// 5_01 => [[ 5_02..03 => ]] Leaf5 +// 6_01 => [[ 6_02 => ]] Leaf6 +// 7_01 => [[ 7_02 => ]] Leaf7 +// +// (*) For Judy1 & 64-bit, go directly from cJU_JPIMMED_1_15 to a LeafB1; skip +// Leaf1, as described in Judy1.h regarding cJ1_JPLEAF1. + + +// COMMON CODE FRAGMENTS TO MINIMIZE REDUNDANCY BELOW: +// +// These are necessary to support performance by function and loop unrolling +// while avoiding huge amounts of nearly identical code. +// +// The differences between Judy1 and JudyL with respect to value area handling +// are just too large for completely common code between them... Oh well, some +// big ifdefs follow. However, even in the following ifdefd code, use cJU_*, +// JU_*, and Judy*() instead of cJ1_* / cJL_*, J1_* / JL_*, and +// Judy1*()/JudyL*(), for minimum diffs. +// +// Handle growth of cJU_JPIMMED_*_01 to cJU_JPIMMED_*_02, for an even or odd +// Index Size (cIS), given oldIndex, Index, and Pjll in the context: +// +// Put oldIndex and Index in their proper order. For odd indexes, must copy +// bytes. + +#ifdef JUDY1 + +#define JU_IMMSET_01_COPY_EVEN(ignore1,ignore2) \ + if (oldIndex < Index) { Pjll[0] = oldIndex; Pjll[1] = Index; } \ + else { Pjll[0] = Index; Pjll[1] = oldIndex; } + +#define JU_IMMSET_01_COPY_ODD(cIS,CopyWord) \ + if (oldIndex < Index) \ + { \ + CopyWord(Pjll + 0, oldIndex); \ + CopyWord(Pjll + (cIS), Index); \ + } \ + else \ + { \ + CopyWord(Pjll + 0, Index); \ + CopyWord(Pjll + (cIS), oldIndex); \ + } + +// The "real" *_01 Copy macro: +// +// Trim the high byte off Index, look for a match with the old Index, and if +// none, insert the new Index in the leaf in the correct place, given Pjp and +// Index in the context. +// +// Note: A single immediate index lives in the jp_DcdPopO field, but two or +// more reside starting at Pjp->jp_1Index. + +#define JU_IMMSET_01_COPY(cIS,LeafType,NewJPType,Copy,CopyWord) \ + { \ + LeafType Pjll; \ + Word_t oldIndex = JU_JPDCDPOP0(Pjp); \ + \ + Index = JU_TRIMTODCDSIZE(Index); \ + if (oldIndex == Index) return(0); \ + \ + Pjll = (LeafType) (Pjp->jp_1Index); \ + Copy(cIS,CopyWord); \ + DBGCODE(JudyCheckSorted(Pjll, 2, cIS);) \ + \ + Pjp->jp_Type = (NewJPType); \ + return(1); \ + } + +#else // JUDYL + +// Variations to also handle value areas; see comments above: +// +// For JudyL, Pjv (start of value area) and oldValue are also in the context; +// leave Pjv set to the value area for Index. + +#define JU_IMMSET_01_COPY_EVEN(cIS,CopyWord) \ + if (oldIndex < Index) \ + { \ + Pjll[0] = oldIndex; \ + Pjv [0] = oldValue; \ + Pjll[1] = Index; \ + ++Pjv; \ + } \ + else \ + { \ + Pjll[0] = Index; \ + Pjll[1] = oldIndex; \ + Pjv [1] = oldValue; \ + } + +#define JU_IMMSET_01_COPY_ODD(cIS,CopyWord) \ + if (oldIndex < Index) \ + { \ + CopyWord(Pjll + 0, oldIndex); \ + CopyWord(Pjll + (cIS), Index); \ + Pjv[0] = oldValue; \ + ++Pjv; \ + } \ + else \ + { \ + CopyWord(Pjll + 0, Index); \ + CopyWord(Pjll + (cIS), oldIndex); \ + Pjv[1] = oldValue; \ + } + +// The old value area is in the first word (*Pjp), and Pjv and Pjpm are also in +// the context. Also, unlike Judy1, indexes remain in word 2 (jp_LIndex), +// meaning insert-in-place rather than copy. +// +// Return jpm_PValue pointing to Indexs value area. If Index is new, allocate +// a 2-value-leaf and attach it to the JP. 
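+
+// Editor's note (worked example, not upstream Judy code): on 32-bit JudyL,
+// per the table above, a 1-byte expanse evolves as 1_01 => 1_02 => 1_03 =>
+// Leaf1: the first insert stores its value in the JP itself (jp_Addr); the
+// second allocates a 2-entry value leaf via j__udyLAllocJV() and moves the
+// old value into it; the third grows that value leaf; and the fourth insert
+// cascades out of Immeds into a real Leaf1.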
+ +#define JU_IMMSET_01_COPY(cIS,LeafType,NewJPType,Copy,CopyWord) \ + { \ + LeafType Pjll; \ + Word_t oldIndex = JU_JPDCDPOP0(Pjp); \ + Word_t oldValue; \ + Pjv_t PjvRaw; \ + Pjv_t Pjv; \ + \ + Index = JU_TRIMTODCDSIZE(Index); \ + \ + if (oldIndex == Index) \ + { \ + Pjpm->jpm_PValue = (Pjv_t) Pjp; \ + return(0); \ + } \ + \ + if ((PjvRaw = j__udyLAllocJV(2, Pjpm)) == (Pjv_t) NULL) \ + return(-1); \ + Pjv = P_JV(PjvRaw); \ + \ + oldValue = Pjp->jp_Addr; \ + (Pjp->jp_Addr) = (Word_t) PjvRaw; \ + Pjll = (LeafType) (Pjp->jp_LIndex); \ + \ + Copy(cIS,CopyWord); \ + DBGCODE(JudyCheckSorted(Pjll, 2, cIS);) \ + \ + Pjp->jp_Type = (NewJPType); \ + *Pjv = 0; \ + Pjpm->jpm_PValue = Pjv; \ + return(1); \ + } + +// The following is a unique mix of JU_IMMSET_01() and JU_IMMSETCASCADE() for +// going from cJU_JPIMMED_*_01 directly to a cJU_JPLEAF* for JudyL: +// +// If Index is not already set, allocate a leaf, copy the old and new indexes +// into it, clear and return the new value area, and modify the current JP. +// Note that jp_DcdPop is set to a pop0 of 0 for now, and incremented later. + + +#define JU_IMMSET_01_CASCADE(cIS,LeafType,NewJPType,ValueArea, \ + Copy,CopyWord,Alloc) \ + { \ + Word_t D_P0; \ + LeafType PjllRaw; \ + LeafType Pjll; \ + Word_t oldIndex = JU_JPDCDPOP0(Pjp); \ + Word_t oldValue; \ + Pjv_t Pjv; \ + \ + Index = JU_TRIMTODCDSIZE(Index); \ + \ + if (oldIndex == Index) \ + { \ + Pjpm->jpm_PValue = (Pjv_t) (&(Pjp->jp_Addr)); \ + return(0); \ + } \ + \ + if ((PjllRaw = (LeafType) Alloc(2, Pjpm)) == (LeafType) NULL) \ + return(-1); \ + Pjll = (LeafType) P_JLL(PjllRaw); \ + Pjv = ValueArea(Pjll, 2); \ + \ + oldValue = Pjp->jp_Addr; \ + \ + Copy(cIS,CopyWord); \ + DBGCODE(JudyCheckSorted(Pjll, 2, cIS);) \ + \ + *Pjv = 0; \ + Pjpm->jpm_PValue = Pjv; \ + D_P0 = Index & cJU_DCDMASK(cIS); /* pop0 = 0 */ \ + JU_JPSETADT(Pjp, (Word_t)PjllRaw, D_P0, NewJPType); \ + \ + return(1); \ + } + +#endif // JUDYL + +// Handle growth of cJU_JPIMMED_*_[02..15]: + +#ifdef JUDY1 + +// Insert an Index into an immediate JP that has room for more, if the Index is +// not already present; given Pjp, Index, exppop1, Pjv, and Pjpm in the +// context: +// +// Note: Use this only when the JP format doesnt change, that is, going from +// cJU_JPIMMED_X_0Y to cJU_JPIMMED_X_0Z, where X >= 2 and Y+1 = Z. +// +// Note: Incrementing jp_Type is how to increase the Index population. + +#define JU_IMMSETINPLACE(cIS,LeafType,BaseJPType_02,Search,InsertInPlace) \ + { \ + LeafType Pjll; \ + int offset; \ + \ + exppop1 = JU_JPTYPE(Pjp) - (BaseJPType_02) + 2; \ + offset = Search((Pjll_t) (Pjp->jp_1Index), exppop1, Index); \ + \ + JU_CHECK_IF_EXISTS(offset, ignore, Pjpm); \ + \ + Pjll = (LeafType) (Pjp->jp_1Index); \ + InsertInPlace(Pjll, exppop1, offset, Index); \ + DBGCODE(JudyCheckSorted(Pjll, exppop1 + 1, cIS);) \ + ++(Pjp->jp_Type); \ + return(1); \ + } + +// Insert an Index into an immediate JP that has no room for more: +// +// If the Index is not already present, do a cascade (to a leaf); given Pjp, +// Index, Pjv, and Pjpm in the context. 
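+
+// Editor's note (added commentary, not upstream Judy code): an Immed's
+// population is encoded in its jp_Type (cJU_JPIMMED_1_02 vs cJU_JPIMMED_1_03,
+// and so on), which is why JU_IMMSETINPLACE() can grow an Immed with just an
+// insert plus ++(Pjp->jp_Type); only when the Immed is at its maximum does
+// JU_IMMSETCASCADE() below allocate a real leaf and rewrite the whole JP.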
+ + +#define JU_IMMSETCASCADE(cIS,OldPop1,LeafType,NewJPType, \ + ignore,Search,InsertCopy,Alloc) \ + { \ + Word_t D_P0; \ + Pjll_t PjllRaw; \ + Pjll_t Pjll; \ + int offset; \ + \ + offset = Search((Pjll_t) (Pjp->jp_1Index), (OldPop1), Index); \ + JU_CHECK_IF_EXISTS(offset, ignore, Pjpm); \ + \ + if ((PjllRaw = Alloc((OldPop1) + 1, Pjpm)) == 0) return(-1); \ + Pjll = P_JLL(PjllRaw); \ + \ + InsertCopy((LeafType) Pjll, (LeafType) (Pjp->jp_1Index), \ + OldPop1, offset, Index); \ + DBGCODE(JudyCheckSorted(Pjll, (OldPop1) + 1, cIS);) \ + \ + D_P0 = (Index & cJU_DCDMASK(cIS)) + (OldPop1) - 1; \ + JU_JPSETADT(Pjp, (Word_t)PjllRaw, D_P0, NewJPType); \ + return(1); \ + } + +#else // JUDYL + +// Variations to also handle value areas; see comments above: +// +// For JudyL, Pjv (start of value area) is also in the context. +// +// TBD: This code makes a true but weak assumption that a JudyL 32-bit 2-index +// value area must be copied to a new 3-index value area. AND it doesnt know +// anything about JudyL 64-bit cases (cJU_JPIMMED_1_0[3-7] only) where the +// value area can grow in place! However, this should not break it, just slow +// it down. + +#define JU_IMMSETINPLACE(cIS,LeafType,BaseJPType_02,Search,InsertInPlace) \ + { \ + LeafType Pleaf; \ + int offset; \ + Pjv_t PjvRaw; \ + Pjv_t Pjv; \ + Pjv_t PjvnewRaw; \ + Pjv_t Pjvnew; \ + \ + exppop1 = JU_JPTYPE(Pjp) - (BaseJPType_02) + 2; \ + offset = Search((Pjll_t) (Pjp->jp_LIndex), exppop1, Index); \ + PjvRaw = (Pjv_t) (Pjp->jp_Addr); \ + Pjv = P_JV(PjvRaw); \ + \ + JU_CHECK_IF_EXISTS(offset, Pjv, Pjpm); \ + \ + if ((PjvnewRaw = j__udyLAllocJV(exppop1 + 1, Pjpm)) \ + == (Pjv_t) NULL) return(-1); \ + Pjvnew = P_JV(PjvnewRaw); \ + \ + Pleaf = (LeafType) (Pjp->jp_LIndex); \ + \ + InsertInPlace(Pleaf, exppop1, offset, Index); \ + /* see TBD above about this: */ \ + JU_INSERTCOPY(Pjvnew, Pjv, exppop1, offset, 0); \ + DBGCODE(JudyCheckSorted(Pleaf, exppop1 + 1, cIS);) \ + j__udyLFreeJV(PjvRaw, exppop1, Pjpm); \ + Pjp->jp_Addr = (Word_t) PjvnewRaw; \ + Pjpm->jpm_PValue = Pjvnew + offset; \ + \ + ++(Pjp->jp_Type); \ + return(1); \ + } + +#define JU_IMMSETCASCADE(cIS,OldPop1,LeafType,NewJPType, \ + ValueArea,Search,InsertCopy,Alloc) \ + { \ + Word_t D_P0; \ + Pjll_t PjllRaw; \ + Pjll_t Pjll; \ + int offset; \ + Pjv_t PjvRaw; \ + Pjv_t Pjv; \ + Pjv_t Pjvnew; \ + \ + PjvRaw = (Pjv_t) (Pjp->jp_Addr); \ + Pjv = P_JV(PjvRaw); \ + offset = Search((Pjll_t) (Pjp->jp_LIndex), (OldPop1), Index); \ + JU_CHECK_IF_EXISTS(offset, Pjv, Pjpm); \ + \ + if ((PjllRaw = Alloc((OldPop1) + 1, Pjpm)) == 0) \ + return(-1); \ + Pjll = P_JLL(PjllRaw); \ + InsertCopy((LeafType) Pjll, (LeafType) (Pjp->jp_LIndex), \ + OldPop1, offset, Index); \ + DBGCODE(JudyCheckSorted(Pjll, (OldPop1) + 1, cIS);) \ + \ + Pjvnew = ValueArea(Pjll, (OldPop1) + 1); \ + JU_INSERTCOPY(Pjvnew, Pjv, OldPop1, offset, 0); \ + j__udyLFreeJV(PjvRaw, (OldPop1), Pjpm); \ + Pjpm->jpm_PValue = Pjvnew + offset; \ + \ + D_P0 = (Index & cJU_DCDMASK(cIS)) + (OldPop1) - 1; \ + JU_JPSETADT(Pjp, (Word_t)PjllRaw, D_P0, NewJPType); \ + return(1); \ + } + +#endif // JUDYL + +// Common convenience/shorthand wrappers around JU_IMMSET_01_COPY() for +// even/odd index sizes: + +#define JU_IMMSET_01( cIS, LeafType, NewJPType) \ + JU_IMMSET_01_COPY(cIS, LeafType, NewJPType, JU_IMMSET_01_COPY_EVEN, \ + ignore) + +#define JU_IMMSET_01_ODD( cIS, NewJPType, CopyWord) \ + JU_IMMSET_01_COPY(cIS, uint8_t *, NewJPType, JU_IMMSET_01_COPY_ODD, \ + CopyWord) + + +// END OF MACROS; IMMED CASES START HERE: + +// cJU_JPIMMED_*_01 cases: +// +// 
1_01 always leads to 1_02: +// +// (1_01 => 1_02 => 1_03 => [ 1_04 => ... => 1_07 => [ 1_08..15 => ]] LeafL) + + case cJU_JPIMMED_1_01: JU_IMMSET_01(1, uint8_t *, cJU_JPIMMED_1_02); + +// 2_01 leads to 2_02, and 3_01 leads to 3_02, except for JudyL 32-bit, where +// they lead to a leaf: +// +// (2_01 => [ 2_02 => 2_03 => [ 2_04..07 => ]] LeafL) +// (3_01 => [ 3_02 => [ 3_03..05 => ]] LeafL) + +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_2_01: JU_IMMSET_01(2, uint16_t *, cJU_JPIMMED_2_02); + case cJU_JPIMMED_3_01: JU_IMMSET_01_ODD (3, cJU_JPIMMED_3_02, + JU_COPY3_LONG_TO_PINDEX); +#else + case cJU_JPIMMED_2_01: + JU_IMMSET_01_CASCADE(2, uint16_t *, cJU_JPLEAF2, JL_LEAF2VALUEAREA, + JU_IMMSET_01_COPY_EVEN, ignore, + j__udyAllocJLL2); + case cJU_JPIMMED_3_01: + JU_IMMSET_01_CASCADE(3, uint8_t *, cJU_JPLEAF3, JL_LEAF3VALUEAREA, + JU_IMMSET_01_COPY_ODD, + JU_COPY3_LONG_TO_PINDEX, j__udyAllocJLL3); +#endif + +#ifdef JU_64BIT + +// [4-7]_01 lead to [4-7]_02 for Judy1, and to leaves for JudyL: +// +// (4_01 => [[ 4_02..03 => ]] LeafL) +// (5_01 => [[ 5_02..03 => ]] LeafL) +// (6_01 => [[ 6_02 => ]] LeafL) +// (7_01 => [[ 7_02 => ]] LeafL) + +#ifdef JUDY1 + case cJU_JPIMMED_4_01: JU_IMMSET_01(4, uint32_t *, cJ1_JPIMMED_4_02); + case cJU_JPIMMED_5_01: JU_IMMSET_01_ODD(5, cJ1_JPIMMED_5_02, + JU_COPY5_LONG_TO_PINDEX); + case cJU_JPIMMED_6_01: JU_IMMSET_01_ODD(6, cJ1_JPIMMED_6_02, + JU_COPY6_LONG_TO_PINDEX); + case cJU_JPIMMED_7_01: JU_IMMSET_01_ODD(7, cJ1_JPIMMED_7_02, + JU_COPY7_LONG_TO_PINDEX); +#else // JUDYL + case cJU_JPIMMED_4_01: + JU_IMMSET_01_CASCADE(4, uint32_t *, cJU_JPLEAF4, JL_LEAF4VALUEAREA, + JU_IMMSET_01_COPY_EVEN, ignore, + j__udyAllocJLL4); + case cJU_JPIMMED_5_01: + JU_IMMSET_01_CASCADE(5, uint8_t *, cJU_JPLEAF5, JL_LEAF5VALUEAREA, + JU_IMMSET_01_COPY_ODD, + JU_COPY5_LONG_TO_PINDEX, j__udyAllocJLL5); + case cJU_JPIMMED_6_01: + JU_IMMSET_01_CASCADE(6, uint8_t *, cJU_JPLEAF6, JL_LEAF6VALUEAREA, + JU_IMMSET_01_COPY_ODD, + JU_COPY6_LONG_TO_PINDEX, j__udyAllocJLL6); + case cJU_JPIMMED_7_01: + JU_IMMSET_01_CASCADE(7, uint8_t *, cJU_JPLEAF7, JL_LEAF7VALUEAREA, + JU_IMMSET_01_COPY_ODD, + JU_COPY7_LONG_TO_PINDEX, j__udyAllocJLL7); +#endif // JUDYL +#endif // JU_64BIT + +// cJU_JPIMMED_1_* cases that can grow in place: +// +// (1_01 => 1_02 => 1_03 => [ 1_04 => ... => 1_07 => [ 1_08..15 => ]] LeafL) + + case cJU_JPIMMED_1_02: +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_1_03: + case cJU_JPIMMED_1_04: + case cJU_JPIMMED_1_05: + case cJU_JPIMMED_1_06: +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJU_JPIMMED_1_07: + case cJ1_JPIMMED_1_08: + case cJ1_JPIMMED_1_09: + case cJ1_JPIMMED_1_10: + case cJ1_JPIMMED_1_11: + case cJ1_JPIMMED_1_12: + case cJ1_JPIMMED_1_13: + case cJ1_JPIMMED_1_14: +#endif + JU_IMMSETINPLACE(1, uint8_t *, cJU_JPIMMED_1_02, j__udySearchLeaf1, + JU_INSERTINPLACE); + +// cJU_JPIMMED_1_* cases that must cascade: +// +// (1_01 => 1_02 => 1_03 => [ 1_04 => ... => 1_07 => [ 1_08..15 => ]] LeafL) + +#if (defined(JUDYL) && (! defined(JU_64BIT))) + case cJU_JPIMMED_1_03: + JU_IMMSETCASCADE(1, 3, uint8_t *, cJU_JPLEAF1, JL_LEAF1VALUEAREA, + j__udySearchLeaf1, JU_INSERTCOPY, + j__udyAllocJLL1); +#endif +#if (defined(JUDY1) && (! 
defined(JU_64BIT))) + case cJU_JPIMMED_1_07: + JU_IMMSETCASCADE(1, 7, uint8_t *, cJU_JPLEAF1, ignore, + j__udySearchLeaf1, JU_INSERTCOPY, + j__udyAllocJLL1); + +#endif +#if (defined(JUDYL) && defined(JU_64BIT)) + case cJU_JPIMMED_1_07: + JU_IMMSETCASCADE(1, 7, uint8_t *, cJU_JPLEAF1, JL_LEAF1VALUEAREA, + j__udySearchLeaf1, JU_INSERTCOPY, + j__udyAllocJLL1); + +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) +// Special case, as described above, go directly from Immed to LeafB1: + + case cJ1_JPIMMED_1_15: + { + Word_t DcdP0; + int offset; + Pjlb_t PjlbRaw; + Pjlb_t Pjlb; + + offset = j__udySearchLeaf1((Pjll_t) Pjp->jp_1Index, 15, Index); + + JU_CHECK_IF_EXISTS(offset, ignore, Pjpm); + +// Create a bitmap leaf (special case for Judy1 64-bit only, see usage): Set +// new Index in bitmap, copy an Immed1_15 to the bitmap, and set the parent JP +// EXCEPT jp_DcdPopO, leaving any followup to the caller: + + if ((PjlbRaw = j__udyAllocJLB1(Pjpm)) == (Pjlb_t) NULL) + return(-1); + Pjlb = P_JLB(PjlbRaw); + + JU_BITMAPSETL(Pjlb, Index); + + for (offset = 0; offset < 15; ++offset) + JU_BITMAPSETL(Pjlb, Pjp->jp_1Index[offset]); + +// Set jp_DcdPopO including the current pop0; incremented later: + DcdP0 = (Index & cJU_DCDMASK(1)) + 15 - 1; + JU_JPSETADT(Pjp, (Word_t)PjlbRaw, DcdP0, cJU_JPLEAF_B1); + + return(1); + } +#endif + +// cJU_JPIMMED_[2..7]_[02..15] cases that grow in place or cascade: +// +// (2_01 => [ 2_02 => 2_03 => [ 2_04..07 => ]] LeafL) + +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_2_02: +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJU_JPIMMED_2_03: + case cJ1_JPIMMED_2_04: + case cJ1_JPIMMED_2_05: + case cJ1_JPIMMED_2_06: +#endif +#if (defined(JUDY1) || defined(JU_64BIT)) + JU_IMMSETINPLACE(2, uint16_t *, cJU_JPIMMED_2_02, j__udySearchLeaf2, + JU_INSERTINPLACE); +#endif + +#undef OLDPOP1 +#if ((defined(JUDY1) && (! defined(JU_64BIT))) || (defined(JUDYL) && defined(JU_64BIT))) + case cJU_JPIMMED_2_03: +#define OLDPOP1 3 +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_2_07: +#define OLDPOP1 7 +#endif +#if (defined(JUDY1) || defined(JU_64BIT)) + JU_IMMSETCASCADE(2, OLDPOP1, uint16_t *, cJU_JPLEAF2, + JL_LEAF2VALUEAREA, j__udySearchLeaf2, + JU_INSERTCOPY, j__udyAllocJLL2); +#endif + +// (3_01 => [ 3_02 => [ 3_03..05 => ]] LeafL) + +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJU_JPIMMED_3_02: + case cJ1_JPIMMED_3_03: + case cJ1_JPIMMED_3_04: + + JU_IMMSETINPLACE(3, uint8_t *, cJU_JPIMMED_3_02, j__udySearchLeaf3, + JU_INSERTINPLACE3); +#endif + +#undef OLDPOP1 +#if ((defined(JUDY1) && (! 
defined(JU_64BIT))) || (defined(JUDYL) && defined(JU_64BIT))) + case cJU_JPIMMED_3_02: +#define OLDPOP1 2 +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_3_05: +#define OLDPOP1 5 +#endif +#if (defined(JUDY1) || defined(JU_64BIT)) + JU_IMMSETCASCADE(3, OLDPOP1, uint8_t *, cJU_JPLEAF3, + JL_LEAF3VALUEAREA, j__udySearchLeaf3, + JU_INSERTCOPY3, j__udyAllocJLL3); +#endif + +#if (defined(JUDY1) && defined(JU_64BIT)) + +// (4_01 => [[ 4_02..03 => ]] LeafL) + + case cJ1_JPIMMED_4_02: + + JU_IMMSETINPLACE(4, uint32_t *, cJ1_JPIMMED_4_02, j__udySearchLeaf4, + JU_INSERTINPLACE); + + case cJ1_JPIMMED_4_03: + + JU_IMMSETCASCADE(4, 3, uint32_t *, cJU_JPLEAF4, ignore, + j__udySearchLeaf4, JU_INSERTCOPY, + j__udyAllocJLL4); + +// (5_01 => [[ 5_02..03 => ]] LeafL) + + case cJ1_JPIMMED_5_02: + + JU_IMMSETINPLACE(5, uint8_t *, cJ1_JPIMMED_5_02, j__udySearchLeaf5, + JU_INSERTINPLACE5); + + case cJ1_JPIMMED_5_03: + + JU_IMMSETCASCADE(5, 3, uint8_t *, cJU_JPLEAF5, ignore, + j__udySearchLeaf5, JU_INSERTCOPY5, + j__udyAllocJLL5); + +// (6_01 => [[ 6_02 => ]] LeafL) + + case cJ1_JPIMMED_6_02: + + JU_IMMSETCASCADE(6, 2, uint8_t *, cJU_JPLEAF6, ignore, + j__udySearchLeaf6, JU_INSERTCOPY6, + j__udyAllocJLL6); + +// (7_01 => [[ 7_02 => ]] LeafL) + + case cJ1_JPIMMED_7_02: + + JU_IMMSETCASCADE(7, 2, uint8_t *, cJU_JPLEAF7, ignore, + j__udySearchLeaf7, JU_INSERTCOPY7, + j__udyAllocJLL7); + +#endif // (JUDY1 && JU_64BIT) + + +// **************************************************************************** +// INVALID JP TYPE: + + default: JU_SET_ERRNO_NONNULL(Pjpm, JU_ERRNO_CORRUPT); return(-1); + + } // switch on JP type + + { + +#ifdef SUBEXPCOUNTS + +// This code might seem strange here. However it saves some memory read time +// during insert (~70nS) because a pipelined processor does not need to "stall" +// waiting for the memory read to complete. Hope the compiler is not too smart +// or dumb and moves the code down to where it looks like it belongs (below a +// few lines). + + Word_t SubExpCount = 0; // current subexpanse counter. + + if (PSubExp != (PWord_t) NULL) // only if BranchB/U. + SubExpCount = PSubExp[0]; +#endif + +// PROCESS JP -- RECURSIVELY: +// +// For non-Immed JP types, if successful, post-increment the population count +// at this Level. + + retcode = j__udyInsWalk(Pjp, Index, Pjpm); + +// Successful insert, increment JP and subexpanse count: + + if ((JU_JPTYPE(Pjp) < cJU_JPIMMED_1_01) && (retcode == 1)) + { + jp_t JP; + Word_t DcdP0; +#ifdef SUBEXPCOUNTS + +// Note: Pjp must be a pointer to a BranchB/U: + + if (PSubExp != (PWord_t) NULL) PSubExp[0] = SubExpCount + 1; +#endif + + JP = *Pjp; + DcdP0 = JU_JPDCDPOP0(Pjp) + 1; + JU_JPSETADT(Pjp, JP.jp_Addr, DcdP0, JU_JPTYPE(&JP)); + } + } + return(retcode); + +} // j__udyInsWalk() + + +// **************************************************************************** +// J U D Y 1 S E T +// J U D Y L I N S +// +// Main entry point. See the manual entry for details. + +#ifdef JUDY1 +FUNCTION int Judy1Set +#else +FUNCTION PPvoid_t JudyLIns +#endif + ( + PPvoid_t PPArray, // in which to insert. + Word_t Index, // to insert. + PJError_t PJError // optional, for returning error info. + ) +{ +#ifdef JUDY1 +#define Pjv ignore // placeholders for macros. +#define Pjvnew ignore +#else + Pjv_t Pjv; // value area in old leaf. + Pjv_t Pjvnew; // value area in new leaf. +#endif + Pjpm_t Pjpm; // array-global info. + int offset; // position in which to store new Index. 
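+
+// Editor's note (added commentary, not upstream Judy code): the dispatch
+// below handles three root states in order -- a null root pointer (build a
+// one-index root-level leaf), a root-level LEAFW (grow it, cascading into a
+// JPM-rooted tree only when full), and an existing JPM-rooted tree (recurse
+// via j__udyInsWalk()). Hypothetical callers, using the public API:
+//
+// int rc = Judy1Set(&PJ1Array, Index, PJE0); // 1 = new, 0 = present.
+// PWord_t PV = (PWord_t) JudyLIns(&PJLArray, Index, PJE0);
+// *PV = Value; // store the value via the returned pointer.
+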
+ Pjlw_t Pjlw; + + +// CHECK FOR NULL POINTER (error by caller): + + if (PPArray == (PPvoid_t) NULL) + { + JU_SET_ERRNO(PJError, JU_ERRNO_NULLPPARRAY); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + + Pjlw = P_JLW(*PPArray); // first word of leaf. + +// **************************************************************************** +// PROCESS TOP LEVEL "JRP" BRANCHES AND LEAVES: + +// **************************************************************************** +// JRPNULL (EMPTY ARRAY): BUILD A LEAFW WITH ONE INDEX: + +// if a valid empty array (null pointer), so create an array of population == 1: + + if (Pjlw == (Pjlw_t)NULL) + { + Pjlw_t Pjlwnew; + + Pjlwnew = j__udyAllocJLW(1); + JUDY1CODE(JU_CHECKALLOC(Pjlw_t, Pjlwnew, JERRI );) + JUDYLCODE(JU_CHECKALLOC(Pjlw_t, Pjlwnew, PPJERR);) + + Pjlwnew[0] = 1 - 1; // pop0 = 0. + Pjlwnew[1] = Index; + + *PPArray = (Pvoid_t) Pjlwnew; + DBGCODE(JudyCheckPop(*PPArray);) + + JUDY1CODE(return(1); ) + JUDYLCODE(Pjlwnew[2] = 0; ) // value area. + JUDYLCODE(return((PPvoid_t) (Pjlwnew + 2)); ) + + } // NULL JRP + +// **************************************************************************** +// LEAFW, OTHER SIZE: + + if (JU_LEAFW_POP0(*PPArray) < cJU_LEAFW_MAXPOP1) // must be a LEAFW + { + Pjlw_t Pjlwnew; + Word_t pop1; + + Pjlw = P_JLW(*PPArray); // first word of leaf. + pop1 = Pjlw[0] + 1; + +#ifdef JUDYL + Pjv = JL_LEAFWVALUEAREA(Pjlw, pop1); +#endif + offset = j__udySearchLeafW(Pjlw + 1, pop1, Index); + + if (offset >= 0) // index is already valid: + { + DBGCODE(JudyCheckPop(*PPArray);) + JUDY1CODE(return(0); ) + JUDYLCODE(return((PPvoid_t) (Pjv + offset)); ) + } + + offset = ~offset; + +// Insert index in cases where no new memory is needed: + + if (JU_LEAFWGROWINPLACE(pop1)) + { + ++Pjlw[0]; // increase population. + + JU_INSERTINPLACE(Pjlw + 1, pop1, offset, Index); +#ifdef JUDYL + JU_INSERTINPLACE(Pjv, pop1, offset, 0); +#endif + DBGCODE(JudyCheckPop(*PPArray);) + DBGCODE(JudyCheckSorted(Pjlw + 1, pop1 + 1, cJU_ROOTSTATE);) + + JUDY1CODE(return(1); ) + JUDYLCODE(return((PPvoid_t) (Pjv + offset)); ) + } + +// Insert index into a new, larger leaf: + + if (pop1 < cJU_LEAFW_MAXPOP1) // can grow to a larger leaf. + { + Pjlwnew = j__udyAllocJLW(pop1 + 1); + JUDY1CODE(JU_CHECKALLOC(Pjlw_t, Pjlwnew, JERRI );) + JUDYLCODE(JU_CHECKALLOC(Pjlw_t, Pjlwnew, PPJERR);) + + Pjlwnew[0] = pop1; // set pop0 in new leaf. + + JU_INSERTCOPY(Pjlwnew + 1, Pjlw + 1, pop1, offset, Index); +#ifdef JUDYL + Pjvnew = JL_LEAFWVALUEAREA(Pjlwnew, pop1 + 1); + JU_INSERTCOPY(Pjvnew, Pjv, pop1, offset, 0); +#endif + DBGCODE(JudyCheckSorted(Pjlwnew + 1, pop1 + 1, cJU_ROOTSTATE);) + + j__udyFreeJLW(Pjlw, pop1, NULL); + + *PPArray = (Pvoid_t) Pjlwnew; + DBGCODE(JudyCheckPop(*PPArray);) + + JUDY1CODE(return(1); ) + JUDYLCODE(return((PPvoid_t) (Pjvnew + offset)); ) + } + + assert(pop1 == cJU_LEAFW_MAXPOP1); + +// Leaf at max size => cannot insert new index, so cascade instead: +// +// Upon cascading from a LEAFW leaf to the first branch, must allocate and +// initialize a JPM. 
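+
+// Editor's note (added commentary, not upstream Judy code): this is the
+// one-time promotion of a root-level leaf into a real tree: the JPM becomes
+// the new root object, j__udyCascadeL() splits the full LEAFW under the
+// JPM's top JP, and the BRANCH code below then performs the actual insert of
+// Index by walking the new tree.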
+ + Pjpm = j__udyAllocJPM(); + JUDY1CODE(JU_CHECKALLOC(Pjpm_t, Pjpm, JERRI );) + JUDYLCODE(JU_CHECKALLOC(Pjpm_t, Pjpm, PPJERR);) + + (Pjpm->jpm_Pop0) = cJU_LEAFW_MAXPOP1 - 1; + (Pjpm->jpm_JP.jp_Addr) = (Word_t) Pjlw; + + if (j__udyCascadeL(&(Pjpm->jpm_JP), Pjpm) == -1) + { + JU_COPY_ERRNO(PJError, Pjpm); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + +// Note: No need to pass Pjpm for memory decrement; LEAFW memory is never +// counted in a JPM at all: + + j__udyFreeJLW(Pjlw, cJU_LEAFW_MAXPOP1, NULL); + *PPArray = (Pvoid_t) Pjpm; + + } // JU_LEAFW + +// **************************************************************************** +// BRANCH: + + { + int retcode; // really only needed for Judy1, but free for JudyL. + + Pjpm = P_JPM(*PPArray); + retcode = j__udyInsWalk(&(Pjpm->jpm_JP), Index, Pjpm); + + if (retcode == -1) + { + JU_COPY_ERRNO(PJError, Pjpm); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + + if (retcode == 1) ++(Pjpm->jpm_Pop0); // incr total array popu. + + assert(((Pjpm->jpm_JP.jp_Type) == cJU_JPBRANCH_L) + || ((Pjpm->jpm_JP.jp_Type) == cJU_JPBRANCH_B) + || ((Pjpm->jpm_JP.jp_Type) == cJU_JPBRANCH_U)); + DBGCODE(JudyCheckPop(*PPArray);) + +#ifdef JUDY1 + assert((retcode == 0) || (retcode == 1)); + return(retcode); // == JU_RET_*_JPM(). +#else + assert(Pjpm->jpm_PValue != (Pjv_t) NULL); + return((PPvoid_t) Pjpm->jpm_PValue); +#endif + } + /*NOTREACHED*/ + +} // Judy1Set() / JudyLIns() diff --git a/src/libnetdata/libjudy/src/JudyL/JudyLInsArray.c b/src/libnetdata/libjudy/src/JudyL/JudyLInsArray.c new file mode 100644 index 00000000..f8e361f2 --- /dev/null +++ b/src/libnetdata/libjudy/src/JudyL/JudyLInsArray.c @@ -0,0 +1,1178 @@ +// Copyright (C) 2000 - 2002 Hewlett-Packard Company +// +// This program is free software; you can redistribute it and/or modify it +// under the term of the GNU Lesser General Public License as published by the +// Free Software Foundation; either version 2 of the License, or (at your +// option) any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License +// for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this program; if not, write to the Free Software Foundation, +// Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// _________________ + +// TBD: It would probably be faster for the caller if the JudyL version took +// PIndex as an interleaved array of indexes and values rather than just +// indexes with a separate values array (PValue), especially considering +// indexes and values are copied here with for-loops anyway and not the +// equivalent of memcpy(). All code could be revised to simply count by two +// words for JudyL? Supports "streaming" the data to/from disk better later? +// In which case get rid of JU_ERRNO_NULLPVALUE, no longer needed, and simplify +// the API to this code. +// _________________ + +// @(#) $Revision: 4.21 $ $Source: /judy/src/JudyCommon/JudyInsArray.c $ +// +// Judy1SetArray() and JudyLInsArray() functions for Judy1 and JudyL. +// Compile with one of -DJUDY1 or -DJUDYL. + +#if (! (defined(JUDY1) || defined(JUDYL))) +#error: One of -DJUDY1 or -DJUDYL must be specified. 
+#endif
+
+#ifdef JUDY1
+#include "Judy1.h"
+#else
+#include "JudyL.h"
+#endif
+
+#include "JudyPrivate1L.h"
+
+DBGCODE(extern void JudyCheckPop(Pvoid_t PArray);)
+
+
+// IMMED AND LEAF SIZE AND BRANCH TYPE ARRAYS:
+//
+// These support fast and easy lookup by level.
+
+static uint8_t immed_maxpop1[] = {
+        0,
+        cJU_IMMED1_MAXPOP1,
+        cJU_IMMED2_MAXPOP1,
+        cJU_IMMED3_MAXPOP1,
+#ifdef JU_64BIT
+        cJU_IMMED4_MAXPOP1,
+        cJU_IMMED5_MAXPOP1,
+        cJU_IMMED6_MAXPOP1,
+        cJU_IMMED7_MAXPOP1,
+#endif
+        // note: There are no IMMEDs for whole words.
+};
+
+static uint8_t leaf_maxpop1[] = {
+        0,
+#if (defined(JUDYL) || (! defined(JU_64BIT)))
+        cJU_LEAF1_MAXPOP1,
+#else
+        0,                      // 64-bit Judy1 has no Leaf1.
+#endif
+        cJU_LEAF2_MAXPOP1,
+        cJU_LEAF3_MAXPOP1,
+#ifdef JU_64BIT
+        cJU_LEAF4_MAXPOP1,
+        cJU_LEAF5_MAXPOP1,
+        cJU_LEAF6_MAXPOP1,
+        cJU_LEAF7_MAXPOP1,
+#endif
+        // note: Root-level leaves are handled differently.
+};
+
+static uint8_t branchL_JPtype[] = {
+        0,
+        0,
+        cJU_JPBRANCH_L2,
+        cJU_JPBRANCH_L3,
+#ifdef JU_64BIT
+        cJU_JPBRANCH_L4,
+        cJU_JPBRANCH_L5,
+        cJU_JPBRANCH_L6,
+        cJU_JPBRANCH_L7,
+#endif
+        cJU_JPBRANCH_L,
+};
+
+static uint8_t branchB_JPtype[] = {
+        0,
+        0,
+        cJU_JPBRANCH_B2,
+        cJU_JPBRANCH_B3,
+#ifdef JU_64BIT
+        cJU_JPBRANCH_B4,
+        cJU_JPBRANCH_B5,
+        cJU_JPBRANCH_B6,
+        cJU_JPBRANCH_B7,
+#endif
+        cJU_JPBRANCH_B,
+};
+
+static uint8_t branchU_JPtype[] = {
+        0,
+        0,
+        cJU_JPBRANCH_U2,
+        cJU_JPBRANCH_U3,
+#ifdef JU_64BIT
+        cJU_JPBRANCH_U4,
+        cJU_JPBRANCH_U5,
+        cJU_JPBRANCH_U6,
+        cJU_JPBRANCH_U7,
+#endif
+        cJU_JPBRANCH_U,
+};
+
+// Subexpanse masks are similar to JU_DCDMASK() but without the need to clear
+// the first digit's bits. Avoid doing variable shifts by precomputing a
+// lookup array.
+
+static Word_t subexp_mask[] = {
+        0,
+        ~cJU_POP0MASK(1),
+        ~cJU_POP0MASK(2),
+        ~cJU_POP0MASK(3),
+#ifdef JU_64BIT
+        ~cJU_POP0MASK(4),
+        ~cJU_POP0MASK(5),
+        ~cJU_POP0MASK(6),
+        ~cJU_POP0MASK(7),
+#endif
+};
+
+
+// FUNCTION PROTOTYPES:
+
+static bool_t j__udyInsArray(Pjp_t PjpParent, int Level, PWord_t PPop1,
+                             PWord_t PIndex,
+#ifdef JUDYL
+                             Pjv_t PValue,
+#endif
+                             Pjpm_t Pjpm);
+
+
+// ****************************************************************************
+// J U D Y 1 S E T A R R A Y
+// J U D Y L I N S A R R A Y
+//
+// Main entry point. See the manual entry for external overview.
+//
+// TBD: Until that's written, note that the function returns 1 for success or
+// JERRI for serious error, including insufficient memory to build whole array;
+// use Judy*Count() to see how many were stored, the first N of the total
+// Count. Also, since it takes Count == Pop1, it cannot handle a full array.
+// Also, "sorted" means ascending without duplicates, otherwise you get the
+// "unsorted" error.
+//
+// The purpose of these functions is to allow rapid construction of a large
+// Judy array given a sorted list of indexes (and for JudyL, corresponding
+// values). At least one customer saw this as useful, and probably it would
+// also be useful as a sufficient workaround for fast(er) unload/reload to/from
+// disk.
+//
+// This code is written recursively for simplicity, until/unless someone
+// decides to make it faster and more complex. Hopefully recursion is fast
+// enough simply because the function is so much faster than a series of
+// Set/Ins calls.
+
+#ifdef JUDY1
+FUNCTION int Judy1SetArray
+#else
+FUNCTION int JudyLInsArray
+#endif
+        (
+        PPvoid_t  PPArray,      // in which to insert, initially empty.
+        Word_t    Count,        // number of indexes (and values) to insert.
+const Word_t * const PIndex, // list of indexes to insert. +#ifdef JUDYL +const Word_t * const PValue, // list of corresponding values. +#endif + PJError_t PJError // optional, for returning error info. + ) +{ + Pjlw_t Pjlw; // new root-level leaf. + Pjlw_t Pjlwindex; // first index in root-level leaf. + int offset; // in PIndex. + + +// CHECK FOR NULL OR NON-NULL POINTER (error by caller): + + if (PPArray == (PPvoid_t) NULL) + { JU_SET_ERRNO(PJError, JU_ERRNO_NULLPPARRAY); return(JERRI); } + + if (*PPArray != (Pvoid_t) NULL) + { JU_SET_ERRNO(PJError, JU_ERRNO_NONNULLPARRAY); return(JERRI); } + + if (PIndex == (PWord_t) NULL) + { JU_SET_ERRNO(PJError, JU_ERRNO_NULLPINDEX); return(JERRI); } + +#ifdef JUDYL + if (PValue == (PWord_t) NULL) + { JU_SET_ERRNO(PJError, JU_ERRNO_NULLPVALUE); return(JERRI); } +#endif + + +// HANDLE LARGE COUNT (= POP1) (typical case): +// +// Allocate and initialize a JPM, set the root pointer to point to it, and then +// build the tree underneath it. + +// Common code for unusual error handling when no JPM available: + + if (Count > cJU_LEAFW_MAXPOP1) // too big for root-level leaf. + { + Pjpm_t Pjpm; // new, to allocate. + +// Allocate JPM: + + Pjpm = j__udyAllocJPM(); + JU_CHECKALLOC(Pjpm_t, Pjpm, JERRI); + *PPArray = (Pvoid_t) Pjpm; + +// Set some JPM fields: + + (Pjpm->jpm_Pop0) = Count - 1; + // note: (Pjpm->jpm_TotalMemWords) is now initialized. + +// Build Judy tree: +// +// In case of error save the final Count, possibly modified, unless modified to +// 0, in which case free the JPM itself: + + if (! j__udyInsArray(&(Pjpm->jpm_JP), cJU_ROOTSTATE, &Count, + (PWord_t) PIndex, +#ifdef JUDYL + (Pjv_t) PValue, +#endif + Pjpm)) + { + JU_COPY_ERRNO(PJError, Pjpm); + + if (Count) // partial success, adjust pop0: + { + (Pjpm->jpm_Pop0) = Count - 1; + } + else // total failure, free JPM: + { + j__udyFreeJPM(Pjpm, (Pjpm_t) NULL); + *PPArray = (Pvoid_t) NULL; + } + + DBGCODE(JudyCheckPop(*PPArray);) + return(JERRI); + } + + DBGCODE(JudyCheckPop(*PPArray);) + return(1); + + } // large count + + +// HANDLE SMALL COUNT (= POP1): +// +// First ensure indexes are in sorted order: + + for (offset = 1; offset < Count; ++offset) + { + if (PIndex[offset - 1] >= PIndex[offset]) + { JU_SET_ERRNO(PJError, JU_ERRNO_UNSORTED); return(JERRI); } + } + + if (Count == 0) return(1); // *PPArray remains null. + + { + Pjlw = j__udyAllocJLW(Count + 1); + JU_CHECKALLOC(Pjlw_t, Pjlw, JERRI); + *PPArray = (Pvoid_t) Pjlw; + Pjlw[0] = Count - 1; // set pop0. 
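+// Layout of the root-level leaf being built (illustrative, assuming
+// Count == 3; the copies happen just below, and JudyL appends a value
+// area after the indexes while Judy1 stores none):
+//
+//      Pjlw[0]    = 2                              // pop0 == Count - 1.
+//      Pjlw[1..3] = PIndex[0..2]                   // sorted, whole words.
+//      JL_LEAFWVALUEAREA(Pjlw, 3)[0..2] = values   // JudyL only.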
+ Pjlwindex = Pjlw + 1; + } + +// Copy whole-word indexes (and values) to the root-level leaf: + + JU_COPYMEM(Pjlwindex, PIndex, Count); +JUDYLCODE(JU_COPYMEM(JL_LEAFWVALUEAREA(Pjlw, Count), PValue, Count)); + + DBGCODE(JudyCheckPop(*PPArray);) + return(1); + +} // Judy1SetArray() / JudyLInsArray() + + +// **************************************************************************** +// __ J U D Y I N S A R R A Y +// +// Given: +// +// - a pointer to a JP +// +// - the JPs level in the tree, that is, the number of digits left to decode +// in the indexes under the JP (one less than the level of the JPM or branch +// in which the JP resides); cJU_ROOTSTATE on first entry (when JP is the one +// in the JPM), down to 1 for a Leaf1, LeafB1, or FullPop +// +// - a pointer to the number of indexes (and corresponding values) to store in +// this subtree, to modify in case of partial success +// +// - a list of indexes (and for JudyL, corresponding values) to store in this +// subtree +// +// - a JPM for tracking memory usage and returning errors +// +// Recursively build a subtree (immediate indexes, leaf, or branch with +// subtrees) and modify the JP accordingly. On the way down, build a BranchU +// (only) for any expanse with *PPop1 too high for a leaf; on the way out, +// convert the BranchU to a BranchL or BranchB if appropriate. Keep memory +// statistics in the JPM. +// +// Return TRUE for success, or FALSE with error information set in the JPM in +// case of error, in which case leave a partially constructed but healthy tree, +// and modify parent population counts on the way out. +// +// Note: Each call of this function makes all modifications to the PjpParent +// it receives; neither the parent nor child calls do this. + +FUNCTION static bool_t j__udyInsArray( + Pjp_t PjpParent, // parent JP in/under which to store. + int Level, // initial digits remaining to decode. + PWord_t PPop1, // number of indexes to store. + PWord_t PIndex, // list of indexes to store. +#ifdef JUDYL + Pjv_t PValue, // list of corresponding values. +#endif + Pjpm_t Pjpm) // for memory and errors. +{ + Pjp_t Pjp; // lower-level JP. + Word_t Pjbany; // any type of branch. + int levelsub; // actual, of Pjps node, <= Level. + Word_t pop1 = *PPop1; // fast local value. + Word_t pop1sub; // population of one subexpanse. + uint8_t JPtype; // current JP type. + uint8_t JPtype_null; // precomputed value for new branch. + jp_t JPnull; // precomputed for speed. + Pjbu_t PjbuRaw; // constructed BranchU. + Pjbu_t Pjbu; + int digit; // in BranchU. + Word_t digitmask; // for a digit in a BranchU. + Word_t digitshifted; // shifted to correct offset. + Word_t digitshincr; // increment for digitshifted. + int offset; // in PIndex, or a bitmap subexpanse. + int numJPs; // number non-null in a BranchU. + bool_t retval; // to return from this func. +JUDYLCODE(Pjv_t PjvRaw); // destination value area. +JUDYLCODE(Pjv_t Pjv); + + +// MACROS FOR COMMON CODE: +// +// Note: These use function and local parameters from the context. +// Note: Assume newly allocated memory is zeroed. + +// Indicate whether a sorted list of indexes in PIndex, based on the first and +// last indexes in the list using pop1, are in the same subexpanse between +// Level and L_evel: +// +// This can be confusing! Note that SAMESUBEXP(L) == TRUE means the indexes +// are the same through level L + 1, and it says nothing about level L and +// lower; they might be the same or they might differ. 
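+// Worked example with assumed 64-bit values: if PIndex[0] ==
+// 0x01020304aabbccdd and PIndex[pop1 - 1] == 0x01020304ffeeddcc, the two
+// indexes agree in their top four digits but differ in the level-4 digit
+// (0xaa versus 0xff), so SAMESUBEXP(4) is TRUE (same through level 5)
+// while SAMESUBEXP(3) is FALSE.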
+// +// Note: In principle SAMESUBEXP needs a mask for the digits from Level, +// inclusive, to L_evel, exclusive. But in practice, since the indexes are all +// known to be identical above Level, it just uses a mask for the digits +// through L_evel + 1; see subexp_mask[]. + +#define SAMESUBEXP(L_evel) \ + (! ((PIndex[0] ^ PIndex[pop1 - 1]) & subexp_mask[L_evel])) + +// Set PjpParent to a null JP appropriate for the level of the node to which it +// points, which is 1 less than the level of the node in which the JP resides, +// which is by definition Level: +// +// Note: This can set the JPMs JP to an invalid jp_Type, but it doesnt +// matter because the JPM is deleted by the caller. + +#define SETJPNULL_PARENT \ + JU_JPSETADT(PjpParent, 0, 0, cJU_JPNULL1 + Level - 1); + +// Variation to set a specified JP (in a branch being built) to a precomputed +// null JP: + +#define SETJPNULL(Pjp) *(Pjp) = JPnull + +// Handle complete (as opposed to partial) memory allocation failure: Set the +// parent JP to an appropriate null type (to leave a consistent tree), zero the +// callers population count, and return FALSE: +// +// Note: At Level == cJU_ROOTSTATE this sets the JPMs JPs jp_Type to a bogus +// value, but it doesnt matter because the JPM should be deleted by the +// caller. + +#define NOMEM { SETJPNULL_PARENT; *PPop1 = 0; return(FALSE); } + +// Allocate a Leaf1-N and save the address in Pjll; in case of failure, NOMEM: + +#define ALLOCLEAF(AllocLeaf) \ + if ((PjllRaw = AllocLeaf(pop1, Pjpm)) == (Pjll_t) NULL) NOMEM; \ + Pjll = P_JLL(PjllRaw); + +// Copy indexes smaller than words (and values which are whole words) from +// given arrays to immediate indexes or a leaf: +// +// TBD: These macros overlap with some of the code in JudyCascade.c; do some +// merging? That file has functions while these are macros. + +#define COPYTOLEAF_EVEN_SUB(Pjll,LeafType) \ + { \ + LeafType * P_leaf = (LeafType *) (Pjll); \ + Word_t p_op1 = pop1; \ + PWord_t P_Index = PIndex; \ + \ + assert(pop1 > 0); \ + \ + do { *P_leaf++ = *P_Index++; /* truncates */\ + } while (--(p_op1)); \ + } + +#define COPYTOLEAF_ODD_SUB(cLevel,Pjll,Copy) \ + { \ + uint8_t * P_leaf = (uint8_t *) (Pjll); \ + Word_t p_op1 = pop1; \ + PWord_t P_Index = PIndex; \ + \ + assert(pop1 > 0); \ + \ + do { \ + Copy(P_leaf, *P_Index); \ + P_leaf += (cLevel); ++P_Index; \ + } while (--(p_op1)); \ + } + +#ifdef JUDY1 + +#define COPYTOLEAF_EVEN(Pjll,LeafType) COPYTOLEAF_EVEN_SUB(Pjll,LeafType) +#define COPYTOLEAF_ODD(cLevel,Pjll,Copy) COPYTOLEAF_ODD_SUB(cLevel,Pjll,Copy) + +#else // JUDYL adds copying of values: + +#define COPYTOLEAF_EVEN(Pjll,LeafType) \ + { \ + COPYTOLEAF_EVEN_SUB(Pjll,LeafType) \ + JU_COPYMEM(Pjv, PValue, pop1); \ + } + +#define COPYTOLEAF_ODD(cLevel,Pjll,Copy) \ + { \ + COPYTOLEAF_ODD_SUB( cLevel,Pjll,Copy) \ + JU_COPYMEM(Pjv, PValue, pop1); \ + } + +#endif + +// Set the JP type for an immediate index, where BaseJPType is JPIMMED_*_02: + +#define SETIMMTYPE(BaseJPType) (PjpParent->jp_Type) = (BaseJPType) + pop1 - 2 + +// Allocate and populate a Leaf1-N: +// +// Build MAKELEAF_EVEN() and MAKELEAF_ODD() using macros for common code. 
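+// Packing sketch for the odd cases (assuming a Leaf3): COPYTOLEAF_ODD
+// advances the destination by cLevel == 3 bytes per index, so pop1
+// indexes occupy exactly 3 * pop1 bytes with no padding, while JudyL
+// values remain a separate whole-word array reached through Pjv.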
+ +#define MAKELEAF_SUB1(AllocLeaf,ValueArea,LeafType) \ + ALLOCLEAF(AllocLeaf); \ + JUDYLCODE(Pjv = ValueArea(Pjll, pop1)) + + +#define MAKELEAF_SUB2(cLevel,JPType) \ +{ \ + Word_t D_cdP0; \ + assert(pop1 - 1 <= cJU_POP0MASK(cLevel)); \ + D_cdP0 = (*PIndex & cJU_DCDMASK(cLevel)) | (pop1 - 1); \ + JU_JPSETADT(PjpParent, (Word_t)PjllRaw, D_cdP0, JPType); \ +} + + +#define MAKELEAF_EVEN(cLevel,JPType,AllocLeaf,ValueArea,LeafType) \ + MAKELEAF_SUB1(AllocLeaf,ValueArea,LeafType); \ + COPYTOLEAF_EVEN(Pjll, LeafType); \ + MAKELEAF_SUB2(cLevel, JPType) + +#define MAKELEAF_ODD(cLevel,JPType,AllocLeaf,ValueArea,Copy) \ + MAKELEAF_SUB1(AllocLeaf,ValueArea,LeafType); \ + COPYTOLEAF_ODD(cLevel, Pjll, Copy); \ + MAKELEAF_SUB2(cLevel, JPType) + +// Ensure that the indexes to be stored in immediate indexes or a leaf are +// sorted: +// +// This check is pure overhead, but required in order to protect the Judy array +// against caller error, to avoid a later corruption or core dump from a +// seemingly valid Judy array. Do this check piecemeal at the leaf level while +// the indexes are already in the cache. Higher-level order-checking occurs +// while building branches. +// +// Note: Any sorting error in the expanse of a single immediate indexes JP or +// a leaf => save no indexes in that expanse. + +#define CHECKLEAFORDER \ + { \ + for (offset = 1; offset < pop1; ++offset) \ + { \ + if (PIndex[offset - 1] >= PIndex[offset]) \ + { \ + SETJPNULL_PARENT; \ + *PPop1 = 0; \ + JU_SET_ERRNO_NONNULL(Pjpm, JU_ERRNO_UNSORTED); \ + return(FALSE); \ + } \ + } \ + } + + +// ------ START OF CODE ------ + + assert( Level >= 1); + assert( Level <= cJU_ROOTSTATE); + assert((Level < cJU_ROOTSTATE) || (pop1 > cJU_LEAFW_MAXPOP1)); + + +// CHECK FOR TOP LEVEL: +// +// Special case: If at the top level (PjpParent is in the JPM), a top-level +// branch must be created, even if its a BranchL with just one JP. (The JPM +// cannot point to a leaf because the leaf would have to be a lower-level, +// higher-capacity leaf under a narrow pointer (otherwise a root-level leaf +// would suffice), and the JPMs JP cant handle a narrow pointer because the +// jp_DcdPopO field isnt big enough.) Otherwise continue to check for a pop1 +// small enough to support immediate indexes or a leaf before giving up and +// making a lower-level branch. + + if (Level == cJU_ROOTSTATE) + { + levelsub = cJU_ROOTSTATE; + goto BuildBranch2; + } + assert(Level < cJU_ROOTSTATE); + + +// SKIP JPIMMED_*_01: +// +// Immeds with pop1 == 1 should be handled in-line during branch construction. + + assert(pop1 > 1); + + +// BUILD JPIMMED_*_02+: +// +// The starting address of the indexes depends on Judy1 or JudyL; also, JudyL +// includes a pointer to a values-only leaf. + + if (pop1 <= immed_maxpop1[Level]) // note: always < root level. + { + JUDY1CODE(uint8_t * Pjll = (uint8_t *) (PjpParent->jp_1Index);) + JUDYLCODE(uint8_t * Pjll = (uint8_t *) (PjpParent->jp_LIndex);) + + CHECKLEAFORDER; // indexes to be stored are sorted. 
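+// Immediate JPs keep the index bytes inside the JP itself rather than in
+// a separately allocated leaf (sketch, assuming 64-bit JudyL): storing
+// three 1-byte indexes yields a cJU_JPIMMED_1_03 JP whose jp_LIndex holds
+// the three bytes, while the three values go to a values-only leaf that
+// jp_Addr points to, as allocated just below.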
+ +#ifdef JUDYL + if ((PjvRaw = j__udyLAllocJV(pop1, Pjpm)) == (Pjv_t) NULL) + NOMEM; + (PjpParent->jp_Addr) = (Word_t) PjvRaw; + Pjv = P_JV(PjvRaw); +#endif + + switch (Level) + { + case 1: COPYTOLEAF_EVEN(Pjll, uint8_t); + SETIMMTYPE(cJU_JPIMMED_1_02); + break; +#if (defined(JUDY1) || defined(JU_64BIT)) + case 2: COPYTOLEAF_EVEN(Pjll, uint16_t); + SETIMMTYPE(cJU_JPIMMED_2_02); + break; + case 3: COPYTOLEAF_ODD(3, Pjll, JU_COPY3_LONG_TO_PINDEX); + SETIMMTYPE(cJU_JPIMMED_3_02); + break; +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case 4: COPYTOLEAF_EVEN(Pjll, uint32_t); + SETIMMTYPE(cJ1_JPIMMED_4_02); + break; + case 5: COPYTOLEAF_ODD(5, Pjll, JU_COPY5_LONG_TO_PINDEX); + SETIMMTYPE(cJ1_JPIMMED_5_02); + break; + case 6: COPYTOLEAF_ODD(6, Pjll, JU_COPY6_LONG_TO_PINDEX); + SETIMMTYPE(cJ1_JPIMMED_6_02); + break; + case 7: COPYTOLEAF_ODD(7, Pjll, JU_COPY7_LONG_TO_PINDEX); + SETIMMTYPE(cJ1_JPIMMED_7_02); + break; +#endif + default: assert(FALSE); // should be impossible. + } + + return(TRUE); // note: no children => no *PPop1 mods. + + } // JPIMMED_*_02+ + + +// BUILD JPLEAF*: +// +// This code is a little tricky. The method is: For each level starting at +// the present Level down through levelsub = 1, and then as a special case for +// LeafB1 and FullPop (which are also at levelsub = 1 but have different +// capacity, see later), check if pop1 fits in a leaf (using leaf_maxpop1[]) +// at that level. If so, except for Level == levelsub, check if all of the +// current indexes to be stored are in the same (narrow) subexpanse, that is, +// the digits from Level to levelsub + 1, inclusive, are identical between the +// first and last index in the (sorted) list (in PIndex). If this condition is +// satisfied at any level, build a leaf at that level (under a narrow pointer +// if Level > levelsub). +// +// Note: Doing the search in this order results in storing the indexes in +// "least compressed form." + + for (levelsub = Level; levelsub >= 1; --levelsub) + { + Pjll_t PjllRaw; + Pjll_t Pjll; + +// Check if pop1 is too large to fit in a leaf at levelsub; if so, try the next +// lower level: + + if (pop1 > leaf_maxpop1[levelsub]) continue; + +// If pop1 fits in a leaf at levelsub, but levelsub is lower than Level, must +// also check whether all the indexes in the expanse to store can in fact be +// placed under a narrow pointer; if not, a leaf cannot be used, at this or any +// lower level (levelsub): + + if ((levelsub < Level) && (! SAMESUBEXP(levelsub))) + goto BuildBranch; // cant use a narrow, need a branch. + +// Ensure valid pop1 and all indexes are in fact common through Level: + + assert(pop1 <= cJU_POP0MASK(Level) + 1); + assert(! ((PIndex[0] ^ PIndex[pop1 - 1]) & cJU_DCDMASK(Level))); + + CHECKLEAFORDER; // indexes to be stored are sorted. + +// Build correct type of leaf: +// +// Note: The jp_DcdPopO and jp_Type assignments in MAKELEAF_* happen correctly +// for the levelsub (not Level) of the new leaf, even if its under a narrow +// pointer. + + switch (levelsub) + { +#if (defined(JUDYL) || (! 
defined(JU_64BIT))) + case 1: MAKELEAF_EVEN(1, cJU_JPLEAF1, j__udyAllocJLL1, + JL_LEAF1VALUEAREA, uint8_t); + break; +#endif + case 2: MAKELEAF_EVEN(2, cJU_JPLEAF2, j__udyAllocJLL2, + JL_LEAF2VALUEAREA, uint16_t); + break; + case 3: MAKELEAF_ODD( 3, cJU_JPLEAF3, j__udyAllocJLL3, + JL_LEAF3VALUEAREA, JU_COPY3_LONG_TO_PINDEX); + break; +#ifdef JU_64BIT + case 4: MAKELEAF_EVEN(4, cJU_JPLEAF4, j__udyAllocJLL4, + JL_LEAF4VALUEAREA, uint32_t); + break; + case 5: MAKELEAF_ODD( 5, cJU_JPLEAF5, j__udyAllocJLL5, + JL_LEAF5VALUEAREA, JU_COPY5_LONG_TO_PINDEX); + break; + case 6: MAKELEAF_ODD( 6, cJU_JPLEAF6, j__udyAllocJLL6, + JL_LEAF6VALUEAREA, JU_COPY6_LONG_TO_PINDEX); + break; + case 7: MAKELEAF_ODD( 7, cJU_JPLEAF7, j__udyAllocJLL7, + JL_LEAF7VALUEAREA, JU_COPY7_LONG_TO_PINDEX); + break; +#endif + default: assert(FALSE); // should be impossible. + } + + return(TRUE); // note: no children => no *PPop1 mods. + + } // JPLEAF* + + +// BUILD JPLEAF_B1 OR JPFULLPOPU1: +// +// See above about JPLEAF*. If pop1 doesnt fit in any level of linear leaf, +// it might still fit in a LeafB1 or FullPop, perhaps under a narrow pointer. + + if ((Level == 1) || SAMESUBEXP(1)) // same until last digit. + { + Pjlb_t PjlbRaw; // for bitmap leaf. + Pjlb_t Pjlb; + + assert(pop1 <= cJU_JPFULLPOPU1_POP0 + 1); + CHECKLEAFORDER; // indexes to be stored are sorted. + +#ifdef JUDY1 + +// JPFULLPOPU1: + + if (pop1 == cJU_JPFULLPOPU1_POP0 + 1) + { + Word_t Addr = PjpParent->jp_Addr; + Word_t DcdP0 = (*PIndex & cJU_DCDMASK(1)) + | cJU_JPFULLPOPU1_POP0; + JU_JPSETADT(PjpParent, Addr, DcdP0, cJ1_JPFULLPOPU1); + + return(TRUE); + } +#endif + +// JPLEAF_B1: + + if ((PjlbRaw = j__udyAllocJLB1(Pjpm)) == (Pjlb_t) NULL) + NOMEM; + Pjlb = P_JLB(PjlbRaw); + + for (offset = 0; offset < pop1; ++offset) + JU_BITMAPSETL(Pjlb, PIndex[offset]); + + retval = TRUE; // default. + +#ifdef JUDYL + +// Build subexpanse values-only leaves (LeafVs) under LeafB1: + + for (offset = 0; offset < cJU_NUMSUBEXPL; ++offset) + { + if (! (pop1sub = j__udyCountBitsL(JU_JLB_BITMAP(Pjlb, offset)))) + continue; // skip empty subexpanse. + +// Allocate one LeafV = JP subarray; if out of memory, clear bitmaps for higher +// subexpanses and adjust *PPop1: + + if ((PjvRaw = j__udyLAllocJV(pop1sub, Pjpm)) + == (Pjv_t) NULL) + { + for (/* null */; offset < cJU_NUMSUBEXPL; ++offset) + { + *PPop1 -= j__udyCountBitsL(JU_JLB_BITMAP(Pjlb, offset)); + JU_JLB_BITMAP(Pjlb, offset) = 0; + } + + retval = FALSE; + break; + } + +// Populate values-only leaf and save the pointer to it: + + Pjv = P_JV(PjvRaw); + JU_COPYMEM(Pjv, PValue, pop1sub); + JL_JLB_PVALUE(Pjlb, offset) = PjvRaw; // first-tier pointer. + PValue += pop1sub; + + } // for each subexpanse + +#endif // JUDYL + +// Attach new LeafB1 to parent JP; note use of *PPop1 possibly < pop1: + + JU_JPSETADT(PjpParent, (Word_t) PjlbRaw, + (*PIndex & cJU_DCDMASK(1)) | (*PPop1 - 1), cJU_JPLEAF_B1); + + return(retval); + + } // JPLEAF_B1 or JPFULLPOPU1 + + +// BUILD JPBRANCH_U*: +// +// Arriving at BuildBranch means Level < top level but the pop1 is too large +// for immediate indexes or a leaf, even under a narrow pointer, including a +// LeafB1 or FullPop at level 1. This implies SAMESUBEXP(1) == FALSE, that is, +// the indexes to be stored "branch" at level 2 or higher. + +BuildBranch: // come here directly if a leaf wont work. + + assert(Level >= 2); + assert(Level < cJU_ROOTSTATE); + assert(! SAMESUBEXP(1)); // sanity check, see above. 
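+// Illustration with assumed values: at Level == 3, if pop1 exceeds
+// cJU_LEAF3_MAXPOP1 and the indexes differ in their level-3 digit, no
+// lower-level leaf can sit under a narrow pointer (the expanse
+// "branches" at level 3), so control arrives here to build a branch.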
+ +// Determine the appropriate level for a new branch node; see if a narrow +// pointer can be used: +// +// This can be confusing. The branch is required at the lowest level L where +// the indexes to store are not in the same subexpanse at level L-1. Work down +// from Level to tree level 3, which is 1 above the lowest tree level = 2 at +// which a branch can be used. Theres no need to check SAMESUBEXP at level 2 +// because its known to be false at level 2-1 = 1. +// +// Note: Unlike for a leaf node, a narrow pointer is always used for a branch +// if possible, that is, maximum compression is always used, except at the top +// level of the tree, where a JPM cannot support a narrow pointer, meaning a +// top BranchL can have a single JP (fanout = 1); but that case jumps directly +// to BuildBranch2. +// +// Note: For 32-bit systems the only usable values for a narrow pointer are +// Level = 3 and levelsub = 2; 64-bit systems have many more choices; but +// hopefully this for-loop is fast enough even on a 32-bit system. +// +// TBD: If not fast enough, #ifdef JU_64BIT and handle the 32-bit case faster. + + for (levelsub = Level; levelsub >= 3; --levelsub) // see above. + if (! SAMESUBEXP(levelsub - 1)) // at limit of narrow pointer. + break; // put branch at levelsub. + +BuildBranch2: // come here directly for Level = levelsub = cJU_ROOTSTATE. + + assert(levelsub >= 2); + assert(levelsub <= Level); + +// Initially build a BranchU: +// +// Always start with a BranchU because the number of populated subexpanses is +// not yet known. Use digitmask, digitshifted, and digitshincr to avoid +// expensive variable shifts within JU_DIGITATSTATE within the loop. +// +// TBD: The use of digitmask, etc. results in more increment operations per +// loop, is there an even faster way? +// +// TBD: Would it pay to pre-count the populated JPs (subexpanses) and +// pre-compress the branch, that is, build a BranchL or BranchB immediately, +// also taking account of opportunistic uncompression rules? Probably not +// because at high levels of the tree there might be huge numbers of indexes +// (hence cache lines) to scan in the PIndex array to determine the fanout +// (number of JPs) needed. + + if ((PjbuRaw = j__udyAllocJBU(Pjpm)) == (Pjbu_t) NULL) NOMEM; + Pjbu = P_JBU(PjbuRaw); + + JPtype_null = cJU_JPNULL1 + levelsub - 2; // in new BranchU. + JU_JPSETADT(&JPnull, 0, 0, JPtype_null); + + Pjp = Pjbu->jbu_jp; // for convenience in loop. + numJPs = 0; // non-null in the BranchU. + digitmask = cJU_MASKATSTATE(levelsub); // see above. + digitshincr = 1UL << (cJU_BITSPERBYTE * (levelsub - 1)); + retval = TRUE; + +// Scan and populate JPs (subexpanses): +// +// Look for all indexes matching each digit in the BranchU (at the correct +// levelsub), and meanwhile notice any sorting error. Increment PIndex (and +// PValue) and reduce pop1 for each subexpanse handled successfully. + + for (digit = digitshifted = 0; + digit < cJU_BRANCHUNUMJPS; + ++digit, digitshifted += digitshincr, ++Pjp) + { + DBGCODE(Word_t pop1subprev;) + assert(pop1 != 0); // end of indexes is handled elsewhere. + +// Count indexes in digits subexpanse: + + for (pop1sub = 0; pop1sub < pop1; ++pop1sub) + if (digitshifted != (PIndex[pop1sub] & digitmask)) break; + +// Empty subexpanse (typical, performance path) or sorting error (rare): + + if (pop1sub == 0) + { + if (digitshifted < (PIndex[0] & digitmask)) + { SETJPNULL(Pjp); continue; } // empty subexpanse. + + assert(pop1 < *PPop1); // did save >= 1 index and decr pop1. 
+ JU_SET_ERRNO_NONNULL(Pjpm, JU_ERRNO_UNSORTED); + goto AbandonBranch; + } + +// Non-empty subexpanse: +// +// First shortcut by handling pop1sub == 1 (JPIMMED_*_01) inline locally. + + if (pop1sub == 1) // note: can be at root level. + { + Word_t Addr = 0; + JUDYLCODE(Addr = (Word_t) (*PValue++);) + JU_JPSETADT(Pjp, Addr, *PIndex, cJU_JPIMMED_1_01 + levelsub -2); + + ++numJPs; + + if (--pop1) { ++PIndex; continue; } // more indexes to store. + + ++digit; ++Pjp; // skip JP just saved. + goto ClearBranch; // save time. + } + +// Recurse to populate one digits (subexpanses) JP; if successful, skip +// indexes (and values) just stored (performance path), except when expanse is +// completely stored: + + DBGCODE(pop1subprev = pop1sub;) + + if (j__udyInsArray(Pjp, levelsub - 1, &pop1sub, (PWord_t) PIndex, +#ifdef JUDYL + (Pjv_t) PValue, +#endif + Pjpm)) + { // complete success. + ++numJPs; + assert(pop1subprev == pop1sub); + assert(pop1 >= pop1sub); + + if ((pop1 -= pop1sub) != 0) // more indexes to store: + { + PIndex += pop1sub; // skip indexes just stored. + JUDYLCODE(PValue += pop1sub;) + continue; + } + // else leave PIndex in BranchUs expanse. + +// No more indexes to store in BranchUs expanse: + + ++digit; ++Pjp; // skip JP just saved. + goto ClearBranch; // save time. + } + +// Handle any error at a lower level of recursion: +// +// In case of partial success, pop1sub != 0, but it was reduced from the value +// passed to j__udyInsArray(); skip this JP later during ClearBranch. + + assert(pop1subprev > pop1sub); // check j__udyInsArray(). + assert(pop1 > pop1sub); // check j__udyInsArray(). + + if (pop1sub) // partial success. + { ++digit; ++Pjp; ++numJPs; } // skip JP just saved. + + pop1 -= pop1sub; // deduct saved indexes if any. + +// Same-level sorting error, or any lower-level error; abandon the rest of the +// branch: +// +// Arrive here with pop1 = remaining unsaved indexes (always non-zero). Adjust +// the *PPop1 value to record and return, modify retval, and use ClearBranch to +// finish up. + +AbandonBranch: + assert(pop1 != 0); // more to store, see above. + assert(pop1 <= *PPop1); // sanity check. + + *PPop1 -= pop1; // deduct unsaved indexes. + pop1 = 0; // to avoid error later. + retval = FALSE; + +// Error (rare), or end of indexes while traversing new BranchU (performance +// path); either way, mark the remaining JPs, if any, in the BranchU as nulls +// and exit the loop: +// +// Arrive here with digit and Pjp set to the first JP to set to null. + +ClearBranch: + for (/* null */; digit < cJU_BRANCHUNUMJPS; ++digit, ++Pjp) + SETJPNULL(Pjp); + break; // saves one more compare. + + } // for each digit + + +// FINISH JPBRANCH_U*: +// +// Arrive here with a BranchU built under Pjbu, numJPs set, and either: retval +// == TRUE and *PPop1 unmodified, or else retval == FALSE, *PPop1 set to the +// actual number of indexes saved (possibly 0 for complete failure at a lower +// level upon the first call of j__udyInsArray()), and the Judy error set in +// Pjpm. Either way, PIndex points to an index within the expanse just +// handled. + + Pjbany = (Word_t) PjbuRaw; // default = use this BranchU. + JPtype = branchU_JPtype[levelsub]; + +// Check for complete failure above: + + assert((! retval) || *PPop1); // sanity check. + + if ((! retval) && (*PPop1 == 0)) // nothing stored, full failure. 
+ { + j__udyFreeJBU(PjbuRaw, Pjpm); + SETJPNULL_PARENT; + return(FALSE); + } + +// Complete or partial success so far; watch for sorting error after the +// maximum digit (255) in the BranchU, which is indicated by having more +// indexes to store in the BranchUs expanse: +// +// For example, if an index to store has a digit of 255 at levelsub, followed +// by an index with a digit of 254, the for-loop above runs out of digits +// without reducing pop1 to 0. + + if (pop1 != 0) + { + JU_SET_ERRNO_NONNULL(Pjpm, JU_ERRNO_UNSORTED); + *PPop1 -= pop1; // deduct unsaved indexes. + retval = FALSE; + } + assert(*PPop1 != 0); // branch (still) cannot be empty. + + +// OPTIONALLY COMPRESS JPBRANCH_U*: +// +// See if the BranchU should be compressed to a BranchL or BranchB; if so, do +// that and free the BranchU; otherwise just use the existing BranchU. Follow +// the same rules as in JudyIns.c (version 4.95): Only check local population +// (cJU_OPP_UNCOMP_POP0) for BranchL, and only check global memory efficiency +// (JU_OPP_UNCOMPRESS) for BranchB. TBD: Have the rules changed? +// +// Note: Because of differing order of operations, the latter compression +// might not result in the same set of branch nodes as a series of sequential +// insertions. +// +// Note: Allocating a BranchU only to sometimes convert it to a BranchL or +// BranchB is unfortunate, but attempting to work with a temporary BranchU on +// the stack and then allocate and keep it as a BranchU in many cases is worse +// in terms of error handling. + + +// COMPRESS JPBRANCH_U* TO JPBRANCH_L*: + + if (numJPs <= cJU_BRANCHLMAXJPS) // JPs fit in a BranchL. + { + Pjbl_t PjblRaw = (Pjbl_t) NULL; // new BranchL; init for cc. + Pjbl_t Pjbl; + + if ((*PPop1 > JU_BRANCHL_MAX_POP) // pop too high. + || ((PjblRaw = j__udyAllocJBL(Pjpm)) == (Pjbl_t) NULL)) + { // cant alloc BranchL. + goto SetParent; // just keep BranchU. + } + + Pjbl = P_JBL(PjblRaw); + +// Copy BranchU JPs to BranchL: + + (Pjbl->jbl_NumJPs) = numJPs; + offset = 0; + + for (digit = 0; digit < cJU_BRANCHUNUMJPS; ++digit) + { + if ((((Pjbu->jbu_jp) + digit)->jp_Type) == JPtype_null) + continue; + + (Pjbl->jbl_Expanse[offset ]) = digit; + (Pjbl->jbl_jp [offset++]) = Pjbu->jbu_jp[digit]; + } + assert(offset == numJPs); // found same number. + +// Free the BranchU and prepare to use the new BranchL instead: + + j__udyFreeJBU(PjbuRaw, Pjpm); + + Pjbany = (Word_t) PjblRaw; + JPtype = branchL_JPtype[levelsub]; + + } // compress to BranchL + + +// COMPRESS JPBRANCH_U* TO JPBRANCH_B*: +// +// If unable to allocate the BranchB or any JP subarray, free all related +// memory and just keep the BranchU. +// +// Note: This use of JU_OPP_UNCOMPRESS is a bit conservative because the +// BranchU is already allocated while the (presumably smaller) BranchB is not, +// the opposite of how its used in single-insert code. + + else + { + Pjbb_t PjbbRaw = (Pjbb_t) NULL; // new BranchB; init for cc. + Pjbb_t Pjbb; + Pjp_t Pjp2; // in BranchU. + + if ((*PPop1 > JU_BRANCHB_MAX_POP) // pop too high. + || ((PjbbRaw = j__udyAllocJBB(Pjpm)) == (Pjbb_t) NULL)) + { // cant alloc BranchB. + goto SetParent; // just keep BranchU. 
+ } + + Pjbb = P_JBB(PjbbRaw); + +// Set bits in bitmap for populated subexpanses: + + Pjp2 = Pjbu->jbu_jp; + + for (digit = 0; digit < cJU_BRANCHUNUMJPS; ++digit) + if ((((Pjbu->jbu_jp) + digit)->jp_Type) != JPtype_null) + JU_BITMAPSETB(Pjbb, digit); + +// Copy non-null JPs to BranchB JP subarrays: + + for (offset = 0; offset < cJU_NUMSUBEXPB; ++offset) + { + Pjp_t PjparrayRaw; + Pjp_t Pjparray; + + if (! (numJPs = j__udyCountBitsB(JU_JBB_BITMAP(Pjbb, offset)))) + continue; // skip empty subexpanse. + +// If unable to allocate a JP subarray, free all BranchB memory so far and +// continue to use the BranchU: + + if ((PjparrayRaw = j__udyAllocJBBJP(numJPs, Pjpm)) + == (Pjp_t) NULL) + { + while (offset-- > 0) + { + if (JU_JBB_PJP(Pjbb, offset) == (Pjp_t) NULL) continue; + + j__udyFreeJBBJP(JU_JBB_PJP(Pjbb, offset), + j__udyCountBitsB(JU_JBB_BITMAP(Pjbb, offset)), + Pjpm); + } + j__udyFreeJBB(PjbbRaw, Pjpm); + goto SetParent; // keep BranchU. + } + +// Set one JP subarray pointer and copy the subexpanses JPs to the subarray: +// +// Scan the BranchU for non-null JPs until numJPs JPs are copied. + + JU_JBB_PJP(Pjbb, offset) = PjparrayRaw; + Pjparray = P_JP(PjparrayRaw); + + while (numJPs-- > 0) + { + while ((Pjp2->jp_Type) == JPtype_null) + { + ++Pjp2; + assert(Pjp2 < (Pjbu->jbu_jp) + cJU_BRANCHUNUMJPS); + } + *Pjparray++ = *Pjp2++; + } + } // for each subexpanse + +// Free the BranchU and prepare to use the new BranchB instead: + + j__udyFreeJBU(PjbuRaw, Pjpm); + + Pjbany = (Word_t) PjbbRaw; + JPtype = branchB_JPtype[levelsub]; + + } // compress to BranchB + + +// COMPLETE OR PARTIAL SUCCESS: +// +// Attach new branch (under Pjp, with JPtype) to parent JP; note use of *PPop1, +// possibly reduced due to partial failure. + +SetParent: + (PjpParent->jp_Addr) = Pjbany; + (PjpParent->jp_Type) = JPtype; + + if (Level < cJU_ROOTSTATE) // PjpParent not in JPM: + { + Word_t DcdP0 = (*PIndex & cJU_DCDMASK(levelsub)) | (*PPop1 - 1); + + JU_JPSETADT(PjpParent ,Pjbany, DcdP0, JPtype); + } + + return(retval); + +} // j__udyInsArray() diff --git a/src/libnetdata/libjudy/src/JudyL/JudyLInsertBranch.c b/src/libnetdata/libjudy/src/JudyL/JudyLInsertBranch.c new file mode 100644 index 00000000..cfa16bd6 --- /dev/null +++ b/src/libnetdata/libjudy/src/JudyL/JudyLInsertBranch.c @@ -0,0 +1,135 @@ +// Copyright (C) 2000 - 2002 Hewlett-Packard Company +// +// This program is free software; you can redistribute it and/or modify it +// under the term of the GNU Lesser General Public License as published by the +// Free Software Foundation; either version 2 of the License, or (at your +// option) any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License +// for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this program; if not, write to the Free Software Foundation, +// Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// _________________ + +// @(#) $Revision: 4.17 $ $Source: /judy/src/JudyCommon/JudyInsertBranch.c $ + +// BranchL insertion functions for Judy1 and JudyL. +// Compile with one of -DJUDY1 or -DJUDYL. + +#if (! (defined(JUDY1) || defined(JUDYL))) +#error: One of -DJUDY1 or -DJUDYL must be specified. 
+#endif + +#ifdef JUDY1 +#include "Judy1.h" +#else +#include "JudyL.h" +#endif + +#include "JudyPrivate1L.h" + +extern int j__udyCreateBranchL(Pjp_t, Pjp_t, uint8_t *, Word_t, Pvoid_t); + + +// **************************************************************************** +// __ J U D Y I N S E R T B R A N C H +// +// Insert 2-element BranchL in between Pjp and Pjp->jp_Addr. +// +// Return -1 if out of memory, otherwise return 1. + +FUNCTION int j__udyInsertBranch( + Pjp_t Pjp, // JP containing narrow pointer. + Word_t Index, // outlier to Pjp. + Word_t BranchLevel, // of what JP points to, mapped from JP type. + Pjpm_t Pjpm) // for global accounting. +{ + jp_t JP2 [2]; + jp_t JP; + Pjp_t PjpNull; + Word_t XorExp; + Word_t Inew, Iold; + Word_t DCDMask; // initially for original BranchLevel. + int Ret; + uint8_t Exp2[2]; + uint8_t DecodeByteN, DecodeByteO; + +// Get the current mask for the DCD digits: + + DCDMask = cJU_DCDMASK(BranchLevel); + +// Obtain Dcd bits that differ between Index and JP, shifted so the +// digit for BranchLevel is the LSB: + + XorExp = ((Index ^ JU_JPDCDPOP0(Pjp)) & (cJU_ALLONES >> cJU_BITSPERBYTE)) + >> (BranchLevel * cJU_BITSPERBYTE); + assert(XorExp); // Index must be an outlier. + +// Count levels between object under narrow pointer and the level at which +// the outlier diverges from it, which is always at least initial +// BranchLevel + 1, to end up with the level (JP type) at which to insert +// the new intervening BranchL: + + do { ++BranchLevel; } while ((XorExp >>= cJU_BITSPERBYTE)); + assert((BranchLevel > 1) && (BranchLevel < cJU_ROOTSTATE)); + +// Get the MSB (highest digit) that differs between the old expanse and +// the new Index to insert: + + DecodeByteO = JU_DIGITATSTATE(JU_JPDCDPOP0(Pjp), BranchLevel); + DecodeByteN = JU_DIGITATSTATE(Index, BranchLevel); + + assert(DecodeByteO != DecodeByteN); + +// Determine sorted order for old expanse and new Index digits: + + if (DecodeByteN > DecodeByteO) { Iold = 0; Inew = 1; } + else { Iold = 1; Inew = 0; } + +// Copy old JP into staging area for new Branch + JP2 [Iold] = *Pjp; + Exp2[Iold] = DecodeByteO; + Exp2[Inew] = DecodeByteN; + +// Create a 2 Expanse Linear branch +// +// Note: Pjp->jp_Addr is set by j__udyCreateBranchL() + + Ret = j__udyCreateBranchL(Pjp, JP2, Exp2, 2, Pjpm); + if (Ret == -1) return(-1); + +// Get Pjp to the NULL of where to do insert + PjpNull = ((P_JBL(Pjp->jp_Addr))->jbl_jp) + Inew; + +// Convert to a cJU_JPIMMED_*_01 at the correct level: +// Build JP and set type below to: cJU_JPIMMED_X_01 + JU_JPSETADT(PjpNull, 0, Index, cJU_JPIMMED_1_01 - 2 + BranchLevel); + +// Return pointer to Value area in cJU_JPIMMED_X_01 + JUDYLCODE(Pjpm->jpm_PValue = (Pjv_t) PjpNull;) + +// The old JP now points to a BranchL that is at higher level. Therefore +// it contains excess DCD bits (in the least significant position) that +// must be removed (zeroed); that is, they become part of the Pop0 +// subfield. Note that the remaining (lower) bytes in the Pop0 field do +// not change. 
+//
+// Take from the old DCDMask, which went "down" to a lower BranchLevel,
+// and zero any high bits that are still in the mask at the new, higher
+// BranchLevel; then use this mask to zero the bits in jp_DcdPopO:
+
+// Set old JP to a BranchL at correct level
+
+        Pjp->jp_Type = cJU_JPBRANCH_L2 - 2 + BranchLevel;
+        DCDMask ^= cJU_DCDMASK(BranchLevel);
+        DCDMask = ~DCDMask & JU_JPDCDPOP0(Pjp);
+        JP = *Pjp;
+        JU_JPSETADT(Pjp, JP.jp_Addr, DCDMask, JP.jp_Type);
+
+        return(1);
+
+} // j__udyInsertBranch()
diff --git a/src/libnetdata/libjudy/src/JudyL/JudyLMallocIF.c b/src/libnetdata/libjudy/src/JudyL/JudyLMallocIF.c
new file mode 100644
index 00000000..9a7d02f2
--- /dev/null
+++ b/src/libnetdata/libjudy/src/JudyL/JudyLMallocIF.c
@@ -0,0 +1,782 @@
+// Copyright (C) 2000 - 2002 Hewlett-Packard Company
+//
+// This program is free software; you can redistribute it and/or modify it
+// under the term of the GNU Lesser General Public License as published by the
+// Free Software Foundation; either version 2 of the License, or (at your
+// option) any later version.
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+// for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with this program; if not, write to the Free Software Foundation,
+// Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+// _________________
+
+// @(#) $Revision: 4.45 $ $Source: /judy/src/JudyCommon/JudyMallocIF.c $
+//
+// Judy malloc/free interface functions for Judy1 and JudyL.
+//
+// Compile with one of -DJUDY1 or -DJUDYL.
+//
+// Compile with -DTRACEMI (Malloc Interface) to turn on tracing of malloc/free
+// calls at the interface level. (See also TRACEMF in lower-level code.)
+// Use -DTRACEMI2 for a terser format suitable for trace analysis.
+//
+// There can be malloc namespace bits in the LSBs of "raw" addresses from most,
+// but not all, of the j__udy*Alloc*() functions; see also JudyPrivate.h. To
+// test the Judy code, compile this file with -DMALLOCBITS and use debug flavor
+// only (for assertions). This test ensures that (a) all callers properly mask
+// the namespace bits out before dereferencing a pointer (or else a core dump
+// occurs), and (b) all callers send "raw" (unmasked) addresses to
+// j__udy*Free*() calls.
+//
+// Note: Currently -DDEBUG turns on MALLOCBITS automatically.
+
+#if (! (defined(JUDY1) || defined(JUDYL)))
+#error: One of -DJUDY1 or -DJUDYL must be specified.
+#endif
+
+#ifdef JUDY1
+#include "Judy1.h"
+#else
+#include "JudyL.h"
+#endif
+
+#include "JudyPrivate1L.h"
+
+// Set "hidden" global j__uMaxWords to the maximum number of words to allocate
+// to any one array (large enough to have a JPM, otherwise j__uMaxWords is
+// ignored), to trigger a fake malloc error when the number is exceeded. Note,
+// this code is always executed, not #ifdef'd, because it's virtually free.
+//
+// Note: To keep the MALLOC macro faster and simpler, set j__uMaxWords to
+// MAXINT, not zero, by default.
+
+Word_t j__uMaxWords = ~0UL;
+
+// This macro hides the faking of a malloc failure:
+//
+// Note: To keep this fast, just compare WordsPrev to j__uMaxWords without the
+// complexity of first adding WordsNow, meaning the trigger point is not
+// exactly where you might assume, but it shouldn't matter.
+ +#define MALLOC(MallocFunc,WordsPrev,WordsNow) \ + (((WordsPrev) > j__uMaxWords) ? 0UL : MallocFunc(WordsNow)) + +// Clear words starting at address: +// +// Note: Only use this for objects that care; in other cases, it doesnt +// matter if the objects memory is pre-zeroed. + +#define ZEROWORDS(Addr,Words) \ + { \ + Word_t Words__ = (Words); \ + PWord_t Addr__ = (PWord_t) (Addr); \ + while (Words__--) *Addr__++ = 0UL; \ + } + +#ifdef TRACEMI + +// TRACING SUPPORT: +// +// Note: For TRACEMI, use a format for address printing compatible with other +// tracing facilities; in particular, %x not %lx, to truncate the "noisy" high +// part on 64-bit systems. +// +// TBD: The trace macros need fixing for alternate address types. +// +// Note: TRACEMI2 supports trace analysis no matter the underlying malloc/free +// engine used. + +#include <stdio.h> + +static Word_t j__udyMemSequence = 0L; // event sequence number. + +#define TRACE_ALLOC5(a,b,c,d,e) (void) printf(a, (b), c, d) +#define TRACE_FREE5( a,b,c,d,e) (void) printf(a, (b), c, d) +#define TRACE_ALLOC6(a,b,c,d,e,f) (void) printf(a, (b), c, d, e) +#define TRACE_FREE6( a,b,c,d,e,f) (void) printf(a, (b), c, d, e) + +#else + +#ifdef TRACEMI2 + +#include <stdio.h> + +#define b_pw cJU_BYTESPERWORD + +#define TRACE_ALLOC5(a,b,c,d,e) \ + (void) printf("a %lx %lx %lx\n", (b), (d) * b_pw, e) +#define TRACE_FREE5( a,b,c,d,e) \ + (void) printf("f %lx %lx %lx\n", (b), (d) * b_pw, e) +#define TRACE_ALLOC6(a,b,c,d,e,f) \ + (void) printf("a %lx %lx %lx\n", (b), (e) * b_pw, f) +#define TRACE_FREE6( a,b,c,d,e,f) \ + (void) printf("f %lx %lx %lx\n", (b), (e) * b_pw, f) + +static Word_t j__udyMemSequence = 0L; // event sequence number. + +#else + +#define TRACE_ALLOC5(a,b,c,d,e) // null. +#define TRACE_FREE5( a,b,c,d,e) // null. +#define TRACE_ALLOC6(a,b,c,d,e,f) // null. +#define TRACE_FREE6( a,b,c,d,e,f) // null. + +#endif // ! TRACEMI2 +#endif // ! TRACEMI + + +// MALLOC NAMESPACE SUPPORT: + +#if (defined(DEBUG) && (! defined(MALLOCBITS))) // for now, DEBUG => MALLOCBITS: +#define MALLOCBITS 1 +#endif + +#ifdef MALLOCBITS +#define MALLOCBITS_VALUE 0x3 // bit pattern to use. +#define MALLOCBITS_MASK 0x7 // note: matches mask__ in JudyPrivate.h. + +#define MALLOCBITS_SET( Type,Addr) \ + ((Addr) = (Type) ((Word_t) (Addr) | MALLOCBITS_VALUE)) +#define MALLOCBITS_TEST(Type,Addr) \ + assert((((Word_t) (Addr)) & MALLOCBITS_MASK) == MALLOCBITS_VALUE); \ + ((Addr) = (Type) ((Word_t) (Addr) & ~MALLOCBITS_VALUE)) +#else +#define MALLOCBITS_SET( Type,Addr) // null. +#define MALLOCBITS_TEST(Type,Addr) // null. +#endif + + +// SAVE ERROR INFORMATION IN A Pjpm: +// +// "Small" (invalid) Addr values are used to distinguish overrun and no-mem +// errors. (TBD, non-zero invalid values are no longer returned from +// lower-level functions, that is, JU_ERRNO_OVERRUN is no longer detected.) + +#define J__UDYSETALLOCERROR(Addr) \ + { \ + JU_ERRID(Pjpm) = __LINE__; \ + if ((Word_t) (Addr) > 0) JU_ERRNO(Pjpm) = JU_ERRNO_OVERRUN; \ + else JU_ERRNO(Pjpm) = JU_ERRNO_NOMEM; \ + return(0); \ + } + + +// **************************************************************************** +// ALLOCATION FUNCTIONS: +// +// To help the compiler catch coding errors, each function returns a specific +// object type. +// +// Note: Only j__udyAllocJPM() and j__udyAllocJLW() return multiple values <= +// sizeof(Word_t) to indicate the type of memory allocation failure. Other +// allocation functions convert this failure to a JU_ERRNO. 
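+// Test-harness sketch of the j__uMaxWords fake-failure hook described
+// above (not library code; NINDEXES is an arbitrary assumed bound):
+//
+//      Pvoid_t PArray = (Pvoid_t) NULL;
+//      Word_t  i;
+//      extern Word_t j__uMaxWords;
+//      j__uMaxWords = 1000;                    // cap total words so that
+//      for (i = 0; i < NINDEXES; ++i)          // some insert eventually
+//          Judy1Set(&PArray, i, PJE0);         // fails with JU_ERRNO_NOMEM.
+//      j__uMaxWords = ~0UL;                    // restore the default.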
+ + +// Note: Unlike other j__udyAlloc*() functions, Pjpms are returned non-raw, +// that is, without malloc namespace or root pointer type bits: + +FUNCTION Pjpm_t j__udyAllocJPM(void) +{ + Word_t Words = (sizeof(jpm_t) + cJU_BYTESPERWORD - 1) / cJU_BYTESPERWORD; + Pjpm_t Pjpm = (Pjpm_t) MALLOC(JudyMalloc, Words, Words); + + assert((Words * cJU_BYTESPERWORD) == sizeof(jpm_t)); + + if ((Word_t) Pjpm > sizeof(Word_t)) + { + ZEROWORDS(Pjpm, Words); + Pjpm->jpm_TotalMemWords = Words; + } + + TRACE_ALLOC5("0x%x %8lu = j__udyAllocJPM(), Words = %lu\n", + Pjpm, j__udyMemSequence++, Words, cJU_LEAFW_MAXPOP1 + 1); + // MALLOCBITS_SET(Pjpm_t, Pjpm); // see above. + return(Pjpm); + +} // j__udyAllocJPM() + + +FUNCTION Pjbl_t j__udyAllocJBL(Pjpm_t Pjpm) +{ + Word_t Words = sizeof(jbl_t) / cJU_BYTESPERWORD; + Pjbl_t PjblRaw = (Pjbl_t) MALLOC(JudyMallocVirtual, + Pjpm->jpm_TotalMemWords, Words); + + assert((Words * cJU_BYTESPERWORD) == sizeof(jbl_t)); + + if ((Word_t) PjblRaw > sizeof(Word_t)) + { + ZEROWORDS(P_JBL(PjblRaw), Words); + Pjpm->jpm_TotalMemWords += Words; + } + else { J__UDYSETALLOCERROR(PjblRaw); } + + TRACE_ALLOC5("0x%x %8lu = j__udyAllocJBL(), Words = %lu\n", PjblRaw, + j__udyMemSequence++, Words, (Pjpm->jpm_Pop0) + 2); + MALLOCBITS_SET(Pjbl_t, PjblRaw); + return(PjblRaw); + +} // j__udyAllocJBL() + + +FUNCTION Pjbb_t j__udyAllocJBB(Pjpm_t Pjpm) +{ + Word_t Words = sizeof(jbb_t) / cJU_BYTESPERWORD; + Pjbb_t PjbbRaw = (Pjbb_t) MALLOC(JudyMallocVirtual, + Pjpm->jpm_TotalMemWords, Words); + + assert((Words * cJU_BYTESPERWORD) == sizeof(jbb_t)); + + if ((Word_t) PjbbRaw > sizeof(Word_t)) + { + ZEROWORDS(P_JBB(PjbbRaw), Words); + Pjpm->jpm_TotalMemWords += Words; + } + else { J__UDYSETALLOCERROR(PjbbRaw); } + + TRACE_ALLOC5("0x%x %8lu = j__udyAllocJBB(), Words = %lu\n", PjbbRaw, + j__udyMemSequence++, Words, (Pjpm->jpm_Pop0) + 2); + MALLOCBITS_SET(Pjbb_t, PjbbRaw); + return(PjbbRaw); + +} // j__udyAllocJBB() + + +FUNCTION Pjp_t j__udyAllocJBBJP(Word_t NumJPs, Pjpm_t Pjpm) +{ + Word_t Words = JU_BRANCHJP_NUMJPSTOWORDS(NumJPs); + Pjp_t PjpRaw; + + PjpRaw = (Pjp_t) MALLOC(JudyMalloc, Pjpm->jpm_TotalMemWords, Words); + + if ((Word_t) PjpRaw > sizeof(Word_t)) + { + Pjpm->jpm_TotalMemWords += Words; + } + else { J__UDYSETALLOCERROR(PjpRaw); } + + TRACE_ALLOC6("0x%x %8lu = j__udyAllocJBBJP(%lu), Words = %lu\n", PjpRaw, + j__udyMemSequence++, NumJPs, Words, (Pjpm->jpm_Pop0) + 2); + MALLOCBITS_SET(Pjp_t, PjpRaw); + return(PjpRaw); + +} // j__udyAllocJBBJP() + + +FUNCTION Pjbu_t j__udyAllocJBU(Pjpm_t Pjpm) +{ + Word_t Words = sizeof(jbu_t) / cJU_BYTESPERWORD; + Pjbu_t PjbuRaw = (Pjbu_t) MALLOC(JudyMallocVirtual, + Pjpm->jpm_TotalMemWords, Words); + + assert((Words * cJU_BYTESPERWORD) == sizeof(jbu_t)); + + if ((Word_t) PjbuRaw > sizeof(Word_t)) + { + Pjpm->jpm_TotalMemWords += Words; + } + else { J__UDYSETALLOCERROR(PjbuRaw); } + + TRACE_ALLOC5("0x%x %8lu = j__udyAllocJBU(), Words = %lu\n", PjbuRaw, + j__udyMemSequence++, Words, (Pjpm->jpm_Pop0) + 2); + MALLOCBITS_SET(Pjbu_t, PjbuRaw); + return(PjbuRaw); + +} // j__udyAllocJBU() + + +#if (defined(JUDYL) || (! 
defined(JU_64BIT))) + +FUNCTION Pjll_t j__udyAllocJLL1(Word_t Pop1, Pjpm_t Pjpm) +{ + Word_t Words = JU_LEAF1POPTOWORDS(Pop1); + Pjll_t PjllRaw; + + PjllRaw = (Pjll_t) MALLOC(JudyMalloc, Pjpm->jpm_TotalMemWords, Words); + + if ((Word_t) PjllRaw > sizeof(Word_t)) + { + Pjpm->jpm_TotalMemWords += Words; + } + else { J__UDYSETALLOCERROR(PjllRaw); } + + TRACE_ALLOC6("0x%x %8lu = j__udyAllocJLL1(%lu), Words = %lu\n", PjllRaw, + j__udyMemSequence++, Pop1, Words, (Pjpm->jpm_Pop0) + 2); + MALLOCBITS_SET(Pjll_t, PjllRaw); + return(PjllRaw); + +} // j__udyAllocJLL1() + +#endif // (JUDYL || (! JU_64BIT)) + + +FUNCTION Pjll_t j__udyAllocJLL2(Word_t Pop1, Pjpm_t Pjpm) +{ + Word_t Words = JU_LEAF2POPTOWORDS(Pop1); + Pjll_t PjllRaw; + + PjllRaw = (Pjll_t) MALLOC(JudyMalloc, Pjpm->jpm_TotalMemWords, Words); + + if ((Word_t) PjllRaw > sizeof(Word_t)) + { + Pjpm->jpm_TotalMemWords += Words; + } + else { J__UDYSETALLOCERROR(PjllRaw); } + + TRACE_ALLOC6("0x%x %8lu = j__udyAllocJLL2(%lu), Words = %lu\n", PjllRaw, + j__udyMemSequence++, Pop1, Words, (Pjpm->jpm_Pop0) + 2); + MALLOCBITS_SET(Pjll_t, PjllRaw); + return(PjllRaw); + +} // j__udyAllocJLL2() + + +FUNCTION Pjll_t j__udyAllocJLL3(Word_t Pop1, Pjpm_t Pjpm) +{ + Word_t Words = JU_LEAF3POPTOWORDS(Pop1); + Pjll_t PjllRaw; + + PjllRaw = (Pjll_t) MALLOC(JudyMalloc, Pjpm->jpm_TotalMemWords, Words); + + if ((Word_t) PjllRaw > sizeof(Word_t)) + { + Pjpm->jpm_TotalMemWords += Words; + } + else { J__UDYSETALLOCERROR(PjllRaw); } + + TRACE_ALLOC6("0x%x %8lu = j__udyAllocJLL3(%lu), Words = %lu\n", PjllRaw, + j__udyMemSequence++, Pop1, Words, (Pjpm->jpm_Pop0) + 2); + MALLOCBITS_SET(Pjll_t, PjllRaw); + return(PjllRaw); + +} // j__udyAllocJLL3() + + +#ifdef JU_64BIT + +FUNCTION Pjll_t j__udyAllocJLL4(Word_t Pop1, Pjpm_t Pjpm) +{ + Word_t Words = JU_LEAF4POPTOWORDS(Pop1); + Pjll_t PjllRaw; + + PjllRaw = (Pjll_t) MALLOC(JudyMalloc, Pjpm->jpm_TotalMemWords, Words); + + if ((Word_t) PjllRaw > sizeof(Word_t)) + { + Pjpm->jpm_TotalMemWords += Words; + } + else { J__UDYSETALLOCERROR(PjllRaw); } + + TRACE_ALLOC6("0x%x %8lu = j__udyAllocJLL4(%lu), Words = %lu\n", PjllRaw, + j__udyMemSequence++, Pop1, Words, (Pjpm->jpm_Pop0) + 2); + MALLOCBITS_SET(Pjll_t, PjllRaw); + return(PjllRaw); + +} // j__udyAllocJLL4() + + +FUNCTION Pjll_t j__udyAllocJLL5(Word_t Pop1, Pjpm_t Pjpm) +{ + Word_t Words = JU_LEAF5POPTOWORDS(Pop1); + Pjll_t PjllRaw; + + PjllRaw = (Pjll_t) MALLOC(JudyMalloc, Pjpm->jpm_TotalMemWords, Words); + + if ((Word_t) PjllRaw > sizeof(Word_t)) + { + Pjpm->jpm_TotalMemWords += Words; + } + else { J__UDYSETALLOCERROR(PjllRaw); } + + TRACE_ALLOC6("0x%x %8lu = j__udyAllocJLL5(%lu), Words = %lu\n", PjllRaw, + j__udyMemSequence++, Pop1, Words, (Pjpm->jpm_Pop0) + 2); + MALLOCBITS_SET(Pjll_t, PjllRaw); + return(PjllRaw); + +} // j__udyAllocJLL5() + + +FUNCTION Pjll_t j__udyAllocJLL6(Word_t Pop1, Pjpm_t Pjpm) +{ + Word_t Words = JU_LEAF6POPTOWORDS(Pop1); + Pjll_t PjllRaw; + + PjllRaw = (Pjll_t) MALLOC(JudyMalloc, Pjpm->jpm_TotalMemWords, Words); + + if ((Word_t) PjllRaw > sizeof(Word_t)) + { + Pjpm->jpm_TotalMemWords += Words; + } + else { J__UDYSETALLOCERROR(PjllRaw); } + + TRACE_ALLOC6("0x%x %8lu = j__udyAllocJLL6(%lu), Words = %lu\n", PjllRaw, + j__udyMemSequence++, Pop1, Words, (Pjpm->jpm_Pop0) + 2); + MALLOCBITS_SET(Pjll_t, PjllRaw); + return(PjllRaw); + +} // j__udyAllocJLL6() + + +FUNCTION Pjll_t j__udyAllocJLL7(Word_t Pop1, Pjpm_t Pjpm) +{ + Word_t Words = JU_LEAF7POPTOWORDS(Pop1); + Pjll_t PjllRaw; + + PjllRaw = (Pjll_t) MALLOC(JudyMalloc, Pjpm->jpm_TotalMemWords, Words); + 
+ if ((Word_t) PjllRaw > sizeof(Word_t)) + { + Pjpm->jpm_TotalMemWords += Words; + } + else { J__UDYSETALLOCERROR(PjllRaw); } + + TRACE_ALLOC6("0x%x %8lu = j__udyAllocJLL7(%lu), Words = %lu\n", PjllRaw, + j__udyMemSequence++, Pop1, Words, (Pjpm->jpm_Pop0) + 2); + MALLOCBITS_SET(Pjll_t, PjllRaw); + return(PjllRaw); + +} // j__udyAllocJLL7() + +#endif // JU_64BIT + + +// Note: Root-level leaf addresses are always whole words (Pjlw_t), and unlike +// other j__udyAlloc*() functions, they are returned non-raw, that is, without +// malloc namespace or root pointer type bits (the latter are added later by +// the caller): + +FUNCTION Pjlw_t j__udyAllocJLW(Word_t Pop1) +{ + Word_t Words = JU_LEAFWPOPTOWORDS(Pop1); + Pjlw_t Pjlw = (Pjlw_t) MALLOC(JudyMalloc, Words, Words); + + TRACE_ALLOC6("0x%x %8lu = j__udyAllocJLW(%lu), Words = %lu\n", Pjlw, + j__udyMemSequence++, Pop1, Words, Pop1); + // MALLOCBITS_SET(Pjlw_t, Pjlw); // see above. + return(Pjlw); + +} // j__udyAllocJLW() + + +FUNCTION Pjlb_t j__udyAllocJLB1(Pjpm_t Pjpm) +{ + Word_t Words = sizeof(jlb_t) / cJU_BYTESPERWORD; + Pjlb_t PjlbRaw; + + PjlbRaw = (Pjlb_t) MALLOC(JudyMalloc, Pjpm->jpm_TotalMemWords, Words); + + assert((Words * cJU_BYTESPERWORD) == sizeof(jlb_t)); + + if ((Word_t) PjlbRaw > sizeof(Word_t)) + { + ZEROWORDS(P_JLB(PjlbRaw), Words); + Pjpm->jpm_TotalMemWords += Words; + } + else { J__UDYSETALLOCERROR(PjlbRaw); } + + TRACE_ALLOC5("0x%x %8lu = j__udyAllocJLB1(), Words = %lu\n", PjlbRaw, + j__udyMemSequence++, Words, (Pjpm->jpm_Pop0) + 2); + MALLOCBITS_SET(Pjlb_t, PjlbRaw); + return(PjlbRaw); + +} // j__udyAllocJLB1() + + +#ifdef JUDYL + +FUNCTION Pjv_t j__udyLAllocJV(Word_t Pop1, Pjpm_t Pjpm) +{ + Word_t Words = JL_LEAFVPOPTOWORDS(Pop1); + Pjv_t PjvRaw; + + PjvRaw = (Pjv_t) MALLOC(JudyMalloc, Pjpm->jpm_TotalMemWords, Words); + + if ((Word_t) PjvRaw > sizeof(Word_t)) + { + Pjpm->jpm_TotalMemWords += Words; + } + else { J__UDYSETALLOCERROR(PjvRaw); } + + TRACE_ALLOC6("0x%x %8lu = j__udyLAllocJV(%lu), Words = %lu\n", PjvRaw, + j__udyMemSequence++, Pop1, Words, (Pjpm->jpm_Pop0) + 2); + MALLOCBITS_SET(Pjv_t, PjvRaw); + return(PjvRaw); + +} // j__udyLAllocJV() + +#endif // JUDYL + + +// **************************************************************************** +// FREE FUNCTIONS: +// +// To help the compiler catch coding errors, each function takes a specific +// object type to free. + + +// Note: j__udyFreeJPM() receives a root pointer with NO root pointer type +// bits present, that is, they must be stripped by the caller using P_JPM(): + +FUNCTION void j__udyFreeJPM(Pjpm_t PjpmFree, Pjpm_t PjpmStats) +{ + Word_t Words = (sizeof(jpm_t) + cJU_BYTESPERWORD - 1) / cJU_BYTESPERWORD; + + // MALLOCBITS_TEST(Pjpm_t, PjpmFree); // see above. + JudyFree((Pvoid_t) PjpmFree, Words); + + if (PjpmStats != (Pjpm_t) NULL) PjpmStats->jpm_TotalMemWords -= Words; + +// Note: Log PjpmFree->jpm_Pop0, similar to other j__udyFree*() functions, not +// an assumed value of cJU_LEAFW_MAXPOP1, for when the caller is +// Judy*FreeArray(), jpm_Pop0 is set to 0, and the population after the free +// really will be 0, not cJU_LEAFW_MAXPOP1. 
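+// (Accounting sketch: each j__udyAlloc*() above adds its word count to
+// jpm_TotalMemWords and each j__udyFree*() below subtracts it, which is
+// presumably how Judy1MemUsed()/JudyLMemUsed() can report the total
+// bytes behind a JPM-based array.)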
+ + TRACE_FREE6("0x%x %8lu = j__udyFreeJPM(%lu), Words = %lu\n", PjpmFree, + j__udyMemSequence++, Words, Words, PjpmFree->jpm_Pop0); + + +} // j__udyFreeJPM() + + +FUNCTION void j__udyFreeJBL(Pjbl_t Pjbl, Pjpm_t Pjpm) +{ + Word_t Words = sizeof(jbl_t) / cJU_BYTESPERWORD; + + MALLOCBITS_TEST(Pjbl_t, Pjbl); + JudyFreeVirtual((Pvoid_t) Pjbl, Words); + + Pjpm->jpm_TotalMemWords -= Words; + + TRACE_FREE5("0x%x %8lu = j__udyFreeJBL(), Words = %lu\n", Pjbl, + j__udyMemSequence++, Words, Pjpm->jpm_Pop0); + + +} // j__udyFreeJBL() + + +FUNCTION void j__udyFreeJBB(Pjbb_t Pjbb, Pjpm_t Pjpm) +{ + Word_t Words = sizeof(jbb_t) / cJU_BYTESPERWORD; + + MALLOCBITS_TEST(Pjbb_t, Pjbb); + JudyFreeVirtual((Pvoid_t) Pjbb, Words); + + Pjpm->jpm_TotalMemWords -= Words; + + TRACE_FREE5("0x%x %8lu = j__udyFreeJBB(), Words = %lu\n", Pjbb, + j__udyMemSequence++, Words, Pjpm->jpm_Pop0); + + +} // j__udyFreeJBB() + + +FUNCTION void j__udyFreeJBBJP(Pjp_t Pjp, Word_t NumJPs, Pjpm_t Pjpm) +{ + Word_t Words = JU_BRANCHJP_NUMJPSTOWORDS(NumJPs); + + MALLOCBITS_TEST(Pjp_t, Pjp); + JudyFree((Pvoid_t) Pjp, Words); + + Pjpm->jpm_TotalMemWords -= Words; + + TRACE_FREE6("0x%x %8lu = j__udyFreeJBBJP(%lu), Words = %lu\n", Pjp, + j__udyMemSequence++, NumJPs, Words, Pjpm->jpm_Pop0); + + +} // j__udyFreeJBBJP() + + +FUNCTION void j__udyFreeJBU(Pjbu_t Pjbu, Pjpm_t Pjpm) +{ + Word_t Words = sizeof(jbu_t) / cJU_BYTESPERWORD; + + MALLOCBITS_TEST(Pjbu_t, Pjbu); + JudyFreeVirtual((Pvoid_t) Pjbu, Words); + + Pjpm->jpm_TotalMemWords -= Words; + + TRACE_FREE5("0x%x %8lu = j__udyFreeJBU(), Words = %lu\n", Pjbu, + j__udyMemSequence++, Words, Pjpm->jpm_Pop0); + + +} // j__udyFreeJBU() + + +#if (defined(JUDYL) || (! defined(JU_64BIT))) + +FUNCTION void j__udyFreeJLL1(Pjll_t Pjll, Word_t Pop1, Pjpm_t Pjpm) +{ + Word_t Words = JU_LEAF1POPTOWORDS(Pop1); + + MALLOCBITS_TEST(Pjll_t, Pjll); + JudyFree((Pvoid_t) Pjll, Words); + + Pjpm->jpm_TotalMemWords -= Words; + + TRACE_FREE6("0x%x %8lu = j__udyFreeJLL1(%lu), Words = %lu\n", Pjll, + j__udyMemSequence++, Pop1, Words, Pjpm->jpm_Pop0); + + +} // j__udyFreeJLL1() + +#endif // (JUDYL || (! 
JU_64BIT)) + + +FUNCTION void j__udyFreeJLL2(Pjll_t Pjll, Word_t Pop1, Pjpm_t Pjpm) +{ + Word_t Words = JU_LEAF2POPTOWORDS(Pop1); + + MALLOCBITS_TEST(Pjll_t, Pjll); + JudyFree((Pvoid_t) Pjll, Words); + + Pjpm->jpm_TotalMemWords -= Words; + + TRACE_FREE6("0x%x %8lu = j__udyFreeJLL2(%lu), Words = %lu\n", Pjll, + j__udyMemSequence++, Pop1, Words, Pjpm->jpm_Pop0); + + +} // j__udyFreeJLL2() + + +FUNCTION void j__udyFreeJLL3(Pjll_t Pjll, Word_t Pop1, Pjpm_t Pjpm) +{ + Word_t Words = JU_LEAF3POPTOWORDS(Pop1); + + MALLOCBITS_TEST(Pjll_t, Pjll); + JudyFree((Pvoid_t) Pjll, Words); + + Pjpm->jpm_TotalMemWords -= Words; + + TRACE_FREE6("0x%x %8lu = j__udyFreeJLL3(%lu), Words = %lu\n", Pjll, + j__udyMemSequence++, Pop1, Words, Pjpm->jpm_Pop0); + + +} // j__udyFreeJLL3() + + +#ifdef JU_64BIT + +FUNCTION void j__udyFreeJLL4(Pjll_t Pjll, Word_t Pop1, Pjpm_t Pjpm) +{ + Word_t Words = JU_LEAF4POPTOWORDS(Pop1); + + MALLOCBITS_TEST(Pjll_t, Pjll); + JudyFree((Pvoid_t) Pjll, Words); + + Pjpm->jpm_TotalMemWords -= Words; + + TRACE_FREE6("0x%x %8lu = j__udyFreeJLL4(%lu), Words = %lu\n", Pjll, + j__udyMemSequence++, Pop1, Words, Pjpm->jpm_Pop0); + + +} // j__udyFreeJLL4() + + +FUNCTION void j__udyFreeJLL5(Pjll_t Pjll, Word_t Pop1, Pjpm_t Pjpm) +{ + Word_t Words = JU_LEAF5POPTOWORDS(Pop1); + + MALLOCBITS_TEST(Pjll_t, Pjll); + JudyFree((Pvoid_t) Pjll, Words); + + Pjpm->jpm_TotalMemWords -= Words; + + TRACE_FREE6("0x%x %8lu = j__udyFreeJLL5(%lu), Words = %lu\n", Pjll, + j__udyMemSequence++, Pop1, Words, Pjpm->jpm_Pop0); + + +} // j__udyFreeJLL5() + + +FUNCTION void j__udyFreeJLL6(Pjll_t Pjll, Word_t Pop1, Pjpm_t Pjpm) +{ + Word_t Words = JU_LEAF6POPTOWORDS(Pop1); + + MALLOCBITS_TEST(Pjll_t, Pjll); + JudyFree((Pvoid_t) Pjll, Words); + + Pjpm->jpm_TotalMemWords -= Words; + + TRACE_FREE6("0x%x %8lu = j__udyFreeJLL6(%lu), Words = %lu\n", Pjll, + j__udyMemSequence++, Pop1, Words, Pjpm->jpm_Pop0); + + +} // j__udyFreeJLL6() + + +FUNCTION void j__udyFreeJLL7(Pjll_t Pjll, Word_t Pop1, Pjpm_t Pjpm) +{ + Word_t Words = JU_LEAF7POPTOWORDS(Pop1); + + MALLOCBITS_TEST(Pjll_t, Pjll); + JudyFree((Pvoid_t) Pjll, Words); + + Pjpm->jpm_TotalMemWords -= Words; + + TRACE_FREE6("0x%x %8lu = j__udyFreeJLL7(%lu), Words = %lu\n", Pjll, + j__udyMemSequence++, Pop1, Words, Pjpm->jpm_Pop0); + + +} // j__udyFreeJLL7() + +#endif // JU_64BIT + + +// Note: j__udyFreeJLW() receives a root pointer with NO root pointer type +// bits present, that is, they are stripped by P_JLW(): + +FUNCTION void j__udyFreeJLW(Pjlw_t Pjlw, Word_t Pop1, Pjpm_t Pjpm) +{ + Word_t Words = JU_LEAFWPOPTOWORDS(Pop1); + + // MALLOCBITS_TEST(Pjlw_t, Pjlw); // see above. 
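+ // (Illustrative note, not from the upstream sources: for JudyL,
+ // JU_LEAFWPOPTOWORDS(Pop1) covers roughly one Pop0 word plus one Index
+ // word and one value word per element, matching the ((Words * 2) + 1)
+ // root-leaf accounting in JudyLMemActive.c below.)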
+ JudyFree((Pvoid_t) Pjlw, Words); + + if (Pjpm) Pjpm->jpm_TotalMemWords -= Words; + + TRACE_FREE6("0x%x %8lu = j__udyFreeJLW(%lu), Words = %lu\n", Pjlw, + j__udyMemSequence++, Pop1, Words, Pop1 - 1); + + +} // j__udyFreeJLW() + + +FUNCTION void j__udyFreeJLB1(Pjlb_t Pjlb, Pjpm_t Pjpm) +{ + Word_t Words = sizeof(jlb_t) / cJU_BYTESPERWORD; + + MALLOCBITS_TEST(Pjlb_t, Pjlb); + JudyFree((Pvoid_t) Pjlb, Words); + + Pjpm->jpm_TotalMemWords -= Words; + + TRACE_FREE5("0x%x %8lu = j__udyFreeJLB1(), Words = %lu\n", Pjlb, + j__udyMemSequence++, Words, Pjpm->jpm_Pop0); + + +} // j__udyFreeJLB1() + + +#ifdef JUDYL + +FUNCTION void j__udyLFreeJV(Pjv_t Pjv, Word_t Pop1, Pjpm_t Pjpm) +{ + Word_t Words = JL_LEAFVPOPTOWORDS(Pop1); + + MALLOCBITS_TEST(Pjv_t, Pjv); + JudyFree((Pvoid_t) Pjv, Words); + + Pjpm->jpm_TotalMemWords -= Words; + + TRACE_FREE6("0x%x %8lu = j__udyLFreeJV(%lu), Words = %lu\n", Pjv, + j__udyMemSequence++, Pop1, Words, Pjpm->jpm_Pop0); + + +} // j__udyLFreeJV() + +#endif // JUDYL diff --git a/src/libnetdata/libjudy/src/JudyL/JudyLMemActive.c b/src/libnetdata/libjudy/src/JudyL/JudyLMemActive.c new file mode 100644 index 00000000..fb58d0e2 --- /dev/null +++ b/src/libnetdata/libjudy/src/JudyL/JudyLMemActive.c @@ -0,0 +1,259 @@ +// Copyright (C) 2000 - 2002 Hewlett-Packard Company +// +// This program is free software; you can redistribute it and/or modify it +// under the term of the GNU Lesser General Public License as published by the +// Free Software Foundation; either version 2 of the License, or (at your +// option) any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License +// for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this program; if not, write to the Free Software Foundation, +// Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// _________________ + +// @(#) $Revision: 4.7 $ $Source: /judy/src/JudyCommon/JudyMemActive.c $ +// +// Return number of bytes of memory used to support a Judy1/L array. +// Compile with one of -DJUDY1 or -DJUDYL. + +#if (! (defined(JUDY1) || defined(JUDYL))) +#error: One of -DJUDY1 or -DJUDYL must be specified. +#endif + +#ifdef JUDY1 +#include "Judy1.h" +#else +#include "JudyL.h" +#endif + +#include "JudyPrivate1L.h" + +FUNCTION static Word_t j__udyGetMemActive(Pjp_t); + + +// **************************************************************************** +// J U D Y 1 M E M A C T I V E +// J U D Y L M E M A C T I V E + +#ifdef JUDY1 +FUNCTION Word_t Judy1MemActive +#else +FUNCTION Word_t JudyLMemActive +#endif + ( + Pcvoid_t PArray // from which to retrieve. + ) +{ + if (PArray == (Pcvoid_t)NULL) return(0); + + if (JU_LEAFW_POP0(PArray) < cJU_LEAFW_MAXPOP1) // must be a LEAFW + { + Pjlw_t Pjlw = P_JLW(PArray); // first word of leaf. + Word_t Words = Pjlw[0] + 1; // population. +#ifdef JUDY1 + return((Words + 1) * sizeof(Word_t)); +#else + return(((Words * 2) + 1) * sizeof(Word_t)); +#endif + } + else + { + Pjpm_t Pjpm = P_JPM(PArray); + return(j__udyGetMemActive(&Pjpm->jpm_JP) + sizeof(jpm_t)); + } + +} // JudyMemActive() + + +// **************************************************************************** +// __ J U D Y G E T M E M A C T I V E + +FUNCTION static Word_t j__udyGetMemActive( + Pjp_t Pjp) // top of subtree. +{ + Word_t offset; // in a branch. 
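+
+// (Illustrative example, not from the upstream sources: for a JudyL
+// root-level leaf holding Pop1 = 3 Indexes, JudyLMemActive() above returns
+//
+//     ((3 * 2) + 1) * sizeof(Word_t)   // 3 Indexes + 3 values + Pop0 word
+//
+// whereas deeper trees are measured by recursing through this function.)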
+ Word_t Bytes = 0; // actual bytes used at this level. + Word_t IdxSz; // bytes per index in leaves + + switch (JU_JPTYPE(Pjp)) + { + + case cJU_JPBRANCH_L2: + case cJU_JPBRANCH_L3: +#ifdef JU_64BIT + case cJU_JPBRANCH_L4: + case cJU_JPBRANCH_L5: + case cJU_JPBRANCH_L6: + case cJU_JPBRANCH_L7: +#endif + case cJU_JPBRANCH_L: + { + Pjbl_t Pjbl = P_JBL(Pjp->jp_Addr); + + for (offset = 0; offset < (Pjbl->jbl_NumJPs); ++offset) + Bytes += j__udyGetMemActive((Pjbl->jbl_jp) + offset); + + return(Bytes + sizeof(jbl_t)); + } + + case cJU_JPBRANCH_B2: + case cJU_JPBRANCH_B3: +#ifdef JU_64BIT + case cJU_JPBRANCH_B4: + case cJU_JPBRANCH_B5: + case cJU_JPBRANCH_B6: + case cJU_JPBRANCH_B7: +#endif + case cJU_JPBRANCH_B: + { + Word_t subexp; + Word_t jpcount; + Pjbb_t Pjbb = P_JBB(Pjp->jp_Addr); + + for (subexp = 0; subexp < cJU_NUMSUBEXPB; ++subexp) + { + jpcount = j__udyCountBitsB(JU_JBB_BITMAP(Pjbb, subexp)); + Bytes += jpcount * sizeof(jp_t); + + for (offset = 0; offset < jpcount; ++offset) + { + Bytes += j__udyGetMemActive(P_JP(JU_JBB_PJP(Pjbb, subexp)) + + offset); + } + } + + return(Bytes + sizeof(jbb_t)); + } + + case cJU_JPBRANCH_U2: + case cJU_JPBRANCH_U3: +#ifdef JU_64BIT + case cJU_JPBRANCH_U4: + case cJU_JPBRANCH_U5: + case cJU_JPBRANCH_U6: + case cJU_JPBRANCH_U7: +#endif + case cJU_JPBRANCH_U: + { + Pjbu_t Pjbu = P_JBU(Pjp->jp_Addr); + + for (offset = 0; offset < cJU_BRANCHUNUMJPS; ++offset) + { + if (((Pjbu->jbu_jp[offset].jp_Type) >= cJU_JPNULL1) + && ((Pjbu->jbu_jp[offset].jp_Type) <= cJU_JPNULLMAX)) + { + continue; // skip null JP to save time. + } + + Bytes += j__udyGetMemActive(Pjbu->jbu_jp + offset); + } + + return(Bytes + sizeof(jbu_t)); + } + + +// -- Cases below here terminate and do not recurse. -- + +#if (defined(JUDYL) || (! defined(JU_64BIT))) + case cJU_JPLEAF1: IdxSz = 1; goto LeafWords; +#endif + case cJU_JPLEAF2: IdxSz = 2; goto LeafWords; + case cJU_JPLEAF3: IdxSz = 3; goto LeafWords; +#ifdef JU_64BIT + case cJU_JPLEAF4: IdxSz = 4; goto LeafWords; + case cJU_JPLEAF5: IdxSz = 5; goto LeafWords; + case cJU_JPLEAF6: IdxSz = 6; goto LeafWords; + case cJU_JPLEAF7: IdxSz = 7; goto LeafWords; +#endif +LeafWords: + +#ifdef JUDY1 + return(IdxSz * (JU_JPLEAF_POP0(Pjp) + 1)); +#else + return((IdxSz + sizeof(Word_t)) + * (JU_JPLEAF_POP0(Pjp) + 1)); +#endif + case cJU_JPLEAF_B1: + { +#ifdef JUDY1 + return(sizeof(jlb_t)); +#else + Bytes = (JU_JPLEAF_POP0(Pjp) + 1) * sizeof(Word_t); + + return(Bytes + sizeof(jlb_t)); +#endif + } + + JUDY1CODE(case cJ1_JPFULLPOPU1: return(0);) + +#ifdef JUDY1 +#define J__Mpy 0 +#else +#define J__Mpy sizeof(Word_t) +#endif + + case cJU_JPIMMED_1_01: return(0); + case cJU_JPIMMED_2_01: return(0); + case cJU_JPIMMED_3_01: return(0); +#ifdef JU_64BIT + case cJU_JPIMMED_4_01: return(0); + case cJU_JPIMMED_5_01: return(0); + case cJU_JPIMMED_6_01: return(0); + case cJU_JPIMMED_7_01: return(0); +#endif + + case cJU_JPIMMED_1_02: return(J__Mpy * 2); + case cJU_JPIMMED_1_03: return(J__Mpy * 3); +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_1_04: return(J__Mpy * 4); + case cJU_JPIMMED_1_05: return(J__Mpy * 5); + case cJU_JPIMMED_1_06: return(J__Mpy * 6); + case cJU_JPIMMED_1_07: return(J__Mpy * 7); +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_1_08: return(0); + case cJ1_JPIMMED_1_09: return(0); + case cJ1_JPIMMED_1_10: return(0); + case cJ1_JPIMMED_1_11: return(0); + case cJ1_JPIMMED_1_12: return(0); + case cJ1_JPIMMED_1_13: return(0); + case cJ1_JPIMMED_1_14: return(0); + case cJ1_JPIMMED_1_15: return(0); +#endif + +#if 
(defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_2_02: return(J__Mpy * 2); + case cJU_JPIMMED_2_03: return(J__Mpy * 3); +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_2_04: return(0); + case cJ1_JPIMMED_2_05: return(0); + case cJ1_JPIMMED_2_06: return(0); + case cJ1_JPIMMED_2_07: return(0); +#endif + +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_3_02: return(J__Mpy * 2); +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_3_03: return(0); + case cJ1_JPIMMED_3_04: return(0); + case cJ1_JPIMMED_3_05: return(0); + + case cJ1_JPIMMED_4_02: return(0); + case cJ1_JPIMMED_4_03: return(0); + case cJ1_JPIMMED_5_02: return(0); + case cJ1_JPIMMED_5_03: return(0); + case cJ1_JPIMMED_6_02: return(0); + case cJ1_JPIMMED_7_02: return(0); +#endif + + } // switch (JU_JPTYPE(Pjp)) + + return(0); // to make some compilers happy. + +} // j__udyGetMemActive() diff --git a/src/libnetdata/libjudy/src/JudyL/JudyLMemUsed.c b/src/libnetdata/libjudy/src/JudyL/JudyLMemUsed.c new file mode 100644 index 00000000..81e3a79c --- /dev/null +++ b/src/libnetdata/libjudy/src/JudyL/JudyLMemUsed.c @@ -0,0 +1,61 @@ +// Copyright (C) 2000 - 2002 Hewlett-Packard Company +// +// This program is free software; you can redistribute it and/or modify it +// under the term of the GNU Lesser General Public License as published by the +// Free Software Foundation; either version 2 of the License, or (at your +// option) any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License +// for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this program; if not, write to the Free Software Foundation, +// Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// _________________ + +// @(#) $Revision: 4.5 $ $Source: /judy/src/JudyCommon/JudyMemUsed.c $ +// +// Return number of bytes of memory used to support a Judy1/L array. +// Compile with one of -DJUDY1 or -DJUDYL. + +#if (! (defined(JUDY1) || defined(JUDYL))) +#error: One of -DJUDY1 or -DJUDYL must be specified. +#endif + +#ifdef JUDY1 +#include "Judy1.h" +#else +#include "JudyL.h" +#endif + +#include "JudyPrivate1L.h" + +#ifdef JUDY1 +FUNCTION Word_t Judy1MemUsed +#else // JUDYL +FUNCTION Word_t JudyLMemUsed +#endif + ( + Pcvoid_t PArray // from which to retrieve. + ) +{ + Word_t Words = 0; + + if (PArray == (Pcvoid_t) NULL) return(0); + + if (JU_LEAFW_POP0(PArray) < cJU_LEAFW_MAXPOP1) // must be a LEAFW + { + Pjlw_t Pjlw = P_JLW(PArray); // first word of leaf. + Words = JU_LEAFWPOPTOWORDS(Pjlw[0] + 1); // based on pop1. + } + else + { + Pjpm_t Pjpm = P_JPM(PArray); + Words = Pjpm->jpm_TotalMemWords; + } + + return(Words * sizeof(Word_t)); // convert to bytes. + +} // Judy1MemUsed() / JudyLMemUsed() diff --git a/src/libnetdata/libjudy/src/JudyL/JudyLNext.c b/src/libnetdata/libjudy/src/JudyL/JudyLNext.c new file mode 100644 index 00000000..4bcdccf1 --- /dev/null +++ b/src/libnetdata/libjudy/src/JudyL/JudyLNext.c @@ -0,0 +1,1890 @@ +// Copyright (C) 2000 - 2002 Hewlett-Packard Company +// +// This program is free software; you can redistribute it and/or modify it +// under the term of the GNU Lesser General Public License as published by the +// Free Software Foundation; either version 2 of the License, or (at your +// option) any later version. 
+//
+// This program is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+// for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with this program; if not, write to the Free Software Foundation,
+// Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+// _________________
+
+// @(#) $Revision: 4.54 $ $Source: /judy/src/JudyCommon/JudyPrevNext.c $
+//
+// Judy*Prev() and Judy*Next() functions for Judy1 and JudyL.
+// Compile with one of -DJUDY1 or -DJUDYL.
+//
+// Compile with -DJUDYNEXT for the Judy*Next() function; otherwise defaults to
+// Judy*Prev().
+
+#if (! (defined(JUDY1) || defined(JUDYL)))
+#error: One of -DJUDY1 or -DJUDYL must be specified.
+#endif
+
+#ifndef JUDYNEXT
+#ifndef JUDYPREV
+#define JUDYPREV 1 // neither set => use default.
+#endif
+#endif
+
+#ifdef JUDY1
+#include "Judy1.h"
+#else
+#include "JudyL.h"
+#endif
+
+#include "JudyPrivate1L.h"
+
+
+// ****************************************************************************
+// J U D Y 1 P R E V
+// J U D Y 1 N E X T
+// J U D Y L P R E V
+// J U D Y L N E X T
+//
+// See the manual entry for the API.
+//
+// OVERVIEW OF Judy*Prev():
+//
+// Use a reentrant switch statement (state machine, SM1 = "get") to decode the
+// caller's *PIndex-1, starting with the root pointer (PArray), through
+// branches, if any, down to an immediate or a leaf. Look for *PIndex-1 in
+// that leaf, and if found, return it.
+//
+// A dead end is either a branch that does not contain a JP for the appropriate
+// digit in *PIndex-1, or a leaf that does not contain the undecoded digits of
+// *PIndex-1. Upon reaching a dead end, backtrack through the leaf/branches
+// that were just traversed, using a list (history) of parent JPs that is built
+// while going forward in SM1Get. Start with the current leaf or branch. In a
+// backtracked leaf, look for an Index less than *PIndex-1. In each
+// backtracked branch, look "sideways" for the next JP, if any, lower than the
+// one for the digit (from *PIndex-1) that was previously decoded. While
+// backtracking, if a leaf has no previous Index or a branch has no lower JP,
+// go to its parent branch in turn. Upon reaching the JRP, return failure, "no
+// previous Index". The backtrack process is sufficiently different from
+// SM1Get to merit its own separate reentrant switch statement (SM2 =
+// "backtrack").
+//
+// While backtracking, upon finding a lower JP in a branch, there is certain to
+// be a "prev" Index under that JP (unless the Judy array is corrupt).
+// Traverse forward again, this time taking the last (highest, right-most) JP
+// in each branch, and the last (highest) Index upon reaching an immediate or a
+// leaf. This traversal is sufficiently different from SM1Get and SM2Backtrack
+// to merit its own separate reentrant switch statement (SM3 = "findlimit").
+//
+// "Decode" bytes in JPs complicate this process a little. In SM1Get, when a
+// JP is a narrow pointer, that is, when states are skipped (so the skipped
+// digits are stored in jp_DcdPopO), compare the relevant digits to the same
+// digits in *PIndex-1. If they are EQUAL, proceed in SM1Get as before. If
+// jp_DcdPopO's digits are GREATER, treat the JP as a dead end and proceed in
+// SM2Backtrack. If jp_DcdPopO's digits are LESS, treat the JP as if it had
+// just been found during a backtrack and proceed directly in SM3Findlimit.
+//
+// Note that Decode bytes can be ignored in SM3Findlimit; they don't matter.
+// Also note that in practice the Decode bytes are routinely compared with
+// *PIndex-1 because that's simpler and no slower than first testing for
+// narrowness.
+//
+// Decode bytes also make it unnecessary to construct the Index to return (the
+// revised *PIndex) during the search. This step is deferred until finding an
+// Index during backtrack or findlimit, before returning it. The first digit
+// of *PIndex is derived (saved) based on which JP is used in a JRP branch.
+// The remaining digits are obtained from the jp_DcdPopO field in the JP (if
+// any) above the immediate or leaf containing the found (prev) Index, plus the
+// remaining digit(s) in the immediate or leaf itself. In the case of a LEAFW,
+// the Index to return is found directly in the leaf.
+//
+// Note: Theoretically, as described above, upon reaching a dead end, SM1Get
+// passes control to SM2Backtrack to look sideways, even in a leaf. Actually
+// it's a little more efficient for the SM1Get leaf cases to shortcut this and
+// take care of the sideways searches themselves. Hence the history list only
+// contains branch JPs, and SM2Backtrack only handles branches. In fact, even
+// the branch handling cases in SM1Get do some shortcutting (sideways
+// searching) to avoid pushing history and calling SM2Backtrack unnecessarily.
+//
+// Upon reaching an Index to return after backtracking, *PIndex must be
+// modified to the found Index. In principle this could be done by building
+// the Index from a saved rootdigit (in the top branch) plus the Dcd bytes from
+// the parent JP plus the appropriate Index bytes from the leaf. However,
+// Immediates are difficult because their parent JPs lack one (last) digit. So
+// instead just build the *PIndex to return "top down" while backtracking and
+// findlimiting.
+//
+// This function is written iteratively for speed, rather than recursively.
+//
+// CAVEATS:
+//
+// Why use a backtrack list (history stack), since it has finite size? The
+// size is small for Judy on both 32-bit and 64-bit systems, and a list (really
+// just an array) is fast to maintain and use. Other alternatives include
+// doing a lookahead (lookaside) in each branch while traversing forward
+// (decoding), and restarting from the top upon a dead end.
+//
+// A lookahead means noting the last branch traversed which contained a
+// non-null JP lower than the one specified by a digit in *PIndex-1, and
+// returning to that point for SM3Findlimit. This seems like a good idea, and
+// should be pretty cheap for linear and bitmap branches, but it could result
+// in up to 31 unnecessary additional cache line fills (in extreme cases) for
+// every uncompressed branch traversed. We have considered means of attaching
+// to or hiding within an uncompressed branch (in null JPs) a "cache line map"
+// or other structure, such as an offset to the next non-null JP, that would
+// speed this up, but it seems unnecessary merely to avoid having a
+// finite-length list (array). (If JudySL is ever made "native", the finite
+// list length will be an issue.)
+//
+// Restarting at the top of the Judy array after a dead end requires a careful
+// modification of *PIndex-1 to decrement the digit for the parent branch and
+// set the remaining lower digits to all 1s. This must be repeated each time a
+// parent branch contains another dead end, so even though it should all happen
+// in cache, the CPU time can be excessive. (For JudySL or an equivalent
+// "infinitely deep" Judy array, consider a hybrid of a large, finite,
+// "circular" list and a restart-at-top when the list is backtracked to
+// exhaustion.)
+//
+// Why search for *PIndex-1 instead of *PIndex during SM1Get? In rare
+// instances this prevents an unnecessary decode down the wrong path followed
+// by a backtrack; it's pretty cheap to set up initially; and it means the
+// SM1Get machine can simply return if/when it finds that Index.
+//
+// TBD: We'd like to enhance this function to make successive searches faster.
+// This would require saving some previous state, including the previous Index
+// returned, and in which leaf it was found. If the next call is for the same
+// Index and the array has not been modified, start at the same leaf. This
+// should be much easier to implement since this is iterative rather than
+// recursive code.
+//
+// VARIATIONS FOR Judy*Next():
+//
+// The Judy*Next() code is nearly a perfect mirror of the Judy*Prev() code.
+// See the Judy*Prev() overview comments, and mentally switch the following:
+//
+// - "*PIndex-1" => "*PIndex+1"
+// - "less than" => "greater than"
+// - "lower" => "higher"
+// - "lowest" => "highest"
+// - "next-left" => "next-right"
+// - "right-most" => "left-most"
+//
+// Note: SM3Findlimit could be called SM3Findmax/SM3Findmin, but a common name
+// for both Prev and Next means many fewer ifdefs in this code.
+//
+// TBD: Currently this code traverses a JP whether its expanse is partially or
+// completely full (populated). For Judy1 (only), since there is no value area
+// needed, consider shortcutting to a "success" return upon encountering a full
+// JP in SM1Get (or even SM3Findlimit?) A full JP looks like this:
+//
+// (((JU_JPDCDPOP0(Pjp) ^ cJU_ALLONES) & cJU_POP0MASK(cLevel)) == 0)
+
+#ifdef JUDY1
+#ifdef JUDYPREV
+FUNCTION int Judy1Prev
+#else
+FUNCTION int Judy1Next
+#endif
+#else
+#ifdef JUDYPREV
+FUNCTION PPvoid_t JudyLPrev
+#else
+FUNCTION PPvoid_t JudyLNext
+#endif
+#endif
+ (
+ Pcvoid_t PArray, // Judy array to search.
+ Word_t * PIndex, // starting point and result.
+ PJError_t PJError // optional, for returning error info.
+ )
+{
+ Pjp_t Pjp, Pjp2; // current JPs.
+ Pjbl_t Pjbl; // Pjp->jp_Addr masked and cast to types:
+ Pjbb_t Pjbb;
+ Pjbu_t Pjbu;
+
+// Note: The following initialization is not strictly required but it makes
+// gcc -Wall happy because there is an "impossible" path from Immed handling to
+// SM1LeafLImm code that looks like Pjll might be used before set:
+
+ Pjll_t Pjll = (Pjll_t) NULL;
+ Word_t state; // current state in SM.
+ Word_t digit; // next digit to decode from Index.
+
+// Note: The following initialization is not strictly required but it makes
+// gcc -Wall happy because there is an "impossible" path from Immed handling to
+// SM1LeafLImm code (for JudyL & JudyPrev only) that looks like pop1 might be
+// used before set:
+
+#if (defined(JUDYL) && defined(JUDYPREV))
+ Word_t pop1 = 0; // in a leaf.
+#else
+ Word_t pop1; // in a leaf.
+#endif
+ int offset; // linear branch/leaf, from j__udySearchLeaf*().
+ int subexp; // subexpanse in a bitmap branch.
+ Word_t bitposmask; // bit in bitmap for Index.
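+
+// (Illustrative caller-side sketch, not part of the upstream sources: a
+// typical forward scan of a JudyL array through this function, using the
+// public <Judy.h> entry points and the caller's array pointer PJLArray,
+// looks like
+//
+//     Word_t Index = 0;  // start from the lowest possible Index.
+//     PPvoid_t PValue = JudyLFirst(PJLArray, &Index, PJE0);
+//     while (PValue != (PPvoid_t) NULL)
+//     {
+//         // ... use Index and *(PWord_t) PValue here ...
+//         PValue = JudyLNext(PJLArray, &Index, PJE0);
+//     }
+//
+// each call advances *PIndex in place to the found Index, or returns NULL
+// when no higher Index remains, per the overview above.)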
+ +// History for SM2Backtrack: +// +// For a given histnum, APjphist[histnum] is a parent JP that points to a +// branch, and Aoffhist[histnum] is the offset of the NEXT JP in the branch to +// which the parent JP points. The meaning of Aoffhist[histnum] depends on the +// type of branch to which the parent JP points: +// +// Linear: Offset of the next JP in the JP list. +// +// Bitmap: Which subexpanse, plus the offset of the next JP in the +// subexpanses JP list (to avoid bit-counting again), plus for Judy*Next(), +// hidden one byte to the left, which digit, because Judy*Next() also needs +// this. +// +// Uncompressed: Digit, which is actually the offset of the JP in the branch. +// +// Note: Only branch JPs are stored in APjphist[] because, as explained +// earlier, SM1Get shortcuts sideways searches in leaves (and even in branches +// in some cases), so SM2Backtrack only handles branches. + +#define HISTNUMMAX cJU_ROOTSTATE // maximum branches traversable. + Pjp_t APjphist[HISTNUMMAX]; // list of branch JPs traversed. + int Aoffhist[HISTNUMMAX]; // list of next JP offsets; see above. + int histnum = 0; // number of JPs now in list. + + +// ---------------------------------------------------------------------------- +// M A C R O S +// +// These are intended to make the code a bit more readable and less redundant. + + +// "PUSH" AND "POP" Pjp AND offset ON HISTORY STACKS: +// +// Note: Ensure a corrupt Judy array does not overflow *hist[]. Meanwhile, +// underflowing *hist[] simply means theres no more room to backtrack => +// "no previous/next Index". + +#define HISTPUSH(Pjp,Offset) \ + APjphist[histnum] = (Pjp); \ + Aoffhist[histnum] = (Offset); \ + \ + if (++histnum >= HISTNUMMAX) \ + { \ + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT) \ + JUDY1CODE(return(JERRI );) \ + JUDYLCODE(return(PPJERR);) \ + } + +#define HISTPOP(Pjp,Offset) \ + if ((histnum--) < 1) JU_RET_NOTFOUND; \ + (Pjp) = APjphist[histnum]; \ + (Offset) = Aoffhist[histnum] + +// How to pack/unpack Aoffhist[] values for bitmap branches: + +#ifdef JUDYPREV + +#define HISTPUSHBOFF(Subexp,Offset,Digit) \ + (((Subexp) * cJU_BITSPERSUBEXPB) | (Offset)) + +#define HISTPOPBOFF(Subexp,Offset,Digit) \ + (Subexp) = (Offset) / cJU_BITSPERSUBEXPB; \ + (Offset) %= cJU_BITSPERSUBEXPB +#else + +#define HISTPUSHBOFF(Subexp,Offset,Digit) \ + (((Digit) << cJU_BITSPERBYTE) \ + | ((Subexp) * cJU_BITSPERSUBEXPB) | (Offset)) + +#define HISTPOPBOFF(Subexp,Offset,Digit) \ + (Digit) = (Offset) >> cJU_BITSPERBYTE; \ + (Subexp) = ((Offset) & JU_LEASTBYTESMASK(1)) / cJU_BITSPERSUBEXPB; \ + (Offset) %= cJU_BITSPERSUBEXPB +#endif + + +// CHECK FOR NULL JP: + +#define JPNULL(Type) (((Type) >= cJU_JPNULL1) && ((Type) <= cJU_JPNULLMAX)) + + +// SEARCH A BITMAP: +// +// This is a weak analog of j__udySearchLeaf*() for bitmaps. Return the actual +// or next-left position, base 0, of Digit in the single uint32_t bitmap, also +// given a Bitposmask for Digit. +// +// Unlike j__udySearchLeaf*(), the offset is not returned bit-complemented if +// Digits bit is unset, because the caller can check the bitmap themselves to +// determine that. Also, if Digits bit is unset, the returned offset is to +// the next-left JP (including -1), not to the "ideal" position for the Index = +// next-right JP. +// +// Shortcut and skip calling j__udyCountBits*() if the bitmap is full, in which +// case (Digit % cJU_BITSPERSUBEXP*) itself is the base-0 offset. +// +// TBD for Judy*Next(): Should this return next-right instead of next-left? +// That is, +1 from current value? 
Maybe not, if Digits bit IS set, +1 would +// be wrong. + +#define SEARCHBITMAPB(Bitmap,Digit,Bitposmask) \ + (((Bitmap) == cJU_FULLBITMAPB) ? (Digit % cJU_BITSPERSUBEXPB) : \ + j__udyCountBitsB((Bitmap) & JU_MASKLOWERINC(Bitposmask)) - 1) + +#define SEARCHBITMAPL(Bitmap,Digit,Bitposmask) \ + (((Bitmap) == cJU_FULLBITMAPL) ? (Digit % cJU_BITSPERSUBEXPL) : \ + j__udyCountBitsL((Bitmap) & JU_MASKLOWERINC(Bitposmask)) - 1) + +#ifdef JUDYPREV +// Equivalent to search for the highest offset in Bitmap: + +#define SEARCHBITMAPMAXB(Bitmap) \ + (((Bitmap) == cJU_FULLBITMAPB) ? cJU_BITSPERSUBEXPB - 1 : \ + j__udyCountBitsB(Bitmap) - 1) + +#define SEARCHBITMAPMAXL(Bitmap) \ + (((Bitmap) == cJU_FULLBITMAPL) ? cJU_BITSPERSUBEXPL - 1 : \ + j__udyCountBitsL(Bitmap) - 1) +#endif + + +// CHECK DECODE BYTES: +// +// Check Decode bytes in a JP against the equivalent portion of *PIndex. If +// *PIndex is lower (for Judy*Prev()) or higher (for Judy*Next()), this JP is a +// dead end (the same as if it had been absent in a linear or bitmap branch or +// null in an uncompressed branch), enter SM2Backtrack; otherwise enter +// SM3Findlimit to find the highest/lowest Index under this JP, as if the code +// had already backtracked to this JP. + +#ifdef JUDYPREV +#define CDcmp__ < +#else +#define CDcmp__ > +#endif + +#define CHECKDCD(cState) \ + if (JU_DCDNOTMATCHINDEX(*PIndex, Pjp, cState)) \ + { \ + if ((*PIndex & cJU_DCDMASK(cState)) \ + CDcmp__(JU_JPDCDPOP0(Pjp) & cJU_DCDMASK(cState))) \ + { \ + goto SM2Backtrack; \ + } \ + goto SM3Findlimit; \ + } + + +// PREPARE TO HANDLE A LEAFW OR JRP BRANCH IN SM1: +// +// Extract a state-dependent digit from Index in a "constant" way, then jump to +// common code for multiple cases. + +#define SM1PREPB(cState,Next) \ + state = (cState); \ + digit = JU_DIGITATSTATE(*PIndex, cState); \ + goto Next + + +// PREPARE TO HANDLE A LEAFW OR JRP BRANCH IN SM3: +// +// Optionally save Dcd bytes into *PIndex, then save state and jump to common +// code for multiple cases. + +#define SM3PREPB_DCD(cState,Next) \ + JU_SETDCD(*PIndex, Pjp, cState); \ + SM3PREPB(cState,Next) + +#define SM3PREPB(cState,Next) state = (cState); goto Next + + +// ---------------------------------------------------------------------------- +// CHECK FOR SHORTCUTS: +// +// Error out if PIndex is null. Execute JU_RET_NOTFOUND if the Judy array is +// empty or *PIndex is already the minimum/maximum Index possible. +// +// Note: As documented, in case of failure *PIndex may be modified. + + if (PIndex == (PWord_t) NULL) + { + JU_SET_ERRNO(PJError, JU_ERRNO_NULLPINDEX); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + +#ifdef JUDYPREV + if ((PArray == (Pvoid_t) NULL) || ((*PIndex)-- == 0)) +#else + if ((PArray == (Pvoid_t) NULL) || ((*PIndex)++ == cJU_ALLONES)) +#endif + JU_RET_NOTFOUND; + + +// HANDLE JRP: +// +// Before even entering SM1Get, check the JRP type. For JRP branches, traverse +// the JPM; handle LEAFW leaves directly; but look for the most common cases +// first. + +// ROOT-STATE LEAF that starts with a Pop0 word; just look within the leaf: +// +// If *PIndex is in the leaf, return it; otherwise return the Index, if any, +// below where it would belong. + + if (JU_LEAFW_POP0(PArray) < cJU_LEAFW_MAXPOP1) // must be a LEAFW + { + Pjlw_t Pjlw = P_JLW(PArray); // first word of leaf. + pop1 = Pjlw[0] + 1; + + if ((offset = j__udySearchLeafW(Pjlw + 1, pop1, *PIndex)) + >= 0) // Index is present. + { + assert(offset < pop1); // in expected range. 
+ JU_RET_FOUND_LEAFW(Pjlw, pop1, offset); // *PIndex is set. + } + +#ifdef JUDYPREV + if ((offset = ~offset) == 0) // no next-left Index. +#else + if ((offset = ~offset) >= pop1) // no next-right Index. +#endif + JU_RET_NOTFOUND; + + assert(offset <= pop1); // valid result. + +#ifdef JUDYPREV + *PIndex = Pjlw[offset--]; // next-left Index, base 1. +#else + *PIndex = Pjlw[offset + 1]; // next-right Index, base 1. +#endif + JU_RET_FOUND_LEAFW(Pjlw, pop1, offset); // base 0. + + } + else // JRP BRANCH + { + Pjpm_t Pjpm = P_JPM(PArray); + Pjp = &(Pjpm->jpm_JP); + +// goto SM1Get; + } + +// ============================================================================ +// STATE MACHINE 1 -- GET INDEX: +// +// Search for *PIndex (already decremented/incremented so as to be inclusive). +// If found, return it. Otherwise in theory hand off to SM2Backtrack or +// SM3Findlimit, but in practice "shortcut" by first sideways searching the +// current branch or leaf upon hitting a dead end. During sideways search, +// modify *PIndex to a new path taken. +// +// ENTRY: Pjp points to next JP to interpret, whose Decode bytes have not yet +// been checked. This JP is not yet listed in history. +// +// Note: Check Decode bytes at the start of each loop, not after looking up a +// new JP, so its easy to do constant shifts/masks, although this requires +// cautious handling of Pjp, offset, and *hist[] for correct entry to +// SM2Backtrack. +// +// EXIT: Return, or branch to SM2Backtrack or SM3Findlimit with correct +// interface, as described elsewhere. +// +// WARNING: For run-time efficiency the following cases replicate code with +// varying constants, rather than using common code with variable values! + +SM1Get: // return here for next branch/leaf. + + switch (JU_JPTYPE(Pjp)) + { + + +// ---------------------------------------------------------------------------- +// LINEAR BRANCH: +// +// Check Decode bytes, if any, in the current JP, then search for a JP for the +// next digit in *PIndex. + + case cJU_JPBRANCH_L2: CHECKDCD(2); SM1PREPB(2, SM1BranchL); + case cJU_JPBRANCH_L3: CHECKDCD(3); SM1PREPB(3, SM1BranchL); +#ifdef JU_64BIT + case cJU_JPBRANCH_L4: CHECKDCD(4); SM1PREPB(4, SM1BranchL); + case cJU_JPBRANCH_L5: CHECKDCD(5); SM1PREPB(5, SM1BranchL); + case cJU_JPBRANCH_L6: CHECKDCD(6); SM1PREPB(6, SM1BranchL); + case cJU_JPBRANCH_L7: CHECKDCD(7); SM1PREPB(7, SM1BranchL); +#endif + case cJU_JPBRANCH_L: SM1PREPB(cJU_ROOTSTATE, SM1BranchL); + +// Common code (state-independent) for all cases of linear branches: + +SM1BranchL: + Pjbl = P_JBL(Pjp->jp_Addr); + +// Found JP matching current digit in *PIndex; record parent JP and the next +// JPs offset, and iterate to the next JP: + + if ((offset = j__udySearchLeaf1((Pjll_t) (Pjbl->jbl_Expanse), + Pjbl->jbl_NumJPs, digit)) >= 0) + { + HISTPUSH(Pjp, offset); + Pjp = (Pjbl->jbl_jp) + offset; + goto SM1Get; + } + +// Dead end, no JP in BranchL for next digit in *PIndex: +// +// Get the ideal location of digits JP, and if theres no next-left/right JP +// in the BranchL, shortcut and start backtracking one level up; ignore the +// current Pjp because it points to a BranchL with no next-left/right JP. + +#ifdef JUDYPREV + if ((offset = (~offset) - 1) < 0) // no next-left JP in BranchL. +#else + if ((offset = (~offset)) >= Pjbl->jbl_NumJPs) // no next-right. 
+#endif + goto SM2Backtrack; + +// Theres a next-left/right JP in the current BranchL; save its digit in +// *PIndex and shortcut to SM3Findlimit: + + JU_SETDIGIT(*PIndex, Pjbl->jbl_Expanse[offset], state); + Pjp = (Pjbl->jbl_jp) + offset; + goto SM3Findlimit; + + +// ---------------------------------------------------------------------------- +// BITMAP BRANCH: +// +// Check Decode bytes, if any, in the current JP, then look for a JP for the +// next digit in *PIndex. + + case cJU_JPBRANCH_B2: CHECKDCD(2); SM1PREPB(2, SM1BranchB); + case cJU_JPBRANCH_B3: CHECKDCD(3); SM1PREPB(3, SM1BranchB); +#ifdef JU_64BIT + case cJU_JPBRANCH_B4: CHECKDCD(4); SM1PREPB(4, SM1BranchB); + case cJU_JPBRANCH_B5: CHECKDCD(5); SM1PREPB(5, SM1BranchB); + case cJU_JPBRANCH_B6: CHECKDCD(6); SM1PREPB(6, SM1BranchB); + case cJU_JPBRANCH_B7: CHECKDCD(7); SM1PREPB(7, SM1BranchB); +#endif + case cJU_JPBRANCH_B: SM1PREPB(cJU_ROOTSTATE, SM1BranchB); + +// Common code (state-independent) for all cases of bitmap branches: + +SM1BranchB: + Pjbb = P_JBB(Pjp->jp_Addr); + +// Locate the digits JP in the subexpanse list, if present, otherwise the +// offset of the next-left JP, if any: + + subexp = digit / cJU_BITSPERSUBEXPB; + assert(subexp < cJU_NUMSUBEXPB); // falls in expected range. + bitposmask = JU_BITPOSMASKB(digit); + offset = SEARCHBITMAPB(JU_JBB_BITMAP(Pjbb, subexp), digit, + bitposmask); + // right range: + assert((offset >= -1) && (offset < (int) cJU_BITSPERSUBEXPB)); + +// Found JP matching current digit in *PIndex: +// +// Record the parent JP and the next JPs offset; and iterate to the next JP. + +// if (JU_BITMAPTESTB(Pjbb, digit)) // slower. + if (JU_JBB_BITMAP(Pjbb, subexp) & bitposmask) // faster. + { + // not negative since at least one bit is set: + assert(offset >= 0); + + HISTPUSH(Pjp, HISTPUSHBOFF(subexp, offset, digit)); + + if ((Pjp = P_JP(JU_JBB_PJP(Pjbb, subexp))) == (Pjp_t) NULL) + { + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + + Pjp += offset; + goto SM1Get; // iterate to next JP. + } + +// Dead end, no JP in BranchB for next digit in *PIndex: +// +// If theres a next-left/right JP in the current BranchB, shortcut to +// SM3Findlimit. Note: offset is already set to the correct value for the +// next-left/right JP. + +#ifdef JUDYPREV + if (offset >= 0) // next-left JP is in this subexpanse. + goto SM1BranchBFindlimit; + + while (--subexp >= 0) // search next-left subexpanses. +#else + if (JU_JBB_BITMAP(Pjbb, subexp) & JU_MASKHIGHEREXC(bitposmask)) + { + ++offset; // next-left => next-right. + goto SM1BranchBFindlimit; + } + + while (++subexp < cJU_NUMSUBEXPB) // search next-right subexps. +#endif + { + if (! JU_JBB_PJP(Pjbb, subexp)) continue; // empty subexpanse. + +#ifdef JUDYPREV + offset = SEARCHBITMAPMAXB(JU_JBB_BITMAP(Pjbb, subexp)); + // expected range: + assert((offset >= 0) && (offset < cJU_BITSPERSUBEXPB)); +#else + offset = 0; +#endif + +// Save the next-left/right JPs digit in *PIndex: + +SM1BranchBFindlimit: + JU_BITMAPDIGITB(digit, subexp, JU_JBB_BITMAP(Pjbb, subexp), + offset); + JU_SETDIGIT(*PIndex, digit, state); + + if ((Pjp = P_JP(JU_JBB_PJP(Pjbb, subexp))) == (Pjp_t) NULL) + { + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + + Pjp += offset; + goto SM3Findlimit; + } + +// Theres no next-left/right JP in the BranchB: +// +// Shortcut and start backtracking one level up; ignore the current Pjp because +// it points to a BranchB with no next-left/right JP. 
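+
+// (Illustrative trace, not from the upstream sources: if SM1 descended, say,
+// BranchU -> BranchB -> leaf and then dead-ended, SM2Backtrack pops the
+// BranchB history entry first and searches it sideways as above; only when a
+// popped branch has no sideways JP does it pop again, up to the BranchU.)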
+ + goto SM2Backtrack; + + +// ---------------------------------------------------------------------------- +// UNCOMPRESSED BRANCH: +// +// Check Decode bytes, if any, in the current JP, then look for a JP for the +// next digit in *PIndex. + + case cJU_JPBRANCH_U2: CHECKDCD(2); SM1PREPB(2, SM1BranchU); + case cJU_JPBRANCH_U3: CHECKDCD(3); SM1PREPB(3, SM1BranchU); +#ifdef JU_64BIT + case cJU_JPBRANCH_U4: CHECKDCD(4); SM1PREPB(4, SM1BranchU); + case cJU_JPBRANCH_U5: CHECKDCD(5); SM1PREPB(5, SM1BranchU); + case cJU_JPBRANCH_U6: CHECKDCD(6); SM1PREPB(6, SM1BranchU); + case cJU_JPBRANCH_U7: CHECKDCD(7); SM1PREPB(7, SM1BranchU); +#endif + case cJU_JPBRANCH_U: SM1PREPB(cJU_ROOTSTATE, SM1BranchU); + +// Common code (state-independent) for all cases of uncompressed branches: + +SM1BranchU: + Pjbu = P_JBU(Pjp->jp_Addr); + Pjp2 = (Pjbu->jbu_jp) + digit; + +// Found JP matching current digit in *PIndex: +// +// Record the parent JP and the next JPs digit, and iterate to the next JP. +// +// TBD: Instead of this, just goto SM1Get, and add cJU_JPNULL* cases to the +// SM1Get state machine? Then backtrack? However, it means you cant detect +// an inappropriate cJU_JPNULL*, when it occurs in other than a BranchU, and +// return JU_RET_CORRUPT. + + if (! JPNULL(JU_JPTYPE(Pjp2))) // digit has a JP. + { + HISTPUSH(Pjp, digit); + Pjp = Pjp2; + goto SM1Get; + } + +// Dead end, no JP in BranchU for next digit in *PIndex: +// +// Search for a next-left/right JP in the current BranchU, and if one is found, +// save its digit in *PIndex and shortcut to SM3Findlimit: + +#ifdef JUDYPREV + while (digit >= 1) + { + Pjp = (Pjbu->jbu_jp) + (--digit); +#else + while (digit < cJU_BRANCHUNUMJPS - 1) + { + Pjp = (Pjbu->jbu_jp) + (++digit); +#endif + if (JPNULL(JU_JPTYPE(Pjp))) continue; + + JU_SETDIGIT(*PIndex, digit, state); + goto SM3Findlimit; + } + +// Theres no next-left/right JP in the BranchU: +// +// Shortcut and start backtracking one level up; ignore the current Pjp because +// it points to a BranchU with no next-left/right JP. + + goto SM2Backtrack; + + +// ---------------------------------------------------------------------------- +// LINEAR LEAF: +// +// Check Decode bytes, if any, in the current JP, then search the leaf for +// *PIndex. + +#define SM1LEAFL(Func) \ + Pjll = P_JLL(Pjp->jp_Addr); \ + pop1 = JU_JPLEAF_POP0(Pjp) + 1; \ + offset = Func(Pjll, pop1, *PIndex); \ + goto SM1LeafLImm + +#if (defined(JUDYL) || (! defined(JU_64BIT))) + case cJU_JPLEAF1: CHECKDCD(1); SM1LEAFL(j__udySearchLeaf1); +#endif + case cJU_JPLEAF2: CHECKDCD(2); SM1LEAFL(j__udySearchLeaf2); + case cJU_JPLEAF3: CHECKDCD(3); SM1LEAFL(j__udySearchLeaf3); + +#ifdef JU_64BIT + case cJU_JPLEAF4: CHECKDCD(4); SM1LEAFL(j__udySearchLeaf4); + case cJU_JPLEAF5: CHECKDCD(5); SM1LEAFL(j__udySearchLeaf5); + case cJU_JPLEAF6: CHECKDCD(6); SM1LEAFL(j__udySearchLeaf6); + case cJU_JPLEAF7: CHECKDCD(7); SM1LEAFL(j__udySearchLeaf7); +#endif + +// Common code (state-independent) for all cases of linear leaves and +// immediates: + +SM1LeafLImm: + if (offset >= 0) // *PIndex is in LeafL / Immed. +#ifdef JUDY1 + JU_RET_FOUND; +#else + { // JudyL is trickier... + switch (JU_JPTYPE(Pjp)) + { +#if (defined(JUDYL) || (! 
defined(JU_64BIT))) + case cJU_JPLEAF1: JU_RET_FOUND_LEAF1(Pjll, pop1, offset); +#endif + case cJU_JPLEAF2: JU_RET_FOUND_LEAF2(Pjll, pop1, offset); + case cJU_JPLEAF3: JU_RET_FOUND_LEAF3(Pjll, pop1, offset); +#ifdef JU_64BIT + case cJU_JPLEAF4: JU_RET_FOUND_LEAF4(Pjll, pop1, offset); + case cJU_JPLEAF5: JU_RET_FOUND_LEAF5(Pjll, pop1, offset); + case cJU_JPLEAF6: JU_RET_FOUND_LEAF6(Pjll, pop1, offset); + case cJU_JPLEAF7: JU_RET_FOUND_LEAF7(Pjll, pop1, offset); +#endif + + case cJU_JPIMMED_1_01: + case cJU_JPIMMED_2_01: + case cJU_JPIMMED_3_01: +#ifdef JU_64BIT + case cJU_JPIMMED_4_01: + case cJU_JPIMMED_5_01: + case cJU_JPIMMED_6_01: + case cJU_JPIMMED_7_01: +#endif + JU_RET_FOUND_IMM_01(Pjp); + + case cJU_JPIMMED_1_02: + case cJU_JPIMMED_1_03: +#ifdef JU_64BIT + case cJU_JPIMMED_1_04: + case cJU_JPIMMED_1_05: + case cJU_JPIMMED_1_06: + case cJU_JPIMMED_1_07: + case cJU_JPIMMED_2_02: + case cJU_JPIMMED_2_03: + case cJU_JPIMMED_3_02: +#endif + JU_RET_FOUND_IMM(Pjp, offset); + } + + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); // impossible? + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + + } // found *PIndex + +#endif // JUDYL + +// Dead end, no Index in LeafL / Immed for remaining digit(s) in *PIndex: +// +// Get the ideal location of Index, and if theres no next-left/right Index in +// the LeafL / Immed, shortcut and start backtracking one level up; ignore the +// current Pjp because it points to a LeafL / Immed with no next-left/right +// Index. + +#ifdef JUDYPREV + if ((offset = (~offset) - 1) < 0) // no next-left Index. +#else + if ((offset = (~offset)) >= pop1) // no next-right Index. +#endif + goto SM2Backtrack; + +// Theres a next-left/right Index in the current LeafL / Immed; shortcut by +// copying its digit(s) to *PIndex and returning it. +// +// Unfortunately this is pretty hairy, especially avoiding endian issues. +// +// The cJU_JPLEAF* cases are very similar to same-index-size cJU_JPIMMED* cases +// for *_02 and above, but must return differently, at least for JudyL, so +// spell them out separately here at the cost of a little redundant code for +// Judy1. + + switch (JU_JPTYPE(Pjp)) + { +#if (defined(JUDYL) || (! 
defined(JU_64BIT))) + case cJU_JPLEAF1: + + JU_SETDIGIT1(*PIndex, ((uint8_t *) Pjll)[offset]); + JU_RET_FOUND_LEAF1(Pjll, pop1, offset); +#endif + + case cJU_JPLEAF2: + + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(2))) + | ((uint16_t *) Pjll)[offset]; + JU_RET_FOUND_LEAF2(Pjll, pop1, offset); + + case cJU_JPLEAF3: + { + Word_t lsb; + JU_COPY3_PINDEX_TO_LONG(lsb, ((uint8_t *) Pjll) + (3 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(3))) | lsb; + JU_RET_FOUND_LEAF3(Pjll, pop1, offset); + } + +#ifdef JU_64BIT + case cJU_JPLEAF4: + + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(4))) + | ((uint32_t *) Pjll)[offset]; + JU_RET_FOUND_LEAF4(Pjll, pop1, offset); + + case cJU_JPLEAF5: + { + Word_t lsb; + JU_COPY5_PINDEX_TO_LONG(lsb, ((uint8_t *) Pjll) + (5 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(5))) | lsb; + JU_RET_FOUND_LEAF5(Pjll, pop1, offset); + } + + case cJU_JPLEAF6: + { + Word_t lsb; + JU_COPY6_PINDEX_TO_LONG(lsb, ((uint8_t *) Pjll) + (6 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(6))) | lsb; + JU_RET_FOUND_LEAF6(Pjll, pop1, offset); + } + + case cJU_JPLEAF7: + { + Word_t lsb; + JU_COPY7_PINDEX_TO_LONG(lsb, ((uint8_t *) Pjll) + (7 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(7))) | lsb; + JU_RET_FOUND_LEAF7(Pjll, pop1, offset); + } + +#endif // JU_64BIT + +#define SET_01(cState) JU_SETDIGITS(*PIndex, JU_JPDCDPOP0(Pjp), cState) + + case cJU_JPIMMED_1_01: SET_01(1); goto SM1Imm_01; + case cJU_JPIMMED_2_01: SET_01(2); goto SM1Imm_01; + case cJU_JPIMMED_3_01: SET_01(3); goto SM1Imm_01; +#ifdef JU_64BIT + case cJU_JPIMMED_4_01: SET_01(4); goto SM1Imm_01; + case cJU_JPIMMED_5_01: SET_01(5); goto SM1Imm_01; + case cJU_JPIMMED_6_01: SET_01(6); goto SM1Imm_01; + case cJU_JPIMMED_7_01: SET_01(7); goto SM1Imm_01; +#endif +SM1Imm_01: JU_RET_FOUND_IMM_01(Pjp); + +// Shorthand for where to find start of Index bytes array: + +#ifdef JUDY1 +#define PJI (Pjp->jp_1Index) +#else +#define PJI (Pjp->jp_LIndex) +#endif + + case cJU_JPIMMED_1_02: + case cJU_JPIMMED_1_03: +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_1_04: + case cJU_JPIMMED_1_05: + case cJU_JPIMMED_1_06: + case cJU_JPIMMED_1_07: +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_1_08: + case cJ1_JPIMMED_1_09: + case cJ1_JPIMMED_1_10: + case cJ1_JPIMMED_1_11: + case cJ1_JPIMMED_1_12: + case cJ1_JPIMMED_1_13: + case cJ1_JPIMMED_1_14: + case cJ1_JPIMMED_1_15: +#endif + JU_SETDIGIT1(*PIndex, ((uint8_t *) PJI)[offset]); + JU_RET_FOUND_IMM(Pjp, offset); + +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_2_02: + case cJU_JPIMMED_2_03: +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_2_04: + case cJ1_JPIMMED_2_05: + case cJ1_JPIMMED_2_06: + case cJ1_JPIMMED_2_07: +#endif +#if (defined(JUDY1) || defined(JU_64BIT)) + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(2))) + | ((uint16_t *) PJI)[offset]; + JU_RET_FOUND_IMM(Pjp, offset); +#endif + +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_3_02: +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_3_03: + case cJ1_JPIMMED_3_04: + case cJ1_JPIMMED_3_05: +#endif +#if (defined(JUDY1) || defined(JU_64BIT)) + { + Word_t lsb; + JU_COPY3_PINDEX_TO_LONG(lsb, ((uint8_t *) PJI) + (3 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(3))) | lsb; + JU_RET_FOUND_IMM(Pjp, offset); + } +#endif + +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_4_02: + case cJ1_JPIMMED_4_03: + + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(4))) + | ((uint32_t *) PJI)[offset]; + 
JU_RET_FOUND_IMM(Pjp, offset); + + case cJ1_JPIMMED_5_02: + case cJ1_JPIMMED_5_03: + { + Word_t lsb; + JU_COPY5_PINDEX_TO_LONG(lsb, ((uint8_t *) PJI) + (5 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(5))) | lsb; + JU_RET_FOUND_IMM(Pjp, offset); + } + + case cJ1_JPIMMED_6_02: + { + Word_t lsb; + JU_COPY6_PINDEX_TO_LONG(lsb, ((uint8_t *) PJI) + (6 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(6))) | lsb; + JU_RET_FOUND_IMM(Pjp, offset); + } + + case cJ1_JPIMMED_7_02: + { + Word_t lsb; + JU_COPY7_PINDEX_TO_LONG(lsb, ((uint8_t *) PJI) + (7 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(7))) | lsb; + JU_RET_FOUND_IMM(Pjp, offset); + } + +#endif // (JUDY1 && JU_64BIT) + + } // switch for not-found *PIndex + + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); // impossible? + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + + +// ---------------------------------------------------------------------------- +// BITMAP LEAF: +// +// Check Decode bytes, if any, in the current JP, then look in the leaf for +// *PIndex. + + case cJU_JPLEAF_B1: + { + Pjlb_t Pjlb; + CHECKDCD(1); + + Pjlb = P_JLB(Pjp->jp_Addr); + digit = JU_DIGITATSTATE(*PIndex, 1); + subexp = JU_SUBEXPL(digit); + bitposmask = JU_BITPOSMASKL(digit); + assert(subexp < cJU_NUMSUBEXPL); // falls in expected range. + +// *PIndex exists in LeafB1: + +// if (JU_BITMAPTESTL(Pjlb, digit)) // slower. + if (JU_JLB_BITMAP(Pjlb, subexp) & bitposmask) // faster. + { +#ifdef JUDYL // needs offset at this point: + offset = SEARCHBITMAPL(JU_JLB_BITMAP(Pjlb, subexp), digit, bitposmask); +#endif + JU_RET_FOUND_LEAF_B1(Pjlb, subexp, offset); +// == return((PPvoid_t) (P_JV(JL_JLB_PVALUE(Pjlb, subexp)) + (offset))); + } + +// Dead end, no Index in LeafB1 for remaining digit in *PIndex: +// +// If theres a next-left/right Index in the current LeafB1, which for +// Judy*Next() is true if any bits are set for higher Indexes, shortcut by +// returning it. Note: For Judy*Prev(), offset is set here to the correct +// value for the next-left JP. + + offset = SEARCHBITMAPL(JU_JLB_BITMAP(Pjlb, subexp), digit, + bitposmask); + // right range: + assert((offset >= -1) && (offset < (int) cJU_BITSPERSUBEXPL)); + +#ifdef JUDYPREV + if (offset >= 0) // next-left JP is in this subexpanse. + goto SM1LeafB1Findlimit; + + while (--subexp >= 0) // search next-left subexpanses. +#else + if (JU_JLB_BITMAP(Pjlb, subexp) & JU_MASKHIGHEREXC(bitposmask)) + { + ++offset; // next-left => next-right. + goto SM1LeafB1Findlimit; + } + + while (++subexp < cJU_NUMSUBEXPL) // search next-right subexps. +#endif + { + if (! JU_JLB_BITMAP(Pjlb, subexp)) continue; // empty subexp. + +#ifdef JUDYPREV + offset = SEARCHBITMAPMAXL(JU_JLB_BITMAP(Pjlb, subexp)); + // expected range: + assert((offset >= 0) && (offset < (int) cJU_BITSPERSUBEXPL)); +#else + offset = 0; +#endif + +// Save the next-left/right Indexess digit in *PIndex: + +SM1LeafB1Findlimit: + JU_BITMAPDIGITL(digit, subexp, JU_JLB_BITMAP(Pjlb, subexp), offset); + JU_SETDIGIT1(*PIndex, digit); + JU_RET_FOUND_LEAF_B1(Pjlb, subexp, offset); +// == return((PPvoid_t) (P_JV(JL_JLB_PVALUE(Pjlb, subexp)) + (offset))); + } + +// Theres no next-left/right Index in the LeafB1: +// +// Shortcut and start backtracking one level up; ignore the current Pjp because +// it points to a LeafB1 with no next-left/right Index. 
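+
+// (Illustrative note, not from the upstream sources: reaching this point
+// means every subexpanse bitmap on the search side of the digit was empty,
+// so any answer must come from a branch higher in the tree, found via the
+// history stack.)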
+ + goto SM2Backtrack; + + } // case cJU_JPLEAF_B1 + +#ifdef JUDY1 +// ---------------------------------------------------------------------------- +// FULL POPULATION: +// +// If the Decode bytes match, *PIndex is found (without modification). + + case cJ1_JPFULLPOPU1: + + CHECKDCD(1); + JU_RET_FOUND_FULLPOPU1; +#endif + + +// ---------------------------------------------------------------------------- +// IMMEDIATE: + +#ifdef JUDYPREV +#define SM1IMM_SETPOP1(cPop1) +#else +#define SM1IMM_SETPOP1(cPop1) pop1 = (cPop1) +#endif + +#define SM1IMM(Func,cPop1) \ + SM1IMM_SETPOP1(cPop1); \ + offset = Func((Pjll_t) (PJI), cPop1, *PIndex); \ + goto SM1LeafLImm + +// Special case for Pop1 = 1 Immediate JPs: +// +// If *PIndex is in the immediate, offset is 0, otherwise the binary NOT of the +// offset where it belongs, 0 or 1, same as from the search functions. + +#ifdef JUDYPREV +#define SM1IMM_01_SETPOP1 +#else +#define SM1IMM_01_SETPOP1 pop1 = 1 +#endif + +#define SM1IMM_01 \ + SM1IMM_01_SETPOP1; \ + offset = ((JU_JPDCDPOP0(Pjp) < JU_TRIMTODCDSIZE(*PIndex)) ? ~1 : \ + (JU_JPDCDPOP0(Pjp) == JU_TRIMTODCDSIZE(*PIndex)) ? 0 : \ + ~0); \ + goto SM1LeafLImm + + case cJU_JPIMMED_1_01: + case cJU_JPIMMED_2_01: + case cJU_JPIMMED_3_01: +#ifdef JU_64BIT + case cJU_JPIMMED_4_01: + case cJU_JPIMMED_5_01: + case cJU_JPIMMED_6_01: + case cJU_JPIMMED_7_01: +#endif + SM1IMM_01; + +// TBD: Doug says it would be OK to have fewer calls and calculate arg 2, here +// and in Judy*Count() also. + + case cJU_JPIMMED_1_02: SM1IMM(j__udySearchLeaf1, 2); + case cJU_JPIMMED_1_03: SM1IMM(j__udySearchLeaf1, 3); +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_1_04: SM1IMM(j__udySearchLeaf1, 4); + case cJU_JPIMMED_1_05: SM1IMM(j__udySearchLeaf1, 5); + case cJU_JPIMMED_1_06: SM1IMM(j__udySearchLeaf1, 6); + case cJU_JPIMMED_1_07: SM1IMM(j__udySearchLeaf1, 7); +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_1_08: SM1IMM(j__udySearchLeaf1, 8); + case cJ1_JPIMMED_1_09: SM1IMM(j__udySearchLeaf1, 9); + case cJ1_JPIMMED_1_10: SM1IMM(j__udySearchLeaf1, 10); + case cJ1_JPIMMED_1_11: SM1IMM(j__udySearchLeaf1, 11); + case cJ1_JPIMMED_1_12: SM1IMM(j__udySearchLeaf1, 12); + case cJ1_JPIMMED_1_13: SM1IMM(j__udySearchLeaf1, 13); + case cJ1_JPIMMED_1_14: SM1IMM(j__udySearchLeaf1, 14); + case cJ1_JPIMMED_1_15: SM1IMM(j__udySearchLeaf1, 15); +#endif + +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_2_02: SM1IMM(j__udySearchLeaf2, 2); + case cJU_JPIMMED_2_03: SM1IMM(j__udySearchLeaf2, 3); +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_2_04: SM1IMM(j__udySearchLeaf2, 4); + case cJ1_JPIMMED_2_05: SM1IMM(j__udySearchLeaf2, 5); + case cJ1_JPIMMED_2_06: SM1IMM(j__udySearchLeaf2, 6); + case cJ1_JPIMMED_2_07: SM1IMM(j__udySearchLeaf2, 7); +#endif + +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_3_02: SM1IMM(j__udySearchLeaf3, 2); +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_3_03: SM1IMM(j__udySearchLeaf3, 3); + case cJ1_JPIMMED_3_04: SM1IMM(j__udySearchLeaf3, 4); + case cJ1_JPIMMED_3_05: SM1IMM(j__udySearchLeaf3, 5); + + case cJ1_JPIMMED_4_02: SM1IMM(j__udySearchLeaf4, 2); + case cJ1_JPIMMED_4_03: SM1IMM(j__udySearchLeaf4, 3); + + case cJ1_JPIMMED_5_02: SM1IMM(j__udySearchLeaf5, 2); + case cJ1_JPIMMED_5_03: SM1IMM(j__udySearchLeaf5, 3); + + case cJ1_JPIMMED_6_02: SM1IMM(j__udySearchLeaf6, 2); + + case cJ1_JPIMMED_7_02: SM1IMM(j__udySearchLeaf7, 2); +#endif + + +// ---------------------------------------------------------------------------- 
+// INVALID JP TYPE: + + default: JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + + } // SM1Get switch. + + /*NOTREACHED*/ + + +// ============================================================================ +// STATE MACHINE 2 -- BACKTRACK BRANCH TO PREVIOUS JP: +// +// Look for the next-left/right JP in a branch, backing up the history list as +// necessary. Upon finding a next-left/right JP, modify the corresponding +// digit in *PIndex before passing control to SM3Findlimit. +// +// Note: As described earlier, only branch JPs are expected here; other types +// fall into the default case. +// +// Note: If a found JP contains needed Dcd bytes, thats OK, theyre copied to +// *PIndex in SM3Findlimit. +// +// TBD: This code has a lot in common with similar code in the shortcut cases +// in SM1Get. Can combine this code somehow? +// +// ENTRY: List, possibly empty, of JPs and offsets in APjphist[] and +// Aoffhist[]; see earlier comments. +// +// EXIT: Execute JU_RET_NOTFOUND if no previous/next JP; otherwise jump to +// SM3Findlimit to resume a new but different downward search. + +SM2Backtrack: // come or return here for first/next sideways search. + + HISTPOP(Pjp, offset); + + switch (JU_JPTYPE(Pjp)) + { + + +// ---------------------------------------------------------------------------- +// LINEAR BRANCH: + + case cJU_JPBRANCH_L2: state = 2; goto SM2BranchL; + case cJU_JPBRANCH_L3: state = 3; goto SM2BranchL; +#ifdef JU_64BIT + case cJU_JPBRANCH_L4: state = 4; goto SM2BranchL; + case cJU_JPBRANCH_L5: state = 5; goto SM2BranchL; + case cJU_JPBRANCH_L6: state = 6; goto SM2BranchL; + case cJU_JPBRANCH_L7: state = 7; goto SM2BranchL; +#endif + case cJU_JPBRANCH_L: state = cJU_ROOTSTATE; goto SM2BranchL; + +SM2BranchL: +#ifdef JUDYPREV + if (--offset < 0) goto SM2Backtrack; // no next-left JP in BranchL. +#endif + Pjbl = P_JBL(Pjp->jp_Addr); +#ifdef JUDYNEXT + if (++offset >= (Pjbl->jbl_NumJPs)) goto SM2Backtrack; + // no next-right JP in BranchL. +#endif + +// Theres a next-left/right JP in the current BranchL; save its digit in +// *PIndex and continue with SM3Findlimit: + + JU_SETDIGIT(*PIndex, Pjbl->jbl_Expanse[offset], state); + Pjp = (Pjbl->jbl_jp) + offset; + goto SM3Findlimit; + + +// ---------------------------------------------------------------------------- +// BITMAP BRANCH: + + case cJU_JPBRANCH_B2: state = 2; goto SM2BranchB; + case cJU_JPBRANCH_B3: state = 3; goto SM2BranchB; +#ifdef JU_64BIT + case cJU_JPBRANCH_B4: state = 4; goto SM2BranchB; + case cJU_JPBRANCH_B5: state = 5; goto SM2BranchB; + case cJU_JPBRANCH_B6: state = 6; goto SM2BranchB; + case cJU_JPBRANCH_B7: state = 7; goto SM2BranchB; +#endif + case cJU_JPBRANCH_B: state = cJU_ROOTSTATE; goto SM2BranchB; + +SM2BranchB: + Pjbb = P_JBB(Pjp->jp_Addr); + HISTPOPBOFF(subexp, offset, digit); // unpack values. + +// If theres a next-left/right JP in the current BranchB, which for +// Judy*Next() is true if any bits are set for higher Indexes, continue to +// SM3Findlimit: +// +// Note: offset is set to the JP previously traversed; go one to the +// left/right. + +#ifdef JUDYPREV + if (offset > 0) // next-left JP is in this subexpanse. + { + --offset; + goto SM2BranchBFindlimit; + } + + while (--subexp >= 0) // search next-left subexpanses. +#else + if (JU_JBB_BITMAP(Pjbb, subexp) + & JU_MASKHIGHEREXC(JU_BITPOSMASKB(digit))) + { + ++offset; // next-left => next-right. + goto SM2BranchBFindlimit; + } + + while (++subexp < cJU_NUMSUBEXPB) // search next-right subexps. 
+#endif + { + if (! JU_JBB_PJP(Pjbb, subexp)) continue; // empty subexpanse. + +#ifdef JUDYPREV + offset = SEARCHBITMAPMAXB(JU_JBB_BITMAP(Pjbb, subexp)); + // expected range: + assert((offset >= 0) && (offset < cJU_BITSPERSUBEXPB)); +#else + offset = 0; +#endif + +// Save the next-left/right JPs digit in *PIndex: + +SM2BranchBFindlimit: + JU_BITMAPDIGITB(digit, subexp, JU_JBB_BITMAP(Pjbb, subexp), + offset); + JU_SETDIGIT(*PIndex, digit, state); + + if ((Pjp = P_JP(JU_JBB_PJP(Pjbb, subexp))) == (Pjp_t) NULL) + { + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + + Pjp += offset; + goto SM3Findlimit; + } + +// Theres no next-left/right JP in the BranchB: + + goto SM2Backtrack; + + +// ---------------------------------------------------------------------------- +// UNCOMPRESSED BRANCH: + + case cJU_JPBRANCH_U2: state = 2; goto SM2BranchU; + case cJU_JPBRANCH_U3: state = 3; goto SM2BranchU; +#ifdef JU_64BIT + case cJU_JPBRANCH_U4: state = 4; goto SM2BranchU; + case cJU_JPBRANCH_U5: state = 5; goto SM2BranchU; + case cJU_JPBRANCH_U6: state = 6; goto SM2BranchU; + case cJU_JPBRANCH_U7: state = 7; goto SM2BranchU; +#endif + case cJU_JPBRANCH_U: state = cJU_ROOTSTATE; goto SM2BranchU; + +SM2BranchU: + +// Search for a next-left/right JP in the current BranchU, and if one is found, +// save its digit in *PIndex and continue to SM3Findlimit: + + Pjbu = P_JBU(Pjp->jp_Addr); + digit = offset; + +#ifdef JUDYPREV + while (digit >= 1) + { + Pjp = (Pjbu->jbu_jp) + (--digit); +#else + while (digit < cJU_BRANCHUNUMJPS - 1) + { + Pjp = (Pjbu->jbu_jp) + (++digit); +#endif + if (JPNULL(JU_JPTYPE(Pjp))) continue; + + JU_SETDIGIT(*PIndex, digit, state); + goto SM3Findlimit; + } + +// Theres no next-left/right JP in the BranchU: + + goto SM2Backtrack; + + +// ---------------------------------------------------------------------------- +// INVALID JP TYPE: + + default: JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + + } // SM2Backtrack switch. + + /*NOTREACHED*/ + + +// ============================================================================ +// STATE MACHINE 3 -- FIND LIMIT JP/INDEX: +// +// Look for the highest/lowest (right/left-most) JP in each branch and the +// highest/lowest Index in a leaf or immediate, and return it. While +// traversing, modify appropriate digit(s) in *PIndex to reflect the path +// taken, including Dcd bytes in each JP (which could hold critical missing +// digits for skipped branches). +// +// ENTRY: Pjp set to a JP under which to find max/min JPs (if a branch JP) or +// a max/min Index and return (if a leaf or immediate JP). +// +// EXIT: Execute JU_RET_FOUND* upon reaching a leaf or immediate. Should be +// impossible to fail, unless the Judy array is corrupt. + +SM3Findlimit: // come or return here for first/next branch/leaf. + + switch (JU_JPTYPE(Pjp)) + { +// ---------------------------------------------------------------------------- +// LINEAR BRANCH: +// +// Simply use the highest/lowest (right/left-most) JP in the BranchL, but first +// copy the Dcd bytes to *PIndex if there are any (only if state < +// cJU_ROOTSTATE - 1). 
+ + case cJU_JPBRANCH_L2: SM3PREPB_DCD(2, SM3BranchL); +#ifndef JU_64BIT + case cJU_JPBRANCH_L3: SM3PREPB( 3, SM3BranchL); +#else + case cJU_JPBRANCH_L3: SM3PREPB_DCD(3, SM3BranchL); + case cJU_JPBRANCH_L4: SM3PREPB_DCD(4, SM3BranchL); + case cJU_JPBRANCH_L5: SM3PREPB_DCD(5, SM3BranchL); + case cJU_JPBRANCH_L6: SM3PREPB_DCD(6, SM3BranchL); + case cJU_JPBRANCH_L7: SM3PREPB( 7, SM3BranchL); +#endif + case cJU_JPBRANCH_L: SM3PREPB( cJU_ROOTSTATE, SM3BranchL); + +SM3BranchL: + Pjbl = P_JBL(Pjp->jp_Addr); + +#ifdef JUDYPREV + if ((offset = (Pjbl->jbl_NumJPs) - 1) < 0) +#else + offset = 0; if ((Pjbl->jbl_NumJPs) == 0) +#endif + { + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + + JU_SETDIGIT(*PIndex, Pjbl->jbl_Expanse[offset], state); + Pjp = (Pjbl->jbl_jp) + offset; + goto SM3Findlimit; + + +// ---------------------------------------------------------------------------- +// BITMAP BRANCH: +// +// Look for the highest/lowest (right/left-most) non-null subexpanse, then use +// the highest/lowest JP in that subexpanse, but first copy Dcd bytes, if there +// are any (only if state < cJU_ROOTSTATE - 1), to *PIndex. + + case cJU_JPBRANCH_B2: SM3PREPB_DCD(2, SM3BranchB); +#ifndef JU_64BIT + case cJU_JPBRANCH_B3: SM3PREPB( 3, SM3BranchB); +#else + case cJU_JPBRANCH_B3: SM3PREPB_DCD(3, SM3BranchB); + case cJU_JPBRANCH_B4: SM3PREPB_DCD(4, SM3BranchB); + case cJU_JPBRANCH_B5: SM3PREPB_DCD(5, SM3BranchB); + case cJU_JPBRANCH_B6: SM3PREPB_DCD(6, SM3BranchB); + case cJU_JPBRANCH_B7: SM3PREPB( 7, SM3BranchB); +#endif + case cJU_JPBRANCH_B: SM3PREPB( cJU_ROOTSTATE, SM3BranchB); + +SM3BranchB: + Pjbb = P_JBB(Pjp->jp_Addr); +#ifdef JUDYPREV + subexp = cJU_NUMSUBEXPB; + + while (! (JU_JBB_BITMAP(Pjbb, --subexp))) // find non-empty subexp. + { + if (subexp <= 0) // wholly empty bitmap. + { + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + } + + offset = SEARCHBITMAPMAXB(JU_JBB_BITMAP(Pjbb, subexp)); + // expected range: + assert((offset >= 0) && (offset < cJU_BITSPERSUBEXPB)); +#else + subexp = -1; + + while (! (JU_JBB_BITMAP(Pjbb, ++subexp))) // find non-empty subexp. + { + if (subexp >= cJU_NUMSUBEXPB - 1) // didnt find one. + { + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + } + + offset = 0; +#endif + + JU_BITMAPDIGITB(digit, subexp, JU_JBB_BITMAP(Pjbb, subexp), offset); + JU_SETDIGIT(*PIndex, digit, state); + + if ((Pjp = P_JP(JU_JBB_PJP(Pjbb, subexp))) == (Pjp_t) NULL) + { + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + + Pjp += offset; + goto SM3Findlimit; + + +// ---------------------------------------------------------------------------- +// UNCOMPRESSED BRANCH: +// +// Look for the highest/lowest (right/left-most) non-null JP, and use it, but +// first copy Dcd bytes to *PIndex if there are any (only if state < +// cJU_ROOTSTATE - 1). 
+ + case cJU_JPBRANCH_U2: SM3PREPB_DCD(2, SM3BranchU); +#ifndef JU_64BIT + case cJU_JPBRANCH_U3: SM3PREPB( 3, SM3BranchU); +#else + case cJU_JPBRANCH_U3: SM3PREPB_DCD(3, SM3BranchU); + case cJU_JPBRANCH_U4: SM3PREPB_DCD(4, SM3BranchU); + case cJU_JPBRANCH_U5: SM3PREPB_DCD(5, SM3BranchU); + case cJU_JPBRANCH_U6: SM3PREPB_DCD(6, SM3BranchU); + case cJU_JPBRANCH_U7: SM3PREPB( 7, SM3BranchU); +#endif + case cJU_JPBRANCH_U: SM3PREPB( cJU_ROOTSTATE, SM3BranchU); + +SM3BranchU: + Pjbu = P_JBU(Pjp->jp_Addr); +#ifdef JUDYPREV + digit = cJU_BRANCHUNUMJPS; + + while (digit >= 1) + { + Pjp = (Pjbu->jbu_jp) + (--digit); +#else + + for (digit = 0; digit < cJU_BRANCHUNUMJPS; ++digit) + { + Pjp = (Pjbu->jbu_jp) + digit; +#endif + if (JPNULL(JU_JPTYPE(Pjp))) continue; + + JU_SETDIGIT(*PIndex, digit, state); + goto SM3Findlimit; + } + +// No non-null JPs in BranchU: + + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + + +// ---------------------------------------------------------------------------- +// LINEAR LEAF: +// +// Simply use the highest/lowest (right/left-most) Index in the LeafL, but the +// details vary depending on leaf Index Size. First copy Dcd bytes, if there +// are any (only if state < cJU_ROOTSTATE - 1), to *PIndex. + +#define SM3LEAFLDCD(cState) \ + JU_SETDCD(*PIndex, Pjp, cState); \ + SM3LEAFLNODCD + +#ifdef JUDY1 +#define SM3LEAFL_SETPOP1 // not needed in any cases. +#else +#define SM3LEAFL_SETPOP1 pop1 = JU_JPLEAF_POP0(Pjp) + 1 +#endif + +#ifdef JUDYPREV +#define SM3LEAFLNODCD \ + Pjll = P_JLL(Pjp->jp_Addr); \ + SM3LEAFL_SETPOP1; \ + offset = JU_JPLEAF_POP0(Pjp); assert(offset >= 0) +#else +#define SM3LEAFLNODCD \ + Pjll = P_JLL(Pjp->jp_Addr); \ + SM3LEAFL_SETPOP1; \ + offset = 0; assert(JU_JPLEAF_POP0(Pjp) >= 0); +#endif + +#if (defined(JUDYL) || (! 
defined(JU_64BIT))) + case cJU_JPLEAF1: + + SM3LEAFLDCD(1); + JU_SETDIGIT1(*PIndex, ((uint8_t *) Pjll)[offset]); + JU_RET_FOUND_LEAF1(Pjll, pop1, offset); +#endif + + case cJU_JPLEAF2: + + SM3LEAFLDCD(2); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(2))) + | ((uint16_t *) Pjll)[offset]; + JU_RET_FOUND_LEAF2(Pjll, pop1, offset); + +#ifndef JU_64BIT + case cJU_JPLEAF3: + { + Word_t lsb; + SM3LEAFLNODCD; + JU_COPY3_PINDEX_TO_LONG(lsb, ((uint8_t *) Pjll) + (3 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(3))) | lsb; + JU_RET_FOUND_LEAF3(Pjll, pop1, offset); + } + +#else + case cJU_JPLEAF3: + { + Word_t lsb; + SM3LEAFLDCD(3); + JU_COPY3_PINDEX_TO_LONG(lsb, ((uint8_t *) Pjll) + (3 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(3))) | lsb; + JU_RET_FOUND_LEAF3(Pjll, pop1, offset); + } + + case cJU_JPLEAF4: + + SM3LEAFLDCD(4); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(4))) + | ((uint32_t *) Pjll)[offset]; + JU_RET_FOUND_LEAF4(Pjll, pop1, offset); + + case cJU_JPLEAF5: + { + Word_t lsb; + SM3LEAFLDCD(5); + JU_COPY5_PINDEX_TO_LONG(lsb, ((uint8_t *) Pjll) + (5 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(5))) | lsb; + JU_RET_FOUND_LEAF5(Pjll, pop1, offset); + } + + case cJU_JPLEAF6: + { + Word_t lsb; + SM3LEAFLDCD(6); + JU_COPY6_PINDEX_TO_LONG(lsb, ((uint8_t *) Pjll) + (6 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(6))) | lsb; + JU_RET_FOUND_LEAF6(Pjll, pop1, offset); + } + + case cJU_JPLEAF7: + { + Word_t lsb; + SM3LEAFLNODCD; + JU_COPY7_PINDEX_TO_LONG(lsb, ((uint8_t *) Pjll) + (7 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(7))) | lsb; + JU_RET_FOUND_LEAF7(Pjll, pop1, offset); + } +#endif + + +// ---------------------------------------------------------------------------- +// BITMAP LEAF: +// +// Look for the highest/lowest (right/left-most) non-null subexpanse, then use +// the highest/lowest Index in that subexpanse, but first copy Dcd bytes +// (always present since state 1 < cJU_ROOTSTATE) to *PIndex. + + case cJU_JPLEAF_B1: + { + Pjlb_t Pjlb; + + JU_SETDCD(*PIndex, Pjp, 1); + + Pjlb = P_JLB(Pjp->jp_Addr); +#ifdef JUDYPREV + subexp = cJU_NUMSUBEXPL; + + while (! JU_JLB_BITMAP(Pjlb, --subexp)) // find non-empty subexp. + { + if (subexp <= 0) // wholly empty bitmap. + { + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + } + +// TBD: Might it be faster to just use a variant of BITMAPDIGIT*() that yields +// the digit for the right-most Index with a bit set? + + offset = SEARCHBITMAPMAXL(JU_JLB_BITMAP(Pjlb, subexp)); + // expected range: + assert((offset >= 0) && (offset < cJU_BITSPERSUBEXPL)); +#else + subexp = -1; + + while (! JU_JLB_BITMAP(Pjlb, ++subexp)) // find non-empty subexp. + { + if (subexp >= cJU_NUMSUBEXPL - 1) // didnt find one. + { + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + } + + offset = 0; +#endif + + JU_BITMAPDIGITL(digit, subexp, JU_JLB_BITMAP(Pjlb, subexp), offset); + JU_SETDIGIT1(*PIndex, digit); + JU_RET_FOUND_LEAF_B1(Pjlb, subexp, offset); +// == return((PPvoid_t) (P_JV(JL_JLB_PVALUE(Pjlb, subexp)) + (offset))); + + } // case cJU_JPLEAF_B1 + +#ifdef JUDY1 +// ---------------------------------------------------------------------------- +// FULL POPULATION: +// +// Copy Dcd bytes to *PIndex (always present since state 1 < cJU_ROOTSTATE), +// then set the highest/lowest possible digit as the LSB in *PIndex. 
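The SM3 bitmap-leaf case above reduces to finding the highest (Prev) or lowest (Next) set bit across the subexpanse bitmaps. A minimal standalone sketch of the Prev direction follows; `highest_set_bit` and the plain `uint64_t` subexpanse are illustrative stand-ins, not the real SEARCHBITMAPMAXL/JU_BITMAPDIGITL machinery.

#include <stdint.h>
#include <stdio.h>

/* Position of the highest set bit in one subexpanse bitmap, or -1 when
 * the bitmap is empty; in the Prev direction this bit gives the
 * highest valid digit, and popcount-1 gives its offset. */
static int highest_set_bit(uint64_t w)
{
    int pos = -1;
    while (w) { w >>= 1; ++pos; }
    return pos;
}

int main(void)
{
    uint64_t subexp = 0x0000000000A00001ull; /* bits 0, 21, 23 set. */
    printf("%d\n", highest_set_bit(subexp)); /* prints 23 */
    return 0;
}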
+ + case cJ1_JPFULLPOPU1: + + JU_SETDCD( *PIndex, Pjp, 1); +#ifdef JUDYPREV + JU_SETDIGIT1(*PIndex, cJU_BITSPERBITMAP - 1); +#else + JU_SETDIGIT1(*PIndex, 0); +#endif + JU_RET_FOUND_FULLPOPU1; +#endif // JUDY1 + + +// ---------------------------------------------------------------------------- +// IMMEDIATE: +// +// Simply use the highest/lowest (right/left-most) Index in the Imm, but the +// details vary depending on leaf Index Size and pop1. Note: There are no Dcd +// bytes in an Immediate JP, but in a cJU_JPIMMED_*_01 JP, the field holds the +// least bytes of the immediate Index. + + case cJU_JPIMMED_1_01: SET_01(1); goto SM3Imm_01; + case cJU_JPIMMED_2_01: SET_01(2); goto SM3Imm_01; + case cJU_JPIMMED_3_01: SET_01(3); goto SM3Imm_01; +#ifdef JU_64BIT + case cJU_JPIMMED_4_01: SET_01(4); goto SM3Imm_01; + case cJU_JPIMMED_5_01: SET_01(5); goto SM3Imm_01; + case cJU_JPIMMED_6_01: SET_01(6); goto SM3Imm_01; + case cJU_JPIMMED_7_01: SET_01(7); goto SM3Imm_01; +#endif +SM3Imm_01: JU_RET_FOUND_IMM_01(Pjp); + +#ifdef JUDYPREV +#define SM3IMM_OFFSET(cPop1) (cPop1) - 1 // highest. +#else +#define SM3IMM_OFFSET(cPop1) 0 // lowest. +#endif + +#define SM3IMM(cPop1,Next) \ + offset = SM3IMM_OFFSET(cPop1); \ + goto Next + + case cJU_JPIMMED_1_02: SM3IMM( 2, SM3Imm1); + case cJU_JPIMMED_1_03: SM3IMM( 3, SM3Imm1); +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_1_04: SM3IMM( 4, SM3Imm1); + case cJU_JPIMMED_1_05: SM3IMM( 5, SM3Imm1); + case cJU_JPIMMED_1_06: SM3IMM( 6, SM3Imm1); + case cJU_JPIMMED_1_07: SM3IMM( 7, SM3Imm1); +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_1_08: SM3IMM( 8, SM3Imm1); + case cJ1_JPIMMED_1_09: SM3IMM( 9, SM3Imm1); + case cJ1_JPIMMED_1_10: SM3IMM(10, SM3Imm1); + case cJ1_JPIMMED_1_11: SM3IMM(11, SM3Imm1); + case cJ1_JPIMMED_1_12: SM3IMM(12, SM3Imm1); + case cJ1_JPIMMED_1_13: SM3IMM(13, SM3Imm1); + case cJ1_JPIMMED_1_14: SM3IMM(14, SM3Imm1); + case cJ1_JPIMMED_1_15: SM3IMM(15, SM3Imm1); +#endif + +SM3Imm1: JU_SETDIGIT1(*PIndex, ((uint8_t *) PJI)[offset]); + JU_RET_FOUND_IMM(Pjp, offset); + +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_2_02: SM3IMM(2, SM3Imm2); + case cJU_JPIMMED_2_03: SM3IMM(3, SM3Imm2); +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_2_04: SM3IMM(4, SM3Imm2); + case cJ1_JPIMMED_2_05: SM3IMM(5, SM3Imm2); + case cJ1_JPIMMED_2_06: SM3IMM(6, SM3Imm2); + case cJ1_JPIMMED_2_07: SM3IMM(7, SM3Imm2); +#endif + +#if (defined(JUDY1) || defined(JU_64BIT)) +SM3Imm2: *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(2))) + | ((uint16_t *) PJI)[offset]; + JU_RET_FOUND_IMM(Pjp, offset); +#endif + +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_3_02: SM3IMM(2, SM3Imm3); +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_3_03: SM3IMM(3, SM3Imm3); + case cJ1_JPIMMED_3_04: SM3IMM(4, SM3Imm3); + case cJ1_JPIMMED_3_05: SM3IMM(5, SM3Imm3); +#endif + +#if (defined(JUDY1) || defined(JU_64BIT)) +SM3Imm3: + { + Word_t lsb; + JU_COPY3_PINDEX_TO_LONG(lsb, ((uint8_t *) PJI) + (3 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(3))) | lsb; + JU_RET_FOUND_IMM(Pjp, offset); + } +#endif + +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_4_02: SM3IMM(2, SM3Imm4); + case cJ1_JPIMMED_4_03: SM3IMM(3, SM3Imm4); + +SM3Imm4: *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(4))) + | ((uint32_t *) PJI)[offset]; + JU_RET_FOUND_IMM(Pjp, offset); + + case cJ1_JPIMMED_5_02: SM3IMM(2, SM3Imm5); + case cJ1_JPIMMED_5_03: SM3IMM(3, SM3Imm5); + +SM3Imm5: + { + Word_t lsb; + JU_COPY5_PINDEX_TO_LONG(lsb, 
((uint8_t *) PJI) + (5 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(5))) | lsb; + JU_RET_FOUND_IMM(Pjp, offset); + } + + case cJ1_JPIMMED_6_02: SM3IMM(2, SM3Imm6); + +SM3Imm6: + { + Word_t lsb; + JU_COPY6_PINDEX_TO_LONG(lsb, ((uint8_t *) PJI) + (6 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(6))) | lsb; + JU_RET_FOUND_IMM(Pjp, offset); + } + + case cJ1_JPIMMED_7_02: SM3IMM(2, SM3Imm7); + +SM3Imm7: + { + Word_t lsb; + JU_COPY7_PINDEX_TO_LONG(lsb, ((uint8_t *) PJI) + (7 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(7))) | lsb; + JU_RET_FOUND_IMM(Pjp, offset); + } +#endif // (JUDY1 && JU_64BIT) + + +// ---------------------------------------------------------------------------- +// OTHER CASES: + + default: JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + + } // SM3Findlimit switch. + + /*NOTREACHED*/ + +} // Judy1Prev() / Judy1Next() / JudyLPrev() / JudyLNext() diff --git a/src/libnetdata/libjudy/src/JudyL/JudyLNextEmpty.c b/src/libnetdata/libjudy/src/JudyL/JudyLNextEmpty.c new file mode 100644 index 00000000..4da43565 --- /dev/null +++ b/src/libnetdata/libjudy/src/JudyL/JudyLNextEmpty.c @@ -0,0 +1,1390 @@ +// Copyright (C) 2000 - 2002 Hewlett-Packard Company +// +// This program is free software; you can redistribute it and/or modify it +// under the term of the GNU Lesser General Public License as published by the +// Free Software Foundation; either version 2 of the License, or (at your +// option) any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License +// for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this program; if not, write to the Free Software Foundation, +// Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// _________________ + +// @(#) $Revision: 4.32 $ $Source: /judy/src/JudyCommon/JudyPrevNextEmpty.c $ +// +// Judy*PrevEmpty() and Judy*NextEmpty() functions for Judy1 and JudyL. +// Compile with one of -DJUDY1 or -DJUDYL. +// +// Compile with -DJUDYNEXT for the Judy*NextEmpty() function; otherwise +// defaults to Judy*PrevEmpty(). +// +// Compile with -DTRACEJPSE to trace JP traversals. +// +// This file is separate from JudyPrevNext.c because it differs too greatly for +// ifdefs. This might be a bit surprising, but there are two reasons: +// +// - First, down in the details, searching for an empty index (SearchEmpty) is +// remarkably asymmetric with searching for a valid index (SearchValid), +// mainly with respect to: No return of a value area for JudyL; partially- +// full versus totally-full JPs; and handling of narrow pointers. +// +// - Second, we chose to implement SearchEmpty without a backtrack stack or +// backtrack engine, partly as an experiment, and partly because we think +// restarting from the top of the tree is less likely for SearchEmpty than +// for SearchValid, because empty indexes are more likely than valid indexes. +// +// A word about naming: A prior version of this feature (see 4.13) was named +// Judy*Free(), but there were concerns about that being read as a verb rather +// than an adjective. After prolonged debate and based on user input, we +// changed "Free" to "Empty". + +#if (! (defined(JUDY1) || defined(JUDYL))) +#error: One of -DJUDY1 or -DJUDYL must be specified. 
+#endif + +#ifndef JUDYNEXT +#ifndef JUDYPREV +#define JUDYPREV 1 // neither set => use default. +#endif +#endif + +#ifdef JUDY1 +#include "Judy1.h" +#else +#include "JudyL.h" +#endif + +#include "JudyPrivate1L.h" + +#ifdef TRACEJPSE +#include "JudyPrintJP.c" +#endif + + +// **************************************************************************** +// J U D Y 1 P R E V E M P T Y +// J U D Y 1 N E X T E M P T Y +// J U D Y L P R E V E M P T Y +// J U D Y L N E X T E M P T Y +// +// See the manual entry for the API. +// +// OVERVIEW OF Judy*PrevEmpty() / Judy*NextEmpty(): +// +// See also for comparison the equivalent comments in JudyPrevNext.c. +// +// Take the callers *PIndex and subtract/add 1, but watch out for +// underflow/overflow, which means "no previous/next empty index found." Use a +// reentrant switch statement (state machine, see SMGetRestart and +// SMGetContinue) to decode Index, starting with the JRP (PArray), through a +// JPM and branches, if any, down to an immediate or a leaf. Look for Index in +// that immediate or leaf, and if not found (invalid index), return success +// (Index is empty). +// +// This search can result in a dead end where taking a different path is +// required. There are four kinds of dead ends: +// +// BRANCH PRIMARY dead end: Encountering a fully-populated JP for the +// appropriate digit in Index. Search sideways in the branch for the +// previous/next absent/null/non-full JP, and if one is found, set Index to the +// highest/lowest index possible in that JPs expanse. Then if the JP is an +// absent or null JP, return success; otherwise for a non-full JP, traverse +// through the partially populated JP. +// +// BRANCH SECONDARY dead end: Reaching the end of a branch during a sideways +// search after a branch primary dead end. Set Index to the lowest/highest +// index possible in the whole branchs expanse (one higher/lower than the +// previous/next branchs expanse), then restart at the top of the tree, which +// includes pre-decrementing/incrementing Index (again) and watching for +// underflow/overflow (again). +// +// LEAF PRIMARY dead end: Finding a valid (non-empty) index in an immediate or +// leaf matching Index. Search sideways in the immediate/leaf for the +// previous/next empty index; if found, set *PIndex to match and return success. +// +// LEAF SECONDARY dead end: Reaching the end of an immediate or leaf during a +// sideways search after a leaf primary dead end. Just as for a branch +// secondary dead end, restart at the top of the tree with Index set to the +// lowest/highest index possible in the whole immediate/leafs expanse. +// TBD: If leaf secondary dead end occurs, could shortcut and treat it as a +// branch primary dead end; but this would require remembering the parent +// branchs type and offset (a "one-deep stack"), and also wrestling with +// narrow pointers, at least for leaves (but not for immediates). +// +// Note some ASYMMETRIES between SearchValid and SearchEmpty: +// +// - The SearchValid code, upon descending through a narrow pointer, if Index +// is outside the expanse of the subsidiary node (effectively a secondary +// dead end), must decide whether to backtrack or findlimit. But the +// SearchEmpty code simply returns success (Index is empty). +// +// - Similarly, the SearchValid code, upon finding no previous/next index in +// the expanse of a narrow pointer (again, a secondary dead end), can simply +// start to backtrack at the parent JP. 
But the SearchEmpty code would have +// to first determine whether or not the parent JPs narrow expanse contains +// a previous/next empty index outside the subexpanse. Rather than keeping a +// parent state stack and backtracking this way, upon a secondary dead end, +// the SearchEmpty code simply restarts at the top of the tree, whether or +// not a narrow pointer is involved. Again, see the equivalent comments in +// JudyPrevNext.c for comparison. +// +// This function is written iteratively for speed, rather than recursively. +// +// TBD: Wed like to enhance this function to make successive searches faster. +// This would require saving some previous state, including the previous Index +// returned, and in which leaf it was found. If the next call is for the same +// Index and the array has not been modified, start at the same leaf. This +// should be much easier to implement since this is iterative rather than +// recursive code. + +#ifdef JUDY1 +#ifdef JUDYPREV +FUNCTION int Judy1PrevEmpty +#else +FUNCTION int Judy1NextEmpty +#endif +#else +#ifdef JUDYPREV +FUNCTION int JudyLPrevEmpty +#else +FUNCTION int JudyLNextEmpty +#endif +#endif + ( + Pcvoid_t PArray, // Judy array to search. + Word_t * PIndex, // starting point and result. + PJError_t PJError // optional, for returning error info. + ) +{ + Word_t Index; // fast copy, in a register. + Pjp_t Pjp; // current JP. + Pjbl_t Pjbl; // Pjp->jp_Addr masked and cast to types: + Pjbb_t Pjbb; + Pjbu_t Pjbu; + Pjlb_t Pjlb; + PWord_t Pword; // alternate name for use by GET* macros. + + Word_t digit; // next digit to decode from Index. + Word_t digits; // current state in SM = digits left to decode. + Word_t pop0; // in a leaf. + Word_t pop0mask; // precalculated to avoid variable shifts. + long offset; // within a branch or leaf (can be large). + int subexp; // subexpanse in a bitmap branch. + BITMAPB_t bitposmaskB; // bit in bitmap for bitmap branch. + BITMAPL_t bitposmaskL; // bit in bitmap for bitmap leaf. + Word_t possfullJP1; // JP types for possibly full subexpanses: + Word_t possfullJP2; + Word_t possfullJP3; + + +// ---------------------------------------------------------------------------- +// M A C R O S +// +// These are intended to make the code a bit more readable and less redundant. + + +// CHECK FOR NULL JP: +// +// TBD: In principle this can be reduced (here and in other *.c files) to just +// the latter clause since no Type should ever be below cJU_JPNULL1, but in +// fact some root pointer types can be lower, so for safety do both checks. + +#define JPNULL(Type) (((Type) >= cJU_JPNULL1) && ((Type) <= cJU_JPNULLMAX)) + + +// CHECK FOR A FULL JP: +// +// Given a JP, indicate if it is fully populated. Use digits, pop0mask, and +// possfullJP1..3 in the context. +// +// This is a difficult problem because it requires checking the Pop0 bits for +// all-ones, but the number of bytes depends on the JP type, which is not +// directly related to the parent branchs type or level -- the JPs child +// could be under a narrow pointer (hence not full). The simple answer +// requires switching on or otherwise calculating the JP type, which could be +// slow. Instead, in SMPREPB* precalculate pop0mask and also record in +// possfullJP1..3 the child JP (branch) types that could possibly be full (one +// level down), and use them here. For level-2 branches (with digits == 2), +// the test for a full child depends on Judy1/JudyL. +// +// Note: This cannot be applied to the JP in a JPM because it doesnt have +// enough pop0 digits. 
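For illustration, the all-ones Pop0 test that JPFULL_BRANCH performs can be shown in isolation. The sketch below assumes a plain 64-bit `pop0` word and a level in digits rather than the real jp_t layout and JU_JPDCDPOP0(); `pop0_mask` and `subexpanse_is_full` are hypothetical names, and the real macro additionally checks the JP type against possfullJP1..3 as described above.

#include <stdint.h>
#include <stdio.h>

/* Mask covering the pop0 bytes of a subexpanse one level down, an
 * analog of cJU_POP0MASK(digits - 1). */
static uint64_t pop0_mask(unsigned digits)
{
    unsigned bits = 8u * (digits - 1u);
    return (bits >= 64u) ? ~(uint64_t)0 : (((uint64_t)1 << bits) - 1u);
}

/* A subexpanse is full when its pop0 field is all ones under the mask,
 * that is, population == expanse size (pop0 == size - 1). */
static int subexpanse_is_full(uint64_t pop0, unsigned digits)
{
    uint64_t mask = pop0_mask(digits);
    return ((pop0 ^ ~(uint64_t)0) & mask) == 0; /* (pop0 & mask) == mask */
}

int main(void)
{
    /* A level-3 JP whose child covers 2 digits: 16 pop0 bits must be set. */
    printf("%d\n", subexpanse_is_full(0xFFFF, 3)); /* 1: full */
    printf("%d\n", subexpanse_is_full(0xFFFE, 3)); /* 0: one index missing */
    return 0;
}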
+// +// TBD: JPFULL_BRANCH diligently checks for BranchL or BranchB, where neither +// of those can ever be full as it turns out. Could just check for a BranchU +// at the right level. Also, pop0mask might be overkill, its not used much, +// so perhaps just call cJU_POP0MASK(digits - 1) here? +// +// First, JPFULL_BRANCH checks for a full expanse for a JP whose child can be a +// branch, that is, a JP in a branch at level 3 or higher: + +#define JPFULL_BRANCH(Pjp) \ + ((((JU_JPDCDPOP0(Pjp) ^ cJU_ALLONES) & pop0mask) == 0) \ + && ((JU_JPTYPE(Pjp) == possfullJP1) \ + || (JU_JPTYPE(Pjp) == possfullJP2) \ + || (JU_JPTYPE(Pjp) == possfullJP3))) + +#ifdef JUDY1 +#define JPFULL(Pjp) \ + ((digits == 2) ? \ + (JU_JPTYPE(Pjp) == cJ1_JPFULLPOPU1) : JPFULL_BRANCH(Pjp)) +#else +#define JPFULL(Pjp) \ + ((digits == 2) ? \ + (JU_JPTYPE(Pjp) == cJU_JPLEAF_B1) \ + && (((JU_JPDCDPOP0(Pjp) & cJU_POP0MASK(1)) == cJU_POP0MASK(1))) : \ + JPFULL_BRANCH(Pjp)) +#endif + + +// RETURN SUCCESS: +// +// This hides the need to set *PIndex back to the local value of Index -- use a +// local value for faster operation. Note that the callers *PIndex is ALWAYS +// modified upon success, at least decremented/incremented. + +#define RET_SUCCESS { *PIndex = Index; return(1); } + + +// RETURN A CORRUPTION: + +#define RET_CORRUPT { JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); return(JERRI); } + + +// SEARCH A BITMAP BRANCH: +// +// This is a weak analog of j__udySearchLeaf*() for bitmap branches. Return +// the actual or next-left position, base 0, of Digit in a BITMAPB_t bitmap +// (subexpanse of a full bitmap), also given a Bitposmask for Digit. The +// position is the offset within the set bits. +// +// Unlike j__udySearchLeaf*(), the offset is not returned bit-complemented if +// Digits bit is unset, because the caller can check the bitmap themselves to +// determine that. Also, if Digits bit is unset, the returned offset is to +// the next-left JP or index (including -1), not to the "ideal" position for +// the index = next-right JP or index. +// +// Shortcut and skip calling j__udyCountBitsB() if the bitmap is full, in which +// case (Digit % cJU_BITSPERSUBEXPB) itself is the base-0 offset. + +#define SEARCHBITMAPB(Bitmap,Digit,Bitposmask) \ + (((Bitmap) == cJU_FULLBITMAPB) ? (Digit % cJU_BITSPERSUBEXPB) : \ + j__udyCountBitsB((Bitmap) & JU_MASKLOWERINC(Bitposmask)) - 1) + +#ifdef JUDYPREV +// Equivalent to search for the highest offset in Bitmap, that is, one less +// than the number of bits set: + +#define SEARCHBITMAPMAXB(Bitmap) \ + (((Bitmap) == cJU_FULLBITMAPB) ? cJU_BITSPERSUBEXPB - 1 : \ + j__udyCountBitsB(Bitmap) - 1) +#endif + + +// CHECK DECODE BYTES: +// +// Check Decode bytes in a JP against the equivalent portion of Index. If they +// dont match, Index is outside the subexpanse of a narrow pointer, hence is +// empty. + +#define CHECKDCD(cDigits) \ + if (JU_DCDNOTMATCHINDEX(Index, Pjp, cDigits)) RET_SUCCESS + + +// REVISE REMAINDER OF INDEX: +// +// Put one digit in place in Index and clear/set the lower digits, if any, so +// the resulting Index is at the start/end of an expanse, or just clear/set the +// least digits. +// +// Actually, to make simple use of JU_LEASTBYTESMASK, first clear/set all least +// digits of Index including the digit to be overridden, then set the value of +// that one digit. If Digits == 1 the first operation is redundant, but either +// very fast or even removed by the optimizer. 
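Here is a standalone sketch of the clear-then-override sequence that the following macros implement, under the assumption of a plain 64-bit index; `least_bytes_mask` and `clear_least_digits_d` are illustrative stand-ins for JU_LEASTBYTESMASK and CLEARLEASTDIGITS_D, not the actual definitions.

#include <stdint.h>
#include <stdio.h>

/* Mask covering the low "digits" bytes of an index. */
static uint64_t least_bytes_mask(unsigned digits)
{
    unsigned bits = 8u * digits;
    return (bits >= 64u) ? ~(uint64_t)0 : (((uint64_t)1 << bits) - 1u);
}

/* Clear the low "digits" bytes (moving the index to the start of an
 * expanse), then place "digit" as the new lowest retained byte, the
 * JU_SETDIGIT step described above. */
static uint64_t clear_least_digits_d(uint64_t index, uint8_t digit,
                                     unsigned digits)
{
    index &= ~least_bytes_mask(digits);
    index |= (uint64_t)digit << (8u * (digits - 1u));
    return index;
}

int main(void)
{
    /* Move 0x12345678 to the start of the expanse selected by digit
       0x40 at level 3 (clear 3 low bytes, set byte 2 to 0x40): */
    printf("0x%llx\n",
           (unsigned long long)clear_least_digits_d(0x12345678u, 0x40, 3));
    /* prints 0x12400000 */
    return 0;
}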
+ +#define CLEARLEASTDIGITS(Digits) Index &= ~JU_LEASTBYTESMASK(Digits) +#define SETLEASTDIGITS( Digits) Index |= JU_LEASTBYTESMASK(Digits) + +#define CLEARLEASTDIGITS_D(Digit,Digits) \ + { \ + CLEARLEASTDIGITS(Digits); \ + JU_SETDIGIT(Index, Digit, Digits); \ + } + +#define SETLEASTDIGITS_D(Digit,Digits) \ + { \ + SETLEASTDIGITS(Digits); \ + JU_SETDIGIT(Index, Digit, Digits); \ + } + + +// SET REMAINDER OF INDEX AND THEN RETURN OR CONTINUE: + +#define SET_AND_RETURN(OpLeastDigits,Digit,Digits) \ + { \ + OpLeastDigits(Digit, Digits); \ + RET_SUCCESS; \ + } + +#define SET_AND_CONTINUE(OpLeastDigits,Digit,Digits) \ + { \ + OpLeastDigits(Digit, Digits); \ + goto SMGetContinue; \ + } + + +// PREPARE TO HANDLE A LEAFW OR JP BRANCH IN THE STATE MACHINE: +// +// Extract a state-dependent digit from Index in a "constant" way, then jump to +// common code for multiple cases. +// +// TBD: Should this macro do more, such as preparing variable-shift masks for +// use in CLEARLEASTDIGITS and SETLEASTDIGITS? + +#define SMPREPB(cDigits,Next,PossFullJP1,PossFullJP2,PossFullJP3) \ + digits = (cDigits); \ + digit = JU_DIGITATSTATE(Index, cDigits); \ + pop0mask = cJU_POP0MASK((cDigits) - 1); /* for branchs JPs */ \ + possfullJP1 = (PossFullJP1); \ + possfullJP2 = (PossFullJP2); \ + possfullJP3 = (PossFullJP3); \ + goto Next + +// Variations for specific-level branches and for shorthands: +// +// Note: SMPREPB2 need not initialize possfullJP* because JPFULL does not use +// them for digits == 2, but gcc -Wall isnt quite smart enough to see this, so +// waste a bit of time and space to get rid of the warning: + +#define SMPREPB2(Next) \ + digits = 2; \ + digit = JU_DIGITATSTATE(Index, 2); \ + pop0mask = cJU_POP0MASK(1); /* for branchs JPs */ \ + possfullJP1 = possfullJP2 = possfullJP3 = 0; \ + goto Next + +#define SMPREPB3(Next) SMPREPB(3, Next, cJU_JPBRANCH_L2, \ + cJU_JPBRANCH_B2, \ + cJU_JPBRANCH_U2) +#ifndef JU_64BIT +#define SMPREPBL(Next) SMPREPB(cJU_ROOTSTATE, Next, cJU_JPBRANCH_L3, \ + cJU_JPBRANCH_B3, \ + cJU_JPBRANCH_U3) +#else +#define SMPREPB4(Next) SMPREPB(4, Next, cJU_JPBRANCH_L3, \ + cJU_JPBRANCH_B3, \ + cJU_JPBRANCH_U3) +#define SMPREPB5(Next) SMPREPB(5, Next, cJU_JPBRANCH_L4, \ + cJU_JPBRANCH_B4, \ + cJU_JPBRANCH_U4) +#define SMPREPB6(Next) SMPREPB(6, Next, cJU_JPBRANCH_L5, \ + cJU_JPBRANCH_B5, \ + cJU_JPBRANCH_U5) +#define SMPREPB7(Next) SMPREPB(7, Next, cJU_JPBRANCH_L6, \ + cJU_JPBRANCH_B6, \ + cJU_JPBRANCH_U6) +#define SMPREPBL(Next) SMPREPB(cJU_ROOTSTATE, Next, cJU_JPBRANCH_L7, \ + cJU_JPBRANCH_B7, \ + cJU_JPBRANCH_U7) +#endif + + +// RESTART AFTER SECONDARY DEAD END: +// +// Set Index to the first/last index in the branch or leaf subexpanse and start +// over at the top of the tree. + +#ifdef JUDYPREV +#define SMRESTART(Digits) { CLEARLEASTDIGITS(Digits); goto SMGetRestart; } +#else +#define SMRESTART(Digits) { SETLEASTDIGITS( Digits); goto SMGetRestart; } +#endif + + +// CHECK EDGE OF LEAFS EXPANSE: +// +// Given the LSBs of the lowest/highest valid index in a leaf (or equivalently +// in an immediate JP), the level (index size) of the leaf, and the full index +// to return (as Index in the context) already set to the full index matching +// the lowest/highest one, determine if there is an empty index in the leafs +// expanse below/above the lowest/highest index, which is true if the +// lowest/highest index is not at the "edge" of the leafs expanse based on its +// LSBs. If so, return Index decremented/incremented; otherwise restart at the +// top of the tree. 
+// +// Note: In many cases Index is already at the right spot and calling +// SMRESTART instead of just going directly to SMGetRestart is a bit of +// overkill. +// +// Note: Variable shift occurs if Digits is not a constant. + +#ifdef JUDYPREV +#define LEAF_EDGE(MinIndex,Digits) \ + { \ + if (MinIndex) { --Index; RET_SUCCESS; } \ + SMRESTART(Digits); \ + } +#else +#define LEAF_EDGE(MaxIndex,Digits) \ + { \ + if ((MaxIndex) != JU_LEASTBYTES(cJU_ALLONES, Digits)) \ + { ++Index; RET_SUCCESS; } \ + SMRESTART(Digits); \ + } +#endif + +// Same as above except Index is not already set to match the lowest/highest +// index, so do that before decrementing/incrementing it: + +#ifdef JUDYPREV +#define LEAF_EDGE_SET(MinIndex,Digits) \ + { \ + if (MinIndex) \ + { JU_SETDIGITS(Index, MinIndex, Digits); --Index; RET_SUCCESS; } \ + SMRESTART(Digits); \ + } +#else +#define LEAF_EDGE_SET(MaxIndex,Digits) \ + { \ + if ((MaxIndex) != JU_LEASTBYTES(cJU_ALLONES, Digits)) \ + { JU_SETDIGITS(Index, MaxIndex, Digits); ++Index; RET_SUCCESS; } \ + SMRESTART(Digits); \ + } +#endif + + +// FIND A HOLE (EMPTY INDEX) IN AN IMMEDIATE OR LEAF: +// +// Given an index location in a leaf (or equivalently an immediate JP) known to +// contain a usable hole (an empty index less/greater than Index), and the LSBs +// of a minimum/maximum index to locate, find the previous/next empty index and +// return it. +// +// Note: "Even" index sizes (1,2,4[,8] bytes) have corresponding native C +// types; "odd" index sizes dont, but they are not represented here because +// they are handled completely differently; see elsewhere. + +#ifdef JUDYPREV + +#define LEAF_HOLE_EVEN(cDigits,Pjll,IndexLSB) \ + { \ + while (*(Pjll) > (IndexLSB)) --(Pjll); /* too high */ \ + if (*(Pjll) < (IndexLSB)) RET_SUCCESS /* Index is empty */ \ + while (*(--(Pjll)) == --(IndexLSB)) /* null, find a hole */;\ + JU_SETDIGITS(Index, IndexLSB, cDigits); \ + RET_SUCCESS; \ + } +#else +#define LEAF_HOLE_EVEN(cDigits,Pjll,IndexLSB) \ + { \ + while (*(Pjll) < (IndexLSB)) ++(Pjll); /* too low */ \ + if (*(Pjll) > (IndexLSB)) RET_SUCCESS /* Index is empty */ \ + while (*(++(Pjll)) == ++(IndexLSB)) /* null, find a hole */;\ + JU_SETDIGITS(Index, IndexLSB, cDigits); \ + RET_SUCCESS; \ + } +#endif + + +// SEARCH FOR AN EMPTY INDEX IN AN IMMEDIATE OR LEAF: +// +// Given a pointer to the first index in a leaf (or equivalently an immediate +// JP), the population of the leaf, and a first empty Index to find (inclusive, +// as Index in the context), where Index is known to fall within the expanse of +// the leaf to search, efficiently find the previous/next empty index in the +// leaf, if any. For simplicity the following overview is stated in terms of +// Judy*NextEmpty() only, but the same concepts apply symmetrically for +// Judy*PrevEmpty(). Also, in each case the comparisons are for the LSBs of +// Index and leaf indexes, according to the leafs level. +// +// 1. If Index is GREATER than the last (highest) index in the leaf +// (maxindex), return success, Index is empty. (Remember, Index is known +// to be in the leafs expanse.) +// +// 2. If Index is EQUAL to maxindex: If maxindex is not at the edge of the +// leafs expanse, increment Index and return success, there is an empty +// Index one higher than any in the leaf; otherwise restart with Index +// reset to the upper edge of the leafs expanse. Note: This might cause +// an extra cache line fill, but this is OK for repeatedly-called search +// code, and it saves CPU time. +// +// 3. 
If Index is LESS than maxindex, check for "dense to end of leaf": +// Subtract Index from maxindex, and back up that many slots in the leaf. +// If the resulting offset is not before the start of the leaf then compare +// the index at this offset (baseindex) with Index: +// +// 3a. If GREATER, the leaf must be corrupt, since indexes are sorted and +// there are no duplicates. +// +// 3b. If EQUAL, the leaf is "dense" from Index to maxindex, meaning there is +// no reason to search it. "Slide right" to the high end of the leaf +// (modify Index to maxindex) and continue with step 2 above. +// +// 3c. If LESS, continue with step 4. +// +// 4. If the offset based on maxindex minus Index falls BEFORE the start of +// the leaf, or if, per 3c above, baseindex is LESS than Index, the leaf is +// guaranteed "not dense to the end" and a usable empty Index must exist. +// This supports a more efficient search loop. Start at the FIRST index in +// the leaf, or one BEYOND baseindex, respectively, and search the leaf as +// follows, comparing each current index (currindex) with Index: +// +// 4a. If LESS, keep going to next index. Note: This is certain to terminate +// because maxindex is known to be greater than Index, hence the loop can +// be small and fast. +// +// 4b. If EQUAL, loop and increment Index until finding currindex greater than +// Index, and return success with the modified Index. +// +// 4c. If GREATER, return success, Index (unmodified) is empty. +// +// Note: These are macros rather than functions for speed. + +#ifdef JUDYPREV + +#define JSLE_EVEN(Addr,Pop0,cDigits,LeafType) \ + { \ + LeafType * PjllLSB = (LeafType *) (Addr); \ + LeafType IndexLSB = Index; /* auto-masking */ \ + \ + /* Index before or at start of leaf: */ \ + \ + if (*PjllLSB >= IndexLSB) /* no need to search */ \ + { \ + if (*PjllLSB > IndexLSB) RET_SUCCESS; /* Index empty */ \ + LEAF_EDGE(*PjllLSB, cDigits); \ + } \ + \ + /* Index in or after leaf: */ \ + \ + offset = IndexLSB - *PjllLSB; /* tentative offset */ \ + if (offset <= (Pop0)) /* can check density */ \ + { \ + PjllLSB += offset; /* move to slot */ \ + \ + if (*PjllLSB <= IndexLSB) /* dense or corrupt */ \ + { \ + if (*PjllLSB == IndexLSB) /* dense, check edge */ \ + LEAF_EDGE_SET(PjllLSB[-offset], cDigits); \ + RET_CORRUPT; \ + } \ + --PjllLSB; /* not dense, start at previous */ \ + } \ + else PjllLSB = ((LeafType *) (Addr)) + (Pop0); /* start at max */ \ + \ + LEAF_HOLE_EVEN(cDigits, PjllLSB, IndexLSB); \ + } + +// JSLE_ODD is completely different from JSLE_EVEN because its important to +// minimize copying odd indexes to compare them (see 4.14). Furthermore, a +// very complex version (4.17, but abandoned before fully debugged) that +// avoided calling j__udySearchLeaf*() ran twice as fast as 4.14, but still +// half as fast as SearchValid. Doug suggested that to minimize complexity and +// share common code we should use j__udySearchLeaf*() for the initial search +// to establish if Index is empty, which should be common. If Index is valid +// in a leaf or immediate indexes, odds are good that an empty Index is nearby, +// so for simplicity just use a *COPY* function to linearly search the +// remainder. +// +// TBD: Pathological case? Average performance should be good, but worst-case +// might suffer. When Search says the initial Index is valid, so a linear +// copy-and-compare is begun, if the caller builds fairly large leaves with +// dense clusters AND frequently does a SearchEmpty at one end of such a +// cluster, performance wont be very good. 
Might a dense-check help? This +// means checking offset against the index at offset, and then against the +// first/last index in the leaf. We doubt the pathological case will appear +// much in real applications because they will probably alternate SearchValid +// and SearchEmpty calls. + +#define JSLE_ODD(cDigits,Pjll,Pop0,Search,Copy) \ + { \ + Word_t IndexLSB; /* least bytes only */ \ + Word_t IndexFound; /* in leaf */ \ + \ + if ((offset = Search(Pjll, (Pop0) + 1, Index)) < 0) \ + RET_SUCCESS; /* Index is empty */ \ + \ + IndexLSB = JU_LEASTBYTES(Index, cDigits); \ + offset *= (cDigits); \ + \ + while ((offset -= (cDigits)) >= 0) \ + { /* skip until empty or start */ \ + Copy(IndexFound, ((uint8_t *) (Pjll)) + offset); \ + if (IndexFound != (--IndexLSB)) /* found an empty */ \ + { JU_SETDIGITS(Index, IndexLSB, cDigits); RET_SUCCESS; }\ + } \ + LEAF_EDGE_SET(IndexLSB, cDigits); \ + } + +#else // JUDYNEXT + +#define JSLE_EVEN(Addr,Pop0,cDigits,LeafType) \ + { \ + LeafType * PjllLSB = ((LeafType *) (Addr)) + (Pop0); \ + LeafType IndexLSB = Index; /* auto-masking */ \ + \ + /* Index at or after end of leaf: */ \ + \ + if (*PjllLSB <= IndexLSB) /* no need to search */ \ + { \ + if (*PjllLSB < IndexLSB) RET_SUCCESS; /* Index empty */\ + LEAF_EDGE(*PjllLSB, cDigits); \ + } \ + \ + /* Index before or in leaf: */ \ + \ + offset = *PjllLSB - IndexLSB; /* tentative offset */ \ + if (offset <= (Pop0)) /* can check density */ \ + { \ + PjllLSB -= offset; /* move to slot */ \ + \ + if (*PjllLSB >= IndexLSB) /* dense or corrupt */ \ + { \ + if (*PjllLSB == IndexLSB) /* dense, check edge */ \ + LEAF_EDGE_SET(PjllLSB[offset], cDigits); \ + RET_CORRUPT; \ + } \ + ++PjllLSB; /* not dense, start at next */ \ + } \ + else PjllLSB = (LeafType *) (Addr); /* start at minimum */ \ + \ + LEAF_HOLE_EVEN(cDigits, PjllLSB, IndexLSB); \ + } + +#define JSLE_ODD(cDigits,Pjll,Pop0,Search,Copy) \ + { \ + Word_t IndexLSB; /* least bytes only */ \ + Word_t IndexFound; /* in leaf */ \ + int offsetmax; /* in bytes */ \ + \ + if ((offset = Search(Pjll, (Pop0) + 1, Index)) < 0) \ + RET_SUCCESS; /* Index is empty */ \ + \ + IndexLSB = JU_LEASTBYTES(Index, cDigits); \ + offset *= (cDigits); \ + offsetmax = (Pop0) * (cDigits); /* single multiply */ \ + \ + while ((offset += (cDigits)) <= offsetmax) \ + { /* skip until empty or end */ \ + Copy(IndexFound, ((uint8_t *) (Pjll)) + offset); \ + if (IndexFound != (++IndexLSB)) /* found an empty */ \ + { JU_SETDIGITS(Index, IndexLSB, cDigits); RET_SUCCESS; } \ + } \ + LEAF_EDGE_SET(IndexLSB, cDigits); \ + } + +#endif // JUDYNEXT + +// Note: Immediate indexes never fill a single index group, so for odd index +// sizes, save time by calling JSLE_ODD_IMM instead of JSLE_ODD. 
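The even-index search that the JSLE_EVEN macros implement can be illustrated standalone. The sketch below follows steps 1 through 4 above for the Next direction over a sorted `uint16_t` leaf; `next_empty_u16` is a hypothetical helper, and instead of restarting at the top of the tree it simply reports that the caller must do so.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Return 1 and set *hole to the first empty value >= start, or 0 when
 * the leaf is dense from start up to the edge of a full 16-bit expanse
 * (the caller would then restart higher in the tree). */
static int next_empty_u16(const uint16_t *leaf, long pop0, uint16_t start,
                          uint16_t *hole)
{
    const uint16_t *p = leaf + pop0;        /* maxindex slot. */

    if (*p <= start) {                      /* steps 1/2: at or past max. */
        if (*p < start) { *hole = start; return 1; }
        if (*p == 0xFFFF) return 0;         /* max at expanse edge. */
        *hole = (uint16_t)(*p + 1); return 1;
    }
    long offset = (long)(*p - start);       /* step 3: dense-to-end check. */
    if (offset <= pop0) {
        p -= offset;
        assert(*p <= start);                /* sorted + unique guarantees it. */
        if (*p == start) {                  /* 3b: dense run start..max. */
            uint16_t max = leaf[pop0];
            if (max == 0xFFFF) return 0;
            *hole = (uint16_t)(max + 1); return 1;
        }
        ++p;                                /* 3c/4: scan after baseindex. */
    } else {
        p = leaf;                           /* 4: scan from the first index. */
    }
    while (*p < start) ++p;                 /* 4a: max > start, so it stops. */
    if (*p > start) { *hole = start; return 1; }  /* 4c: start is empty. */
    while (*++p == ++start) ;               /* 4b: skip the dense cluster. */
    *hole = start; return 1;
}

int main(void)
{
    uint16_t leaf[] = { 3, 4, 5, 9 };       /* pop0 == 3. */
    uint16_t hole;
    if (next_empty_u16(leaf, 3, 4, &hole))  /* 4 is valid; 6 is the hole. */
        printf("%u\n", hole);               /* prints 6 */
    return 0;
}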
+ +#define j__udySearchLeafEmpty1(Addr,Pop0) \ + JSLE_EVEN(Addr, Pop0, 1, uint8_t) + +#define j__udySearchLeafEmpty2(Addr,Pop0) \ + JSLE_EVEN(Addr, Pop0, 2, uint16_t) + +#define j__udySearchLeafEmpty3(Addr,Pop0) \ + JSLE_ODD(3, Addr, Pop0, j__udySearchLeaf3, JU_COPY3_PINDEX_TO_LONG) + +#ifndef JU_64BIT + +#define j__udySearchLeafEmptyL(Addr,Pop0) \ + JSLE_EVEN(Addr, Pop0, 4, Word_t) + +#else + +#define j__udySearchLeafEmpty4(Addr,Pop0) \ + JSLE_EVEN(Addr, Pop0, 4, uint32_t) + +#define j__udySearchLeafEmpty5(Addr,Pop0) \ + JSLE_ODD(5, Addr, Pop0, j__udySearchLeaf5, JU_COPY5_PINDEX_TO_LONG) + +#define j__udySearchLeafEmpty6(Addr,Pop0) \ + JSLE_ODD(6, Addr, Pop0, j__udySearchLeaf6, JU_COPY6_PINDEX_TO_LONG) + +#define j__udySearchLeafEmpty7(Addr,Pop0) \ + JSLE_ODD(7, Addr, Pop0, j__udySearchLeaf7, JU_COPY7_PINDEX_TO_LONG) + +#define j__udySearchLeafEmptyL(Addr,Pop0) \ + JSLE_EVEN(Addr, Pop0, 8, Word_t) + +#endif // JU_64BIT + + +// ---------------------------------------------------------------------------- +// START OF CODE: +// +// CHECK FOR SHORTCUTS: +// +// Error out if PIndex is null. + + if (PIndex == (PWord_t) NULL) + { + JU_SET_ERRNO(PJError, JU_ERRNO_NULLPINDEX); + return(JERRI); + } + + Index = *PIndex; // fast local copy. + +// Set and pre-decrement/increment Index, watching for underflow/overflow: +// +// An out-of-bounds Index means failure: No previous/next empty index. + +SMGetRestart: // return here with revised Index. + +#ifdef JUDYPREV + if (Index-- == 0) return(0); +#else + if (++Index == 0) return(0); +#endif + +// An empty array with an in-bounds (not underflowed/overflowed) Index means +// success: +// +// Note: This check is redundant after restarting at SMGetRestart, but should +// take insignificant time. + + if (PArray == (Pvoid_t) NULL) RET_SUCCESS; + +// ---------------------------------------------------------------------------- +// ROOT-LEVEL LEAF that starts with a Pop0 word; just look within the leaf: +// +// If Index is not in the leaf, return success; otherwise return the first +// empty Index, if any, below/above where it would belong. + + if (JU_LEAFW_POP0(PArray) < cJU_LEAFW_MAXPOP1) // must be a LEAFW + { + Pjlw_t Pjlw = P_JLW(PArray); // first word of leaf. + pop0 = Pjlw[0]; + +#ifdef JUDY1 + if (pop0 == 0) // special case. + { +#ifdef JUDYPREV + if ((Index != Pjlw[1]) || (Index-- != 0)) RET_SUCCESS; +#else + if ((Index != Pjlw[1]) || (++Index != 0)) RET_SUCCESS; +#endif + return(0); // no previous/next empty index. + } +#endif // JUDY1 + + j__udySearchLeafEmptyL(Pjlw + 1, pop0); + +// No return -- thanks ALAN + + } + else + +// ---------------------------------------------------------------------------- +// HANDLE JRP Branch: +// +// For JRP branches, traverse the JPM; handle LEAFW +// directly; but look for the most common cases first. + + { + Pjpm_t Pjpm = P_JPM(PArray); + Pjp = &(Pjpm->jpm_JP); + +// goto SMGetContinue; + } + + +// ============================================================================ +// STATE MACHINE -- GET INDEX: +// +// Search for Index (already decremented/incremented so as to be an inclusive +// search). If not found (empty index), return success. Otherwise do a +// previous/next search, and if successful modify Index to the empty index +// found. See function header comments. +// +// ENTRY: Pjp points to next JP to interpret, whose Decode bytes have not yet +// been checked. +// +// Note: Check Decode bytes at the start of each loop, not after looking up a +// new JP, so its easy to do constant shifts/masks. 
+// +// EXIT: Return, or branch to SMGetRestart with modified Index, or branch to +// SMGetContinue with a modified Pjp, as described elsewhere. +// +// WARNING: For run-time efficiency the following cases replicate code with +// varying constants, rather than using common code with variable values! + +SMGetContinue: // return here for next branch/leaf. + +#ifdef TRACEJPSE + JudyPrintJP(Pjp, "sf", __LINE__); +#endif + + switch (JU_JPTYPE(Pjp)) + { + + +// ---------------------------------------------------------------------------- +// LINEAR BRANCH: +// +// Check Decode bytes, if any, in the current JP, then search for a JP for the +// next digit in Index. + + case cJU_JPBRANCH_L2: CHECKDCD(2); SMPREPB2(SMBranchL); + case cJU_JPBRANCH_L3: CHECKDCD(3); SMPREPB3(SMBranchL); +#ifdef JU_64BIT + case cJU_JPBRANCH_L4: CHECKDCD(4); SMPREPB4(SMBranchL); + case cJU_JPBRANCH_L5: CHECKDCD(5); SMPREPB5(SMBranchL); + case cJU_JPBRANCH_L6: CHECKDCD(6); SMPREPB6(SMBranchL); + case cJU_JPBRANCH_L7: CHECKDCD(7); SMPREPB7(SMBranchL); +#endif + case cJU_JPBRANCH_L: SMPREPBL(SMBranchL); + +// Common code (state-independent) for all cases of linear branches: + +SMBranchL: + Pjbl = P_JBL(Pjp->jp_Addr); + +// First, check if Indexs expanse (digit) is below/above the first/last +// populated expanse in the BranchL, in which case Index is empty; otherwise +// find the offset of the lowest/highest populated expanse at or above/below +// digit, if any: +// +// Note: The for-loop is guaranteed to exit eventually because the first/last +// expanse is known to be a terminator. +// +// Note: Cannot use j__udySearchLeaf*Empty1() here because it only applies to +// leaves and does not know about partial versus full JPs, unlike the use of +// j__udySearchLeaf1() for BranchLs in SearchValid code. Also, since linear +// leaf expanse lists are small, dont waste time calling j__udySearchLeaf1(), +// just scan the expanse list. + +#ifdef JUDYPREV + if ((Pjbl->jbl_Expanse[0]) > digit) RET_SUCCESS; + + for (offset = (Pjbl->jbl_NumJPs) - 1; /* null */; --offset) +#else + if ((Pjbl->jbl_Expanse[(Pjbl->jbl_NumJPs) - 1]) < digit) + RET_SUCCESS; + + for (offset = 0; /* null */; ++offset) +#endif + { + +// Too low/high, keep going; or too high/low, meaning the loop passed a hole +// and the initial Index is empty: + +#ifdef JUDYPREV + if ((Pjbl->jbl_Expanse[offset]) > digit) continue; + if ((Pjbl->jbl_Expanse[offset]) < digit) RET_SUCCESS; +#else + if ((Pjbl->jbl_Expanse[offset]) < digit) continue; + if ((Pjbl->jbl_Expanse[offset]) > digit) RET_SUCCESS; +#endif + +// Found expanse matching digit; if its not full, traverse through it: + + if (! JPFULL((Pjbl->jbl_jp) + offset)) + { + Pjp = (Pjbl->jbl_jp) + offset; + goto SMGetContinue; + } + +// Common code: While searching for a lower/higher hole or a non-full JP, upon +// finding a lower/higher hole, adjust Index using the revised digit and +// return; or upon finding a consecutive lower/higher expanse, if the expanses +// JP is non-full, modify Index and traverse through the JP: + +#define BRANCHL_CHECK(OpIncDec,OpLeastDigits,Digit,Digits) \ + { \ + if ((Pjbl->jbl_Expanse[offset]) != OpIncDec digit) \ + SET_AND_RETURN(OpLeastDigits, Digit, Digits); \ + \ + if (! 
JPFULL((Pjbl->jbl_jp) + offset)) \ + { \ + Pjp = (Pjbl->jbl_jp) + offset; \ + SET_AND_CONTINUE(OpLeastDigits, Digit, Digits); \ + } \ + } + +// BranchL primary dead end: Expanse matching Index/digit is full (rare except +// for dense/sequential indexes): +// +// Search for a lower/higher hole, a non-full JP, or the end of the expanse +// list, while decrementing/incrementing digit. + +#ifdef JUDYPREV + while (--offset >= 0) + BRANCHL_CHECK(--, SETLEASTDIGITS_D, digit, digits) +#else + while (++offset < Pjbl->jbl_NumJPs) + BRANCHL_CHECK(++, CLEARLEASTDIGITS_D, digit, digits) +#endif + +// Passed end of BranchL expanse list after finding a matching but full +// expanse: +// +// Digit now matches the lowest/highest expanse, which is a full expanse; if +// digit is at the end of BranchLs expanse (no hole before/after), break out +// of the loop; otherwise modify Index to the next lower/higher digit and +// return success: + +#ifdef JUDYPREV + if (digit == 0) break; + --digit; SET_AND_RETURN(SETLEASTDIGITS_D, digit, digits); +#else + if (digit == JU_LEASTBYTES(cJU_ALLONES, 1)) break; + ++digit; SET_AND_RETURN(CLEARLEASTDIGITS_D, digit, digits); +#endif + } // for-loop + +// BranchL secondary dead end, no non-full previous/next JP: + + SMRESTART(digits); + + +// ---------------------------------------------------------------------------- +// BITMAP BRANCH: +// +// Check Decode bytes, if any, in the current JP, then search for a JP for the +// next digit in Index. + + case cJU_JPBRANCH_B2: CHECKDCD(2); SMPREPB2(SMBranchB); + case cJU_JPBRANCH_B3: CHECKDCD(3); SMPREPB3(SMBranchB); +#ifdef JU_64BIT + case cJU_JPBRANCH_B4: CHECKDCD(4); SMPREPB4(SMBranchB); + case cJU_JPBRANCH_B5: CHECKDCD(5); SMPREPB5(SMBranchB); + case cJU_JPBRANCH_B6: CHECKDCD(6); SMPREPB6(SMBranchB); + case cJU_JPBRANCH_B7: CHECKDCD(7); SMPREPB7(SMBranchB); +#endif + case cJU_JPBRANCH_B: SMPREPBL(SMBranchB); + +// Common code (state-independent) for all cases of bitmap branches: + +SMBranchB: + Pjbb = P_JBB(Pjp->jp_Addr); + +// Locate the digits JP in the subexpanse list, if present: + + subexp = digit / cJU_BITSPERSUBEXPB; + assert(subexp < cJU_NUMSUBEXPB); // falls in expected range. + bitposmaskB = JU_BITPOSMASKB(digit); + +// Absent JP = no JP matches current digit in Index: + +// if (! JU_BITMAPTESTB(Pjbb, digit)) // slower. + if (! (JU_JBB_BITMAP(Pjbb, subexp) & bitposmaskB)) // faster. + RET_SUCCESS; + +// Non-full JP matches current digit in Index: +// +// Iterate to the subsidiary non-full JP. + + offset = SEARCHBITMAPB(JU_JBB_BITMAP(Pjbb, subexp), digit, + bitposmaskB); + // not negative since at least one bit is set: + assert(offset >= 0); + assert(offset < (int) cJU_BITSPERSUBEXPB); + +// Watch for null JP subarray pointer with non-null bitmap (a corruption): + + if ((Pjp = P_JP(JU_JBB_PJP(Pjbb, subexp))) + == (Pjp_t) NULL) RET_CORRUPT; + + Pjp += offset; + if (! JPFULL(Pjp)) goto SMGetContinue; + +// BranchB primary dead end: +// +// Upon hitting a full JP in a BranchB for the next digit in Index, search +// sideways for a previous/next absent JP (unset bit) or non-full JP (set bit +// with non-full JP); first in the current bitmap subexpanse, then in +// lower/higher subexpanses. Upon entry, Pjp points to a known-unusable JP, +// ready to decrement/increment. +// +// Note: The preceding code is separate from this loop because Index does not +// need revising (see SET_AND_*()) if the initial index is an empty index. 
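The offset arithmetic behind this sideways bitmap search (and behind SEARCHBITMAPB above) is a population count of the bits at or below a digit's position. A small portable sketch follows, assuming one 32-bit subexpanse word; `popcount32` and `bitmap_offset` are illustrative names, and a real build would likely use a hardware popcount.

#include <stdint.h>
#include <stdio.h>

static int popcount32(uint32_t w)           /* portable popcount. */
{
    int n = 0;
    while (w) { w &= w - 1u; ++n; }
    return n;
}

/* Offset (base 0) of "digit" within the set bits of one 32-bit
 * subexpanse bitmap; if the digit's own bit is clear, this is the
 * next-left position (possibly -1), as the SEARCHBITMAPB comment
 * above describes. */
static int bitmap_offset(uint32_t bitmap, unsigned digit_in_subexp)
{
    uint32_t below_inc = (digit_in_subexp == 31u)
                       ? ~(uint32_t)0
                       : (((uint32_t)1 << (digit_in_subexp + 1u)) - 1u);
    return popcount32(bitmap & below_inc) - 1;
}

int main(void)
{
    uint32_t bm = 0x00000116u;              /* bits 1, 2, 4, 8 set. */
    printf("%d\n", bitmap_offset(bm, 4));   /* 2: third set bit. */
    printf("%d\n", bitmap_offset(bm, 3));   /* 1: bit 3 clear, next-left. */
    printf("%d\n", bitmap_offset(bm, 0));   /* -1: nothing at or below. */
    return 0;
}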
+// +// TBD: For speed, shift bitposmaskB instead of using JU_BITMAPTESTB or +// JU_BITPOSMASKB, but this shift has knowledge of bit order that really should +// be encapsulated in a header file. + +#define BRANCHB_CHECKBIT(OpLeastDigits) \ + if (! (JU_JBB_BITMAP(Pjbb, subexp) & bitposmaskB)) /* absent JP */ \ + SET_AND_RETURN(OpLeastDigits, digit, digits) + +#define BRANCHB_CHECKJPFULL(OpLeastDigits) \ + if (! JPFULL(Pjp)) \ + SET_AND_CONTINUE(OpLeastDigits, digit, digits) + +#define BRANCHB_STARTSUBEXP(OpLeastDigits) \ + if (! JU_JBB_BITMAP(Pjbb, subexp)) /* empty subexpanse, shortcut */ \ + SET_AND_RETURN(OpLeastDigits, digit, digits) \ + if ((Pjp = P_JP(JU_JBB_PJP(Pjbb, subexp))) == (Pjp_t) NULL) RET_CORRUPT + +#ifdef JUDYPREV + + --digit; // skip initial digit. + bitposmaskB >>= 1; // see TBD above. + +BranchBNextSubexp: // return here to check next bitmap subexpanse. + + while (bitposmaskB) // more bits to check in subexp. + { + BRANCHB_CHECKBIT(SETLEASTDIGITS_D); + --Pjp; // previous in subarray. + BRANCHB_CHECKJPFULL(SETLEASTDIGITS_D); + assert(digit >= 0); + --digit; + bitposmaskB >>= 1; + } + + if (subexp-- > 0) // more subexpanses. + { + BRANCHB_STARTSUBEXP(SETLEASTDIGITS_D); + Pjp += SEARCHBITMAPMAXB(JU_JBB_BITMAP(Pjbb, subexp)) + 1; + bitposmaskB = (1U << (cJU_BITSPERSUBEXPB - 1)); + goto BranchBNextSubexp; + } + +#else // JUDYNEXT + + ++digit; // skip initial digit. + bitposmaskB <<= 1; // note: BITMAPB_t. + +BranchBNextSubexp: // return here to check next bitmap subexpanse. + + while (bitposmaskB) // more bits to check in subexp. + { + BRANCHB_CHECKBIT(CLEARLEASTDIGITS_D); + ++Pjp; // previous in subarray. + BRANCHB_CHECKJPFULL(CLEARLEASTDIGITS_D); + assert(digit < cJU_SUBEXPPERSTATE); + ++digit; + bitposmaskB <<= 1; // note: BITMAPB_t. + } + + if (++subexp < cJU_NUMSUBEXPB) // more subexpanses. + { + BRANCHB_STARTSUBEXP(CLEARLEASTDIGITS_D); + --Pjp; // pre-decrement. + bitposmaskB = 1; + goto BranchBNextSubexp; + } + +#endif // JUDYNEXT + +// BranchB secondary dead end, no non-full previous/next JP: + + SMRESTART(digits); + + +// ---------------------------------------------------------------------------- +// UNCOMPRESSED BRANCH: +// +// Check Decode bytes, if any, in the current JP, then search for a JP for the +// next digit in Index. + + case cJU_JPBRANCH_U2: CHECKDCD(2); SMPREPB2(SMBranchU); + case cJU_JPBRANCH_U3: CHECKDCD(3); SMPREPB3(SMBranchU); +#ifdef JU_64BIT + case cJU_JPBRANCH_U4: CHECKDCD(4); SMPREPB4(SMBranchU); + case cJU_JPBRANCH_U5: CHECKDCD(5); SMPREPB5(SMBranchU); + case cJU_JPBRANCH_U6: CHECKDCD(6); SMPREPB6(SMBranchU); + case cJU_JPBRANCH_U7: CHECKDCD(7); SMPREPB7(SMBranchU); +#endif + case cJU_JPBRANCH_U: SMPREPBL(SMBranchU); + +// Common code (state-independent) for all cases of uncompressed branches: + +SMBranchU: + Pjbu = P_JBU(Pjp->jp_Addr); + Pjp = (Pjbu->jbu_jp) + digit; + +// Absent JP = null JP for current digit in Index: + + if (JPNULL(JU_JPTYPE(Pjp))) RET_SUCCESS; + +// Non-full JP matches current digit in Index: +// +// Iterate to the subsidiary JP. + + if (! JPFULL(Pjp)) goto SMGetContinue; + +// BranchU primary dead end: +// +// Upon hitting a full JP in a BranchU for the next digit in Index, search +// sideways for a previous/next null or non-full JP. BRANCHU_CHECKJP() is +// shorthand for common code. +// +// Note: The preceding code is separate from this loop because Index does not +// need revising (see SET_AND_*()) if the initial index is an empty index. 
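A toy version of the BranchU sideways search that the following macro performs: walk the fixed 256-slot child array away from the starting digit until either an absent child (the index there is empty) or a partially full child (traverse into it) appears, else report a secondary dead end. `child_kind` and `branchu_next_usable` are illustrative inventions, not Judy types.

#include <stdio.h>

enum child_kind { CHILD_NULL, CHILD_PARTIAL, CHILD_FULL };

/* Return the first slot right of "digit" that is not full: a CHILD_NULL
 * slot means an empty index exists in that expanse (return success), a
 * CHILD_PARTIAL slot means the caller traverses into it; -1 means the
 * whole branch was full sideways (secondary dead end, so the caller
 * restarts at the top of the tree, as SMRESTART does above). */
static int branchu_next_usable(const enum child_kind child[256], int digit)
{
    while (++digit < 256) {
        if (child[digit] != CHILD_FULL)
            return digit;
    }
    return -1;
}

int main(void)
{
    enum child_kind child[256] = { CHILD_NULL };   /* all slots null... */
    child[5] = CHILD_FULL;
    child[6] = CHILD_FULL;                         /* ...except a full run. */
    printf("%d\n", branchu_next_usable(child, 5)); /* prints 7: a hole. */
    return 0;
}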
+ +#define BRANCHU_CHECKJP(OpIncDec,OpLeastDigits) \ + { \ + OpIncDec Pjp; \ + \ + if (JPNULL(JU_JPTYPE(Pjp))) \ + SET_AND_RETURN(OpLeastDigits, digit, digits) \ + \ + if (! JPFULL(Pjp)) \ + SET_AND_CONTINUE(OpLeastDigits, digit, digits) \ + } + +#ifdef JUDYPREV + while (digit-- > 0) + BRANCHU_CHECKJP(--, SETLEASTDIGITS_D); +#else + while (++digit < cJU_BRANCHUNUMJPS) + BRANCHU_CHECKJP(++, CLEARLEASTDIGITS_D); +#endif + +// BranchU secondary dead end, no non-full previous/next JP: + + SMRESTART(digits); + + +// ---------------------------------------------------------------------------- +// LINEAR LEAF: +// +// Check Decode bytes, if any, in the current JP, then search the leaf for the +// previous/next empty index starting at Index. Primary leaf dead end is +// hidden within j__udySearchLeaf*Empty*(). In case of secondary leaf dead +// end, restart at the top of the tree. +// +// Note: Pword is the name known to GET*; think of it as Pjlw. + +#define SMLEAFL(cDigits,Func) \ + Pword = (PWord_t) P_JLW(Pjp->jp_Addr); \ + pop0 = JU_JPLEAF_POP0(Pjp); \ + Func(Pword, pop0) + +#if (defined(JUDYL) || (! defined(JU_64BIT))) + case cJU_JPLEAF1: CHECKDCD(1); SMLEAFL(1, j__udySearchLeafEmpty1); +#endif + case cJU_JPLEAF2: CHECKDCD(2); SMLEAFL(2, j__udySearchLeafEmpty2); + case cJU_JPLEAF3: CHECKDCD(3); SMLEAFL(3, j__udySearchLeafEmpty3); + +#ifdef JU_64BIT + case cJU_JPLEAF4: CHECKDCD(4); SMLEAFL(4, j__udySearchLeafEmpty4); + case cJU_JPLEAF5: CHECKDCD(5); SMLEAFL(5, j__udySearchLeafEmpty5); + case cJU_JPLEAF6: CHECKDCD(6); SMLEAFL(6, j__udySearchLeafEmpty6); + case cJU_JPLEAF7: CHECKDCD(7); SMLEAFL(7, j__udySearchLeafEmpty7); +#endif + + +// ---------------------------------------------------------------------------- +// BITMAP LEAF: +// +// Check Decode bytes, if any, in the current JP, then search the leaf for the +// previous/next empty index starting at Index. + + case cJU_JPLEAF_B1: + + CHECKDCD(1); + + Pjlb = P_JLB(Pjp->jp_Addr); + digit = JU_DIGITATSTATE(Index, 1); + subexp = digit / cJU_BITSPERSUBEXPL; + bitposmaskL = JU_BITPOSMASKL(digit); + assert(subexp < cJU_NUMSUBEXPL); // falls in expected range. + +// Absent index = no index matches current digit in Index: + +// if (! JU_BITMAPTESTL(Pjlb, digit)) // slower. + if (! (JU_JLB_BITMAP(Pjlb, subexp) & bitposmaskL)) // faster. + RET_SUCCESS; + +// LeafB1 primary dead end: +// +// Upon hitting a valid (non-empty) index in a LeafB1 for the last digit in +// Index, search sideways for a previous/next absent index, first in the +// current bitmap subexpanse, then in lower/higher subexpanses. +// LEAFB1_CHECKBIT() is shorthand for common code to handle one bit in one +// bitmap subexpanse. +// +// Note: The preceding code is separate from this loop because Index does not +// need revising (see SET_AND_*()) if the initial index is an empty index. +// +// TBD: For speed, shift bitposmaskL instead of using JU_BITMAPTESTL or +// JU_BITPOSMASKL, but this shift has knowledge of bit order that really should +// be encapsulated in a header file. + +#define LEAFB1_CHECKBIT(OpLeastDigits) \ + if (! (JU_JLB_BITMAP(Pjlb, subexp) & bitposmaskL)) \ + SET_AND_RETURN(OpLeastDigits, digit, 1) + +#define LEAFB1_STARTSUBEXP(OpLeastDigits) \ + if (! JU_JLB_BITMAP(Pjlb, subexp)) /* empty subexp */ \ + SET_AND_RETURN(OpLeastDigits, digit, 1) + +#ifdef JUDYPREV + + --digit; // skip initial digit. + bitposmaskL >>= 1; // see TBD above. + +LeafB1NextSubexp: // return here to check next bitmap subexpanse. + + while (bitposmaskL) // more bits to check in subexp. 
+ { + LEAFB1_CHECKBIT(SETLEASTDIGITS_D); + assert(digit >= 0); + --digit; + bitposmaskL >>= 1; + } + + if (subexp-- > 0) // more subexpanses. + { + LEAFB1_STARTSUBEXP(SETLEASTDIGITS_D); + bitposmaskL = (1UL << (cJU_BITSPERSUBEXPL - 1)); + goto LeafB1NextSubexp; + } + +#else // JUDYNEXT + + ++digit; // skip initial digit. + bitposmaskL <<= 1; // note: BITMAPL_t. + +LeafB1NextSubexp: // return here to check next bitmap subexpanse. + + while (bitposmaskL) // more bits to check in subexp. + { + LEAFB1_CHECKBIT(CLEARLEASTDIGITS_D); + assert(digit < cJU_SUBEXPPERSTATE); + ++digit; + bitposmaskL <<= 1; // note: BITMAPL_t. + } + + if (++subexp < cJU_NUMSUBEXPL) // more subexpanses. + { + LEAFB1_STARTSUBEXP(CLEARLEASTDIGITS_D); + bitposmaskL = 1; + goto LeafB1NextSubexp; + } + +#endif // JUDYNEXT + +// LeafB1 secondary dead end, no empty index: + + SMRESTART(1); + + +#ifdef JUDY1 +// ---------------------------------------------------------------------------- +// FULL POPULATION: +// +// If the Decode bytes do not match, Index is empty (without modification); +// otherwise restart. + + case cJ1_JPFULLPOPU1: + + CHECKDCD(1); + SMRESTART(1); +#endif + + +// ---------------------------------------------------------------------------- +// IMMEDIATE: +// +// Pop1 = 1 Immediate JPs: +// +// If Index is not in the immediate JP, return success; otherwise check if +// there is an empty index below/above the immediate JPs index, and if so, +// return success with modified Index, else restart. +// +// Note: Doug says its fast enough to calculate the index size (digits) in +// the following; no need to set it separately for each case. + + case cJU_JPIMMED_1_01: + case cJU_JPIMMED_2_01: + case cJU_JPIMMED_3_01: +#ifdef JU_64BIT + case cJU_JPIMMED_4_01: + case cJU_JPIMMED_5_01: + case cJU_JPIMMED_6_01: + case cJU_JPIMMED_7_01: +#endif + if (JU_JPDCDPOP0(Pjp) != JU_TRIMTODCDSIZE(Index)) RET_SUCCESS; + digits = JU_JPTYPE(Pjp) - cJU_JPIMMED_1_01 + 1; + LEAF_EDGE(JU_LEASTBYTES(JU_JPDCDPOP0(Pjp), digits), digits); + +// Immediate JPs with Pop1 > 1: + +#define IMM_MULTI(Func,BaseJPType) \ + JUDY1CODE(Pword = (PWord_t) (Pjp->jp_1Index);) \ + JUDYLCODE(Pword = (PWord_t) (Pjp->jp_LIndex);) \ + Func(Pword, JU_JPTYPE(Pjp) - (BaseJPType) + 1) + + case cJU_JPIMMED_1_02: + case cJU_JPIMMED_1_03: +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_1_04: + case cJU_JPIMMED_1_05: + case cJU_JPIMMED_1_06: + case cJU_JPIMMED_1_07: +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_1_08: + case cJ1_JPIMMED_1_09: + case cJ1_JPIMMED_1_10: + case cJ1_JPIMMED_1_11: + case cJ1_JPIMMED_1_12: + case cJ1_JPIMMED_1_13: + case cJ1_JPIMMED_1_14: + case cJ1_JPIMMED_1_15: +#endif + IMM_MULTI(j__udySearchLeafEmpty1, cJU_JPIMMED_1_02); + +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_2_02: + case cJU_JPIMMED_2_03: +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_2_04: + case cJ1_JPIMMED_2_05: + case cJ1_JPIMMED_2_06: + case cJ1_JPIMMED_2_07: +#endif +#if (defined(JUDY1) || defined(JU_64BIT)) + IMM_MULTI(j__udySearchLeafEmpty2, cJU_JPIMMED_2_02); +#endif + +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_3_02: +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_3_03: + case cJ1_JPIMMED_3_04: + case cJ1_JPIMMED_3_05: +#endif +#if (defined(JUDY1) || defined(JU_64BIT)) + IMM_MULTI(j__udySearchLeafEmpty3, cJU_JPIMMED_3_02); +#endif + +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_4_02: + case cJ1_JPIMMED_4_03: + 
IMM_MULTI(j__udySearchLeafEmpty4, cJ1_JPIMMED_4_02); + + case cJ1_JPIMMED_5_02: + case cJ1_JPIMMED_5_03: + IMM_MULTI(j__udySearchLeafEmpty5, cJ1_JPIMMED_5_02); + + case cJ1_JPIMMED_6_02: + IMM_MULTI(j__udySearchLeafEmpty6, cJ1_JPIMMED_6_02); + + case cJ1_JPIMMED_7_02: + IMM_MULTI(j__udySearchLeafEmpty7, cJ1_JPIMMED_7_02); +#endif + + +// ---------------------------------------------------------------------------- +// INVALID JP TYPE: + + default: RET_CORRUPT; + + } // SMGet switch. + +} // Judy1PrevEmpty() / Judy1NextEmpty() / JudyLPrevEmpty() / JudyLNextEmpty() diff --git a/src/libnetdata/libjudy/src/JudyL/JudyLPrev.c b/src/libnetdata/libjudy/src/JudyL/JudyLPrev.c new file mode 100644 index 00000000..4bcdccf1 --- /dev/null +++ b/src/libnetdata/libjudy/src/JudyL/JudyLPrev.c @@ -0,0 +1,1890 @@ +// Copyright (C) 2000 - 2002 Hewlett-Packard Company +// +// This program is free software; you can redistribute it and/or modify it +// under the term of the GNU Lesser General Public License as published by the +// Free Software Foundation; either version 2 of the License, or (at your +// option) any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License +// for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this program; if not, write to the Free Software Foundation, +// Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// _________________ + +// @(#) $Revision: 4.54 $ $Source: /judy/src/JudyCommon/JudyPrevNext.c $ +// +// Judy*Prev() and Judy*Next() functions for Judy1 and JudyL. +// Compile with one of -DJUDY1 or -DJUDYL. +// +// Compile with -DJUDYNEXT for the Judy*Next() function; otherwise defaults to +// Judy*Prev(). + +#if (! (defined(JUDY1) || defined(JUDYL))) +#error: One of -DJUDY1 or -DJUDYL must be specified. +#endif + +#ifndef JUDYNEXT +#ifndef JUDYPREV +#define JUDYPREV 1 // neither set => use default. +#endif +#endif + +#ifdef JUDY1 +#include "Judy1.h" +#else +#include "JudyL.h" +#endif + +#include "JudyPrivate1L.h" + + +// **************************************************************************** +// J U D Y 1 P R E V +// J U D Y 1 N E X T +// J U D Y L P R E V +// J U D Y L N E X T +// +// See the manual entry for the API. +// +// OVERVIEW OF Judy*Prev(): +// +// Use a reentrant switch statement (state machine, SM1 = "get") to decode the +// callers *PIndex-1, starting with the (PArray), through branches, if +// any, down to an immediate or a leaf. Look for *PIndex-1 in that leaf, and +// if found, return it. +// +// A dead end is either a branch that does not contain a JP for the appropriate +// digit in *PIndex-1, or a leaf that does not contain the undecoded digits of +// *PIndex-1. Upon reaching a dead end, backtrack through the leaf/branches +// that were just traversed, using a list (history) of parent JPs that is built +// while going forward in SM1Get. Start with the current leaf or branch. In a +// backtracked leaf, look for an Index less than *PIndex-1. In each +// backtracked branch, look "sideways" for the next JP, if any, lower than the +// one for the digit (from *PIndex-1) that was previously decoded. While +// backtracking, if a leaf has no previous Index or a branch has no lower JP, +// go to its parent branch in turn. Upon reaching the JRP, return failure, "no +// previous Index". 
The backtrack process is sufficiently different from
+// SM1Get to merit its own separate reentrant switch statement (SM2 =
+// "backtrack").
+//
+// While backtracking, upon finding a lower JP in a branch, there is certain to
+// be a "prev" Index under that JP (unless the Judy array is corrupt).
+// Traverse forward again, this time taking the last (highest, right-most) JP
+// in each branch, and the last (highest) Index upon reaching an immediate or a
+// leaf. This traversal is sufficiently different from SM1Get and SM2Backtrack
+// to merit its own separate reentrant switch statement (SM3 = "findlimit").
+//
+// "Decode" bytes in JPs complicate this process a little. In SM1Get, when a
+// JP is a narrow pointer, that is, when states are skipped (so the skipped
+// digits are stored in jp_DcdPopO), compare the relevant digits to the same
+// digits in *PIndex-1. If they are EQUAL, proceed in SM1Get as before. If
+// jp_DcdPopO's digits are GREATER, treat the JP as a dead end and proceed in
+// SM2Backtrack. If jp_DcdPopO's digits are LESS, treat the JP as if it had
+// just been found during a backtrack and proceed directly in SM3Findlimit.
+//
+// Note that Decode bytes can be ignored in SM3Findlimit; they don't matter.
+// Also note that in practice the Decode bytes are routinely compared with
+// *PIndex-1 because that's simpler and no slower than first testing for
+// narrowness.
+//
+// Decode bytes also make it unnecessary to construct the Index to return (the
+// revised *PIndex) during the search. This step is deferred until finding an
+// Index during backtrack or findlimit, before returning it. The first digit
+// of *PIndex is derived (saved) based on which JP is used in a JRP branch.
+// The remaining digits are obtained from the jp_DcdPopO field in the JP (if
+// any) above the immediate or leaf containing the found (prev) Index, plus the
+// remaining digit(s) in the immediate or leaf itself. In the case of a LEAFW,
+// the Index to return is found directly in the leaf.
+//
+// Note: Theoretically, as described above, upon reaching a dead end, SM1Get
+// passes control to SM2Backtrack to look sideways, even in a leaf. Actually
+// it's a little more efficient for the SM1Get leaf cases to shortcut this and
+// take care of the sideways searches themselves. Hence the history list only
+// contains branch JPs, and SM2Backtrack only handles branches. In fact, even
+// the branch handling cases in SM1Get do some shortcutting (sideways
+// searching) to avoid pushing history and calling SM2Backtrack unnecessarily.
+//
+// Upon reaching an Index to return after backtracking, *PIndex must be
+// modified to the found Index. In principle this could be done by building
+// the Index from a saved rootdigit (in the top branch) plus the Dcd bytes from
+// the parent JP plus the appropriate Index bytes from the leaf. However,
+// Immediates are difficult because their parent JPs lack one (last) digit. So
+// instead just build the *PIndex to return "top down" while backtracking and
+// findlimiting.
+//
+// This function is written iteratively for speed, rather than recursively.
+//
+// CAVEATS:
+//
+// Why use a backtrack list (history stack), since it has finite size? The
+// size is small for Judy on both 32-bit and 64-bit systems, and a list (really
+// just an array) is fast to maintain and use. Other alternatives include
+// doing a lookahead (lookaside) in each branch while traversing forward
+// (decoding), and restarting from the top upon a dead end.
+
+// A lookahead means noting the last branch traversed which contained a
+// non-null JP lower than the one specified by a digit in *PIndex-1, and
+// returning to that point for SM3Findlimit. This seems like a good idea, and
+// should be pretty cheap for linear and bitmap branches, but it could result
+// in up to 31 unnecessary additional cache line fills (in extreme cases) for
+// every uncompressed branch traversed. We have considered means of attaching
+// to or hiding within an uncompressed branch (in null JPs) a "cache line map"
+// or other structure, such as an offset to the next non-null JP, that would
+// speed this up, but it seems unnecessary merely to avoid having a
+// finite-length list (array). (If JudySL is ever made "native", the finite
+// list length will be an issue.)
+//
+// Restarting at the top of the Judy array after a dead end requires a careful
+// modification of *PIndex-1 to decrement the digit for the parent branch and
+// set the remaining lower digits to all 1s. This must be repeated each time a
+// parent branch contains another dead end, so even though it should all happen
+// in cache, the CPU time can be excessive. (For JudySL or an equivalent
+// "infinitely deep" Judy array, consider a hybrid of a large, finite,
+// "circular" list and a restart-at-top when the list is backtracked to
+// exhaustion.)
+//
+// Why search for *PIndex-1 instead of *PIndex during SM1Get? In rare
+// instances this prevents an unnecessary decode down the wrong path followed
+// by a backtrack; it's pretty cheap to set up initially; and it means the
+// SM1Get machine can simply return if/when it finds that Index.
+//
+// TBD: We'd like to enhance this function to make successive searches faster.
+// This would require saving some previous state, including the previous Index
+// returned, and in which leaf it was found. If the next call is for the same
+// Index and the array has not been modified, start at the same leaf. This
+// should be much easier to implement since this is iterative rather than
+// recursive code.
+//
+// VARIATIONS FOR Judy*Next():
+//
+// The Judy*Next() code is nearly a perfect mirror of the Judy*Prev() code.
+// See the Judy*Prev() overview comments, and mentally switch the following:
+//
+// - "*PIndex-1" => "*PIndex+1"
+// - "less than" => "greater than"
+// - "lower" => "higher"
+// - "lowest" => "highest"
+// - "next-left" => "next-right"
+// - "right-most" => "left-most"
+//
+// Note: SM3Findlimit could be called SM3Findmax/SM3Findmin, but a common name
+// for both Prev and Next means many fewer ifdefs in this code.
+//
+// TBD: Currently this code traverses a JP whether its expanse is partially or
+// completely full (populated). For Judy1 (only), since there is no value area
+// needed, consider shortcutting to a "success" return upon encountering a full
+// JP in SM1Get (or even SM3Findlimit?) A full JP looks like this:
+//
+// (((JU_JPDCDPOP0(Pjp) ^ cJU_ALLONES) & cJU_POP0MASK(cLevel)) == 0)
+
+#ifdef JUDY1
+#ifdef JUDYPREV
+FUNCTION int Judy1Prev
+#else
+FUNCTION int Judy1Next
+#endif
+#else
+#ifdef JUDYPREV
+FUNCTION PPvoid_t JudyLPrev
+#else
+FUNCTION PPvoid_t JudyLNext
+#endif
+#endif
+ (
+ Pcvoid_t PArray, // Judy array to search.
+ Word_t * PIndex, // starting point and result.
+ PJError_t PJError // optional, for returning error info.
+ )
+{
+ Pjp_t Pjp, Pjp2; // current JPs.
+
+ Pjbl_t Pjbl; // Pjp->jp_Addr masked and cast to types:
+ Pjbb_t Pjbb;
+ Pjbu_t Pjbu;
+
+// Note: The following initialization is not strictly required but it makes
+// gcc -Wall happy because there is an "impossible" path from Immed handling to
+// SM1LeafLImm code that looks like Pjll might be used before set:
+
+ Pjll_t Pjll = (Pjll_t) NULL;
+ Word_t state; // current state in SM.
+ Word_t digit; // next digit to decode from Index.
+
+// Note: The following initialization is not strictly required but it makes
+// gcc -Wall happy because there is an "impossible" path from Immed handling to
+// SM1LeafLImm code (for JudyL & JudyPrev only) that looks like pop1 might be
+// used before set:
+
+#if (defined(JUDYL) && defined(JUDYPREV))
+ Word_t pop1 = 0; // in a leaf.
+#else
+ Word_t pop1; // in a leaf.
+#endif
+ int offset; // linear branch/leaf, from j__udySearchLeaf*().
+ int subexp; // subexpanse in a bitmap branch.
+ Word_t bitposmask; // bit in bitmap for Index.
+
+// History for SM2Backtrack:
+//
+// For a given histnum, APjphist[histnum] is a parent JP that points to a
+// branch, and Aoffhist[histnum] is the offset of the NEXT JP in the branch to
+// which the parent JP points. The meaning of Aoffhist[histnum] depends on the
+// type of branch to which the parent JP points:
+//
+// Linear: Offset of the next JP in the JP list.
+//
+// Bitmap: Which subexpanse, plus the offset of the next JP in the
+// subexpanse's JP list (to avoid bit-counting again), plus, for Judy*Next(),
+// the digit itself (hidden one byte to the left), because Judy*Next() also
+// needs it.
+//
+// Uncompressed: Digit, which is actually the offset of the JP in the branch.
+//
+// Note: Only branch JPs are stored in APjphist[] because, as explained
+// earlier, SM1Get shortcuts sideways searches in leaves (and even in branches
+// in some cases), so SM2Backtrack only handles branches.
+
+#define HISTNUMMAX cJU_ROOTSTATE // maximum branches traversable.
+ Pjp_t APjphist[HISTNUMMAX]; // list of branch JPs traversed.
+ int Aoffhist[HISTNUMMAX]; // list of next JP offsets; see above.
+ int histnum = 0; // number of JPs now in list.
+
+
+// ----------------------------------------------------------------------------
+// M A C R O S
+//
+// These are intended to make the code a bit more readable and less redundant.
+
+
+// "PUSH" AND "POP" Pjp AND offset ON HISTORY STACKS:
+//
+// Note: Ensure a corrupt Judy array does not overflow *hist[]. Meanwhile,
+// underflowing *hist[] simply means there's no more room to backtrack =>
+// "no previous/next Index".
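+//
+// Minimal usage sketch (comments only; the real flow is in SM1Get and
+// SM2Backtrack below):
+//
+// ==  HISTPUSH(Pjp, offset);  // SM1Get: record the branch JP and the offset
+// ==                          // of the JP taken there, then descend.
+// ==  HISTPOP(Pjp, offset);   // SM2Backtrack: recover the most recent pair;
+// ==                          // underflow => JU_RET_NOTFOUND.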
+ +#define HISTPUSH(Pjp,Offset) \ + APjphist[histnum] = (Pjp); \ + Aoffhist[histnum] = (Offset); \ + \ + if (++histnum >= HISTNUMMAX) \ + { \ + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT) \ + JUDY1CODE(return(JERRI );) \ + JUDYLCODE(return(PPJERR);) \ + } + +#define HISTPOP(Pjp,Offset) \ + if ((histnum--) < 1) JU_RET_NOTFOUND; \ + (Pjp) = APjphist[histnum]; \ + (Offset) = Aoffhist[histnum] + +// How to pack/unpack Aoffhist[] values for bitmap branches: + +#ifdef JUDYPREV + +#define HISTPUSHBOFF(Subexp,Offset,Digit) \ + (((Subexp) * cJU_BITSPERSUBEXPB) | (Offset)) + +#define HISTPOPBOFF(Subexp,Offset,Digit) \ + (Subexp) = (Offset) / cJU_BITSPERSUBEXPB; \ + (Offset) %= cJU_BITSPERSUBEXPB +#else + +#define HISTPUSHBOFF(Subexp,Offset,Digit) \ + (((Digit) << cJU_BITSPERBYTE) \ + | ((Subexp) * cJU_BITSPERSUBEXPB) | (Offset)) + +#define HISTPOPBOFF(Subexp,Offset,Digit) \ + (Digit) = (Offset) >> cJU_BITSPERBYTE; \ + (Subexp) = ((Offset) & JU_LEASTBYTESMASK(1)) / cJU_BITSPERSUBEXPB; \ + (Offset) %= cJU_BITSPERSUBEXPB +#endif + + +// CHECK FOR NULL JP: + +#define JPNULL(Type) (((Type) >= cJU_JPNULL1) && ((Type) <= cJU_JPNULLMAX)) + + +// SEARCH A BITMAP: +// +// This is a weak analog of j__udySearchLeaf*() for bitmaps. Return the actual +// or next-left position, base 0, of Digit in the single uint32_t bitmap, also +// given a Bitposmask for Digit. +// +// Unlike j__udySearchLeaf*(), the offset is not returned bit-complemented if +// Digits bit is unset, because the caller can check the bitmap themselves to +// determine that. Also, if Digits bit is unset, the returned offset is to +// the next-left JP (including -1), not to the "ideal" position for the Index = +// next-right JP. +// +// Shortcut and skip calling j__udyCountBits*() if the bitmap is full, in which +// case (Digit % cJU_BITSPERSUBEXP*) itself is the base-0 offset. +// +// TBD for Judy*Next(): Should this return next-right instead of next-left? +// That is, +1 from current value? Maybe not, if Digits bit IS set, +1 would +// be wrong. + +#define SEARCHBITMAPB(Bitmap,Digit,Bitposmask) \ + (((Bitmap) == cJU_FULLBITMAPB) ? (Digit % cJU_BITSPERSUBEXPB) : \ + j__udyCountBitsB((Bitmap) & JU_MASKLOWERINC(Bitposmask)) - 1) + +#define SEARCHBITMAPL(Bitmap,Digit,Bitposmask) \ + (((Bitmap) == cJU_FULLBITMAPL) ? (Digit % cJU_BITSPERSUBEXPL) : \ + j__udyCountBitsL((Bitmap) & JU_MASKLOWERINC(Bitposmask)) - 1) + +#ifdef JUDYPREV +// Equivalent to search for the highest offset in Bitmap: + +#define SEARCHBITMAPMAXB(Bitmap) \ + (((Bitmap) == cJU_FULLBITMAPB) ? cJU_BITSPERSUBEXPB - 1 : \ + j__udyCountBitsB(Bitmap) - 1) + +#define SEARCHBITMAPMAXL(Bitmap) \ + (((Bitmap) == cJU_FULLBITMAPL) ? cJU_BITSPERSUBEXPL - 1 : \ + j__udyCountBitsL(Bitmap) - 1) +#endif + + +// CHECK DECODE BYTES: +// +// Check Decode bytes in a JP against the equivalent portion of *PIndex. If +// *PIndex is lower (for Judy*Prev()) or higher (for Judy*Next()), this JP is a +// dead end (the same as if it had been absent in a linear or bitmap branch or +// null in an uncompressed branch), enter SM2Backtrack; otherwise enter +// SM3Findlimit to find the highest/lowest Index under this JP, as if the code +// had already backtracked to this JP. 
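+//
+// Spelled out for Judy*Prev(): on a Dcd mismatch, if *PIndex's skipped-digit
+// bytes are LESS than the JP's, the JP's whole expanse lies above *PIndex,
+// so the JP is a dead end (SM2Backtrack); if GREATER, the whole expanse lies
+// below *PIndex, so its highest Index qualifies (SM3Findlimit). Judy*Next()
+// simply flips the comparison via CDcmp__: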
+ +#ifdef JUDYPREV +#define CDcmp__ < +#else +#define CDcmp__ > +#endif + +#define CHECKDCD(cState) \ + if (JU_DCDNOTMATCHINDEX(*PIndex, Pjp, cState)) \ + { \ + if ((*PIndex & cJU_DCDMASK(cState)) \ + CDcmp__(JU_JPDCDPOP0(Pjp) & cJU_DCDMASK(cState))) \ + { \ + goto SM2Backtrack; \ + } \ + goto SM3Findlimit; \ + } + + +// PREPARE TO HANDLE A LEAFW OR JRP BRANCH IN SM1: +// +// Extract a state-dependent digit from Index in a "constant" way, then jump to +// common code for multiple cases. + +#define SM1PREPB(cState,Next) \ + state = (cState); \ + digit = JU_DIGITATSTATE(*PIndex, cState); \ + goto Next + + +// PREPARE TO HANDLE A LEAFW OR JRP BRANCH IN SM3: +// +// Optionally save Dcd bytes into *PIndex, then save state and jump to common +// code for multiple cases. + +#define SM3PREPB_DCD(cState,Next) \ + JU_SETDCD(*PIndex, Pjp, cState); \ + SM3PREPB(cState,Next) + +#define SM3PREPB(cState,Next) state = (cState); goto Next + + +// ---------------------------------------------------------------------------- +// CHECK FOR SHORTCUTS: +// +// Error out if PIndex is null. Execute JU_RET_NOTFOUND if the Judy array is +// empty or *PIndex is already the minimum/maximum Index possible. +// +// Note: As documented, in case of failure *PIndex may be modified. + + if (PIndex == (PWord_t) NULL) + { + JU_SET_ERRNO(PJError, JU_ERRNO_NULLPINDEX); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + +#ifdef JUDYPREV + if ((PArray == (Pvoid_t) NULL) || ((*PIndex)-- == 0)) +#else + if ((PArray == (Pvoid_t) NULL) || ((*PIndex)++ == cJU_ALLONES)) +#endif + JU_RET_NOTFOUND; + + +// HANDLE JRP: +// +// Before even entering SM1Get, check the JRP type. For JRP branches, traverse +// the JPM; handle LEAFW leaves directly; but look for the most common cases +// first. + +// ROOT-STATE LEAF that starts with a Pop0 word; just look within the leaf: +// +// If *PIndex is in the leaf, return it; otherwise return the Index, if any, +// below where it would belong. + + if (JU_LEAFW_POP0(PArray) < cJU_LEAFW_MAXPOP1) // must be a LEAFW + { + Pjlw_t Pjlw = P_JLW(PArray); // first word of leaf. + pop1 = Pjlw[0] + 1; + + if ((offset = j__udySearchLeafW(Pjlw + 1, pop1, *PIndex)) + >= 0) // Index is present. + { + assert(offset < pop1); // in expected range. + JU_RET_FOUND_LEAFW(Pjlw, pop1, offset); // *PIndex is set. + } + +#ifdef JUDYPREV + if ((offset = ~offset) == 0) // no next-left Index. +#else + if ((offset = ~offset) >= pop1) // no next-right Index. +#endif + JU_RET_NOTFOUND; + + assert(offset <= pop1); // valid result. + +#ifdef JUDYPREV + *PIndex = Pjlw[offset--]; // next-left Index, base 1. +#else + *PIndex = Pjlw[offset + 1]; // next-right Index, base 1. +#endif + JU_RET_FOUND_LEAFW(Pjlw, pop1, offset); // base 0. + + } + else // JRP BRANCH + { + Pjpm_t Pjpm = P_JPM(PArray); + Pjp = &(Pjpm->jpm_JP); + +// goto SM1Get; + } + +// ============================================================================ +// STATE MACHINE 1 -- GET INDEX: +// +// Search for *PIndex (already decremented/incremented so as to be inclusive). +// If found, return it. Otherwise in theory hand off to SM2Backtrack or +// SM3Findlimit, but in practice "shortcut" by first sideways searching the +// current branch or leaf upon hitting a dead end. During sideways search, +// modify *PIndex to a new path taken. +// +// ENTRY: Pjp points to next JP to interpret, whose Decode bytes have not yet +// been checked. This JP is not yet listed in history. 
+// +// Note: Check Decode bytes at the start of each loop, not after looking up a +// new JP, so its easy to do constant shifts/masks, although this requires +// cautious handling of Pjp, offset, and *hist[] for correct entry to +// SM2Backtrack. +// +// EXIT: Return, or branch to SM2Backtrack or SM3Findlimit with correct +// interface, as described elsewhere. +// +// WARNING: For run-time efficiency the following cases replicate code with +// varying constants, rather than using common code with variable values! + +SM1Get: // return here for next branch/leaf. + + switch (JU_JPTYPE(Pjp)) + { + + +// ---------------------------------------------------------------------------- +// LINEAR BRANCH: +// +// Check Decode bytes, if any, in the current JP, then search for a JP for the +// next digit in *PIndex. + + case cJU_JPBRANCH_L2: CHECKDCD(2); SM1PREPB(2, SM1BranchL); + case cJU_JPBRANCH_L3: CHECKDCD(3); SM1PREPB(3, SM1BranchL); +#ifdef JU_64BIT + case cJU_JPBRANCH_L4: CHECKDCD(4); SM1PREPB(4, SM1BranchL); + case cJU_JPBRANCH_L5: CHECKDCD(5); SM1PREPB(5, SM1BranchL); + case cJU_JPBRANCH_L6: CHECKDCD(6); SM1PREPB(6, SM1BranchL); + case cJU_JPBRANCH_L7: CHECKDCD(7); SM1PREPB(7, SM1BranchL); +#endif + case cJU_JPBRANCH_L: SM1PREPB(cJU_ROOTSTATE, SM1BranchL); + +// Common code (state-independent) for all cases of linear branches: + +SM1BranchL: + Pjbl = P_JBL(Pjp->jp_Addr); + +// Found JP matching current digit in *PIndex; record parent JP and the next +// JPs offset, and iterate to the next JP: + + if ((offset = j__udySearchLeaf1((Pjll_t) (Pjbl->jbl_Expanse), + Pjbl->jbl_NumJPs, digit)) >= 0) + { + HISTPUSH(Pjp, offset); + Pjp = (Pjbl->jbl_jp) + offset; + goto SM1Get; + } + +// Dead end, no JP in BranchL for next digit in *PIndex: +// +// Get the ideal location of digits JP, and if theres no next-left/right JP +// in the BranchL, shortcut and start backtracking one level up; ignore the +// current Pjp because it points to a BranchL with no next-left/right JP. + +#ifdef JUDYPREV + if ((offset = (~offset) - 1) < 0) // no next-left JP in BranchL. +#else + if ((offset = (~offset)) >= Pjbl->jbl_NumJPs) // no next-right. +#endif + goto SM2Backtrack; + +// Theres a next-left/right JP in the current BranchL; save its digit in +// *PIndex and shortcut to SM3Findlimit: + + JU_SETDIGIT(*PIndex, Pjbl->jbl_Expanse[offset], state); + Pjp = (Pjbl->jbl_jp) + offset; + goto SM3Findlimit; + + +// ---------------------------------------------------------------------------- +// BITMAP BRANCH: +// +// Check Decode bytes, if any, in the current JP, then look for a JP for the +// next digit in *PIndex. + + case cJU_JPBRANCH_B2: CHECKDCD(2); SM1PREPB(2, SM1BranchB); + case cJU_JPBRANCH_B3: CHECKDCD(3); SM1PREPB(3, SM1BranchB); +#ifdef JU_64BIT + case cJU_JPBRANCH_B4: CHECKDCD(4); SM1PREPB(4, SM1BranchB); + case cJU_JPBRANCH_B5: CHECKDCD(5); SM1PREPB(5, SM1BranchB); + case cJU_JPBRANCH_B6: CHECKDCD(6); SM1PREPB(6, SM1BranchB); + case cJU_JPBRANCH_B7: CHECKDCD(7); SM1PREPB(7, SM1BranchB); +#endif + case cJU_JPBRANCH_B: SM1PREPB(cJU_ROOTSTATE, SM1BranchB); + +// Common code (state-independent) for all cases of bitmap branches: + +SM1BranchB: + Pjbb = P_JBB(Pjp->jp_Addr); + +// Locate the digits JP in the subexpanse list, if present, otherwise the +// offset of the next-left JP, if any: + + subexp = digit / cJU_BITSPERSUBEXPB; + assert(subexp < cJU_NUMSUBEXPB); // falls in expected range. 
+ bitposmask = JU_BITPOSMASKB(digit); + offset = SEARCHBITMAPB(JU_JBB_BITMAP(Pjbb, subexp), digit, + bitposmask); + // right range: + assert((offset >= -1) && (offset < (int) cJU_BITSPERSUBEXPB)); + +// Found JP matching current digit in *PIndex: +// +// Record the parent JP and the next JPs offset; and iterate to the next JP. + +// if (JU_BITMAPTESTB(Pjbb, digit)) // slower. + if (JU_JBB_BITMAP(Pjbb, subexp) & bitposmask) // faster. + { + // not negative since at least one bit is set: + assert(offset >= 0); + + HISTPUSH(Pjp, HISTPUSHBOFF(subexp, offset, digit)); + + if ((Pjp = P_JP(JU_JBB_PJP(Pjbb, subexp))) == (Pjp_t) NULL) + { + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + + Pjp += offset; + goto SM1Get; // iterate to next JP. + } + +// Dead end, no JP in BranchB for next digit in *PIndex: +// +// If theres a next-left/right JP in the current BranchB, shortcut to +// SM3Findlimit. Note: offset is already set to the correct value for the +// next-left/right JP. + +#ifdef JUDYPREV + if (offset >= 0) // next-left JP is in this subexpanse. + goto SM1BranchBFindlimit; + + while (--subexp >= 0) // search next-left subexpanses. +#else + if (JU_JBB_BITMAP(Pjbb, subexp) & JU_MASKHIGHEREXC(bitposmask)) + { + ++offset; // next-left => next-right. + goto SM1BranchBFindlimit; + } + + while (++subexp < cJU_NUMSUBEXPB) // search next-right subexps. +#endif + { + if (! JU_JBB_PJP(Pjbb, subexp)) continue; // empty subexpanse. + +#ifdef JUDYPREV + offset = SEARCHBITMAPMAXB(JU_JBB_BITMAP(Pjbb, subexp)); + // expected range: + assert((offset >= 0) && (offset < cJU_BITSPERSUBEXPB)); +#else + offset = 0; +#endif + +// Save the next-left/right JPs digit in *PIndex: + +SM1BranchBFindlimit: + JU_BITMAPDIGITB(digit, subexp, JU_JBB_BITMAP(Pjbb, subexp), + offset); + JU_SETDIGIT(*PIndex, digit, state); + + if ((Pjp = P_JP(JU_JBB_PJP(Pjbb, subexp))) == (Pjp_t) NULL) + { + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + + Pjp += offset; + goto SM3Findlimit; + } + +// Theres no next-left/right JP in the BranchB: +// +// Shortcut and start backtracking one level up; ignore the current Pjp because +// it points to a BranchB with no next-left/right JP. + + goto SM2Backtrack; + + +// ---------------------------------------------------------------------------- +// UNCOMPRESSED BRANCH: +// +// Check Decode bytes, if any, in the current JP, then look for a JP for the +// next digit in *PIndex. + + case cJU_JPBRANCH_U2: CHECKDCD(2); SM1PREPB(2, SM1BranchU); + case cJU_JPBRANCH_U3: CHECKDCD(3); SM1PREPB(3, SM1BranchU); +#ifdef JU_64BIT + case cJU_JPBRANCH_U4: CHECKDCD(4); SM1PREPB(4, SM1BranchU); + case cJU_JPBRANCH_U5: CHECKDCD(5); SM1PREPB(5, SM1BranchU); + case cJU_JPBRANCH_U6: CHECKDCD(6); SM1PREPB(6, SM1BranchU); + case cJU_JPBRANCH_U7: CHECKDCD(7); SM1PREPB(7, SM1BranchU); +#endif + case cJU_JPBRANCH_U: SM1PREPB(cJU_ROOTSTATE, SM1BranchU); + +// Common code (state-independent) for all cases of uncompressed branches: + +SM1BranchU: + Pjbu = P_JBU(Pjp->jp_Addr); + Pjp2 = (Pjbu->jbu_jp) + digit; + +// Found JP matching current digit in *PIndex: +// +// Record the parent JP and the next JPs digit, and iterate to the next JP. +// +// TBD: Instead of this, just goto SM1Get, and add cJU_JPNULL* cases to the +// SM1Get state machine? Then backtrack? However, it means you cant detect +// an inappropriate cJU_JPNULL*, when it occurs in other than a BranchU, and +// return JU_RET_CORRUPT. + + if (! 
JPNULL(JU_JPTYPE(Pjp2))) // digit has a JP. + { + HISTPUSH(Pjp, digit); + Pjp = Pjp2; + goto SM1Get; + } + +// Dead end, no JP in BranchU for next digit in *PIndex: +// +// Search for a next-left/right JP in the current BranchU, and if one is found, +// save its digit in *PIndex and shortcut to SM3Findlimit: + +#ifdef JUDYPREV + while (digit >= 1) + { + Pjp = (Pjbu->jbu_jp) + (--digit); +#else + while (digit < cJU_BRANCHUNUMJPS - 1) + { + Pjp = (Pjbu->jbu_jp) + (++digit); +#endif + if (JPNULL(JU_JPTYPE(Pjp))) continue; + + JU_SETDIGIT(*PIndex, digit, state); + goto SM3Findlimit; + } + +// Theres no next-left/right JP in the BranchU: +// +// Shortcut and start backtracking one level up; ignore the current Pjp because +// it points to a BranchU with no next-left/right JP. + + goto SM2Backtrack; + + +// ---------------------------------------------------------------------------- +// LINEAR LEAF: +// +// Check Decode bytes, if any, in the current JP, then search the leaf for +// *PIndex. + +#define SM1LEAFL(Func) \ + Pjll = P_JLL(Pjp->jp_Addr); \ + pop1 = JU_JPLEAF_POP0(Pjp) + 1; \ + offset = Func(Pjll, pop1, *PIndex); \ + goto SM1LeafLImm + +#if (defined(JUDYL) || (! defined(JU_64BIT))) + case cJU_JPLEAF1: CHECKDCD(1); SM1LEAFL(j__udySearchLeaf1); +#endif + case cJU_JPLEAF2: CHECKDCD(2); SM1LEAFL(j__udySearchLeaf2); + case cJU_JPLEAF3: CHECKDCD(3); SM1LEAFL(j__udySearchLeaf3); + +#ifdef JU_64BIT + case cJU_JPLEAF4: CHECKDCD(4); SM1LEAFL(j__udySearchLeaf4); + case cJU_JPLEAF5: CHECKDCD(5); SM1LEAFL(j__udySearchLeaf5); + case cJU_JPLEAF6: CHECKDCD(6); SM1LEAFL(j__udySearchLeaf6); + case cJU_JPLEAF7: CHECKDCD(7); SM1LEAFL(j__udySearchLeaf7); +#endif + +// Common code (state-independent) for all cases of linear leaves and +// immediates: + +SM1LeafLImm: + if (offset >= 0) // *PIndex is in LeafL / Immed. +#ifdef JUDY1 + JU_RET_FOUND; +#else + { // JudyL is trickier... + switch (JU_JPTYPE(Pjp)) + { +#if (defined(JUDYL) || (! defined(JU_64BIT))) + case cJU_JPLEAF1: JU_RET_FOUND_LEAF1(Pjll, pop1, offset); +#endif + case cJU_JPLEAF2: JU_RET_FOUND_LEAF2(Pjll, pop1, offset); + case cJU_JPLEAF3: JU_RET_FOUND_LEAF3(Pjll, pop1, offset); +#ifdef JU_64BIT + case cJU_JPLEAF4: JU_RET_FOUND_LEAF4(Pjll, pop1, offset); + case cJU_JPLEAF5: JU_RET_FOUND_LEAF5(Pjll, pop1, offset); + case cJU_JPLEAF6: JU_RET_FOUND_LEAF6(Pjll, pop1, offset); + case cJU_JPLEAF7: JU_RET_FOUND_LEAF7(Pjll, pop1, offset); +#endif + + case cJU_JPIMMED_1_01: + case cJU_JPIMMED_2_01: + case cJU_JPIMMED_3_01: +#ifdef JU_64BIT + case cJU_JPIMMED_4_01: + case cJU_JPIMMED_5_01: + case cJU_JPIMMED_6_01: + case cJU_JPIMMED_7_01: +#endif + JU_RET_FOUND_IMM_01(Pjp); + + case cJU_JPIMMED_1_02: + case cJU_JPIMMED_1_03: +#ifdef JU_64BIT + case cJU_JPIMMED_1_04: + case cJU_JPIMMED_1_05: + case cJU_JPIMMED_1_06: + case cJU_JPIMMED_1_07: + case cJU_JPIMMED_2_02: + case cJU_JPIMMED_2_03: + case cJU_JPIMMED_3_02: +#endif + JU_RET_FOUND_IMM(Pjp, offset); + } + + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); // impossible? + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + + } // found *PIndex + +#endif // JUDYL + +// Dead end, no Index in LeafL / Immed for remaining digit(s) in *PIndex: +// +// Get the ideal location of Index, and if theres no next-left/right Index in +// the LeafL / Immed, shortcut and start backtracking one level up; ignore the +// current Pjp because it points to a LeafL / Immed with no next-left/right +// Index. + +#ifdef JUDYPREV + if ((offset = (~offset) - 1) < 0) // no next-left Index. 
+#else + if ((offset = (~offset)) >= pop1) // no next-right Index. +#endif + goto SM2Backtrack; + +// Theres a next-left/right Index in the current LeafL / Immed; shortcut by +// copying its digit(s) to *PIndex and returning it. +// +// Unfortunately this is pretty hairy, especially avoiding endian issues. +// +// The cJU_JPLEAF* cases are very similar to same-index-size cJU_JPIMMED* cases +// for *_02 and above, but must return differently, at least for JudyL, so +// spell them out separately here at the cost of a little redundant code for +// Judy1. + + switch (JU_JPTYPE(Pjp)) + { +#if (defined(JUDYL) || (! defined(JU_64BIT))) + case cJU_JPLEAF1: + + JU_SETDIGIT1(*PIndex, ((uint8_t *) Pjll)[offset]); + JU_RET_FOUND_LEAF1(Pjll, pop1, offset); +#endif + + case cJU_JPLEAF2: + + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(2))) + | ((uint16_t *) Pjll)[offset]; + JU_RET_FOUND_LEAF2(Pjll, pop1, offset); + + case cJU_JPLEAF3: + { + Word_t lsb; + JU_COPY3_PINDEX_TO_LONG(lsb, ((uint8_t *) Pjll) + (3 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(3))) | lsb; + JU_RET_FOUND_LEAF3(Pjll, pop1, offset); + } + +#ifdef JU_64BIT + case cJU_JPLEAF4: + + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(4))) + | ((uint32_t *) Pjll)[offset]; + JU_RET_FOUND_LEAF4(Pjll, pop1, offset); + + case cJU_JPLEAF5: + { + Word_t lsb; + JU_COPY5_PINDEX_TO_LONG(lsb, ((uint8_t *) Pjll) + (5 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(5))) | lsb; + JU_RET_FOUND_LEAF5(Pjll, pop1, offset); + } + + case cJU_JPLEAF6: + { + Word_t lsb; + JU_COPY6_PINDEX_TO_LONG(lsb, ((uint8_t *) Pjll) + (6 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(6))) | lsb; + JU_RET_FOUND_LEAF6(Pjll, pop1, offset); + } + + case cJU_JPLEAF7: + { + Word_t lsb; + JU_COPY7_PINDEX_TO_LONG(lsb, ((uint8_t *) Pjll) + (7 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(7))) | lsb; + JU_RET_FOUND_LEAF7(Pjll, pop1, offset); + } + +#endif // JU_64BIT + +#define SET_01(cState) JU_SETDIGITS(*PIndex, JU_JPDCDPOP0(Pjp), cState) + + case cJU_JPIMMED_1_01: SET_01(1); goto SM1Imm_01; + case cJU_JPIMMED_2_01: SET_01(2); goto SM1Imm_01; + case cJU_JPIMMED_3_01: SET_01(3); goto SM1Imm_01; +#ifdef JU_64BIT + case cJU_JPIMMED_4_01: SET_01(4); goto SM1Imm_01; + case cJU_JPIMMED_5_01: SET_01(5); goto SM1Imm_01; + case cJU_JPIMMED_6_01: SET_01(6); goto SM1Imm_01; + case cJU_JPIMMED_7_01: SET_01(7); goto SM1Imm_01; +#endif +SM1Imm_01: JU_RET_FOUND_IMM_01(Pjp); + +// Shorthand for where to find start of Index bytes array: + +#ifdef JUDY1 +#define PJI (Pjp->jp_1Index) +#else +#define PJI (Pjp->jp_LIndex) +#endif + + case cJU_JPIMMED_1_02: + case cJU_JPIMMED_1_03: +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_1_04: + case cJU_JPIMMED_1_05: + case cJU_JPIMMED_1_06: + case cJU_JPIMMED_1_07: +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_1_08: + case cJ1_JPIMMED_1_09: + case cJ1_JPIMMED_1_10: + case cJ1_JPIMMED_1_11: + case cJ1_JPIMMED_1_12: + case cJ1_JPIMMED_1_13: + case cJ1_JPIMMED_1_14: + case cJ1_JPIMMED_1_15: +#endif + JU_SETDIGIT1(*PIndex, ((uint8_t *) PJI)[offset]); + JU_RET_FOUND_IMM(Pjp, offset); + +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_2_02: + case cJU_JPIMMED_2_03: +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_2_04: + case cJ1_JPIMMED_2_05: + case cJ1_JPIMMED_2_06: + case cJ1_JPIMMED_2_07: +#endif +#if (defined(JUDY1) || defined(JU_64BIT)) + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(2))) + | ((uint16_t *) PJI)[offset]; + JU_RET_FOUND_IMM(Pjp, offset); +#endif + +#if 
(defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_3_02: +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_3_03: + case cJ1_JPIMMED_3_04: + case cJ1_JPIMMED_3_05: +#endif +#if (defined(JUDY1) || defined(JU_64BIT)) + { + Word_t lsb; + JU_COPY3_PINDEX_TO_LONG(lsb, ((uint8_t *) PJI) + (3 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(3))) | lsb; + JU_RET_FOUND_IMM(Pjp, offset); + } +#endif + +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_4_02: + case cJ1_JPIMMED_4_03: + + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(4))) + | ((uint32_t *) PJI)[offset]; + JU_RET_FOUND_IMM(Pjp, offset); + + case cJ1_JPIMMED_5_02: + case cJ1_JPIMMED_5_03: + { + Word_t lsb; + JU_COPY5_PINDEX_TO_LONG(lsb, ((uint8_t *) PJI) + (5 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(5))) | lsb; + JU_RET_FOUND_IMM(Pjp, offset); + } + + case cJ1_JPIMMED_6_02: + { + Word_t lsb; + JU_COPY6_PINDEX_TO_LONG(lsb, ((uint8_t *) PJI) + (6 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(6))) | lsb; + JU_RET_FOUND_IMM(Pjp, offset); + } + + case cJ1_JPIMMED_7_02: + { + Word_t lsb; + JU_COPY7_PINDEX_TO_LONG(lsb, ((uint8_t *) PJI) + (7 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(7))) | lsb; + JU_RET_FOUND_IMM(Pjp, offset); + } + +#endif // (JUDY1 && JU_64BIT) + + } // switch for not-found *PIndex + + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); // impossible? + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + + +// ---------------------------------------------------------------------------- +// BITMAP LEAF: +// +// Check Decode bytes, if any, in the current JP, then look in the leaf for +// *PIndex. + + case cJU_JPLEAF_B1: + { + Pjlb_t Pjlb; + CHECKDCD(1); + + Pjlb = P_JLB(Pjp->jp_Addr); + digit = JU_DIGITATSTATE(*PIndex, 1); + subexp = JU_SUBEXPL(digit); + bitposmask = JU_BITPOSMASKL(digit); + assert(subexp < cJU_NUMSUBEXPL); // falls in expected range. + +// *PIndex exists in LeafB1: + +// if (JU_BITMAPTESTL(Pjlb, digit)) // slower. + if (JU_JLB_BITMAP(Pjlb, subexp) & bitposmask) // faster. + { +#ifdef JUDYL // needs offset at this point: + offset = SEARCHBITMAPL(JU_JLB_BITMAP(Pjlb, subexp), digit, bitposmask); +#endif + JU_RET_FOUND_LEAF_B1(Pjlb, subexp, offset); +// == return((PPvoid_t) (P_JV(JL_JLB_PVALUE(Pjlb, subexp)) + (offset))); + } + +// Dead end, no Index in LeafB1 for remaining digit in *PIndex: +// +// If theres a next-left/right Index in the current LeafB1, which for +// Judy*Next() is true if any bits are set for higher Indexes, shortcut by +// returning it. Note: For Judy*Prev(), offset is set here to the correct +// value for the next-left JP. + + offset = SEARCHBITMAPL(JU_JLB_BITMAP(Pjlb, subexp), digit, + bitposmask); + // right range: + assert((offset >= -1) && (offset < (int) cJU_BITSPERSUBEXPL)); + +#ifdef JUDYPREV + if (offset >= 0) // next-left JP is in this subexpanse. + goto SM1LeafB1Findlimit; + + while (--subexp >= 0) // search next-left subexpanses. +#else + if (JU_JLB_BITMAP(Pjlb, subexp) & JU_MASKHIGHEREXC(bitposmask)) + { + ++offset; // next-left => next-right. + goto SM1LeafB1Findlimit; + } + + while (++subexp < cJU_NUMSUBEXPL) // search next-right subexps. +#endif + { + if (! JU_JLB_BITMAP(Pjlb, subexp)) continue; // empty subexp. 
+
+#ifdef JUDYPREV
+ offset = SEARCHBITMAPMAXL(JU_JLB_BITMAP(Pjlb, subexp));
+ // expected range:
+ assert((offset >= 0) && (offset < (int) cJU_BITSPERSUBEXPL));
+#else
+ offset = 0;
+#endif
+
+// Save the next-left/right Index's digit in *PIndex:
+
+SM1LeafB1Findlimit:
+ JU_BITMAPDIGITL(digit, subexp, JU_JLB_BITMAP(Pjlb, subexp), offset);
+ JU_SETDIGIT1(*PIndex, digit);
+ JU_RET_FOUND_LEAF_B1(Pjlb, subexp, offset);
+// == return((PPvoid_t) (P_JV(JL_JLB_PVALUE(Pjlb, subexp)) + (offset)));
+ }
+
+// There's no next-left/right Index in the LeafB1:
+//
+// Shortcut and start backtracking one level up; ignore the current Pjp because
+// it points to a LeafB1 with no next-left/right Index.
+
+ goto SM2Backtrack;
+
+ } // case cJU_JPLEAF_B1
+
+#ifdef JUDY1
+// ----------------------------------------------------------------------------
+// FULL POPULATION:
+//
+// If the Decode bytes match, *PIndex is found (without modification).
+
+ case cJ1_JPFULLPOPU1:
+
+ CHECKDCD(1);
+ JU_RET_FOUND_FULLPOPU1;
+#endif
+
+
+// ----------------------------------------------------------------------------
+// IMMEDIATE:
+
+#ifdef JUDYPREV
+#define SM1IMM_SETPOP1(cPop1)
+#else
+#define SM1IMM_SETPOP1(cPop1) pop1 = (cPop1)
+#endif
+
+#define SM1IMM(Func,cPop1) \
+ SM1IMM_SETPOP1(cPop1); \
+ offset = Func((Pjll_t) (PJI), cPop1, *PIndex); \
+ goto SM1LeafLImm
+
+// Special case for Pop1 = 1 Immediate JPs:
+//
+// If *PIndex is in the immediate, offset is 0, otherwise the binary NOT of the
+// offset where it belongs, 0 or 1, same as from the search functions.
+
+#ifdef JUDYPREV
+#define SM1IMM_01_SETPOP1
+#else
+#define SM1IMM_01_SETPOP1 pop1 = 1
+#endif
+
+#define SM1IMM_01 \
+ SM1IMM_01_SETPOP1; \
+ offset = ((JU_JPDCDPOP0(Pjp) < JU_TRIMTODCDSIZE(*PIndex)) ? ~1 : \
+ (JU_JPDCDPOP0(Pjp) == JU_TRIMTODCDSIZE(*PIndex)) ? 0 : \
+ ~0); \
+ goto SM1LeafLImm
+
+ case cJU_JPIMMED_1_01:
+ case cJU_JPIMMED_2_01:
+ case cJU_JPIMMED_3_01:
+#ifdef JU_64BIT
+ case cJU_JPIMMED_4_01:
+ case cJU_JPIMMED_5_01:
+ case cJU_JPIMMED_6_01:
+ case cJU_JPIMMED_7_01:
+#endif
+ SM1IMM_01;
+
+// TBD: Doug says it would be OK to have fewer calls and calculate arg 2, here
+// and in Judy*Count() also.
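+//
+// A non-compiled sketch of that calculation for the Immed1 group, assuming
+// (as IMM_MULTI() in the companion JudyPrevNextEmpty code does for its
+// argument) that the cJU_JPIMMED_1_02..cJU_JPIMMED_1_15 type values are
+// consecutive:
+//
+// ==  pop1   = JU_JPTYPE(Pjp) - cJU_JPIMMED_1_02 + 2;   // pop1 is 2..15.
+// ==  offset = j__udySearchLeaf1((Pjll_t) (PJI), pop1, *PIndex);
+// ==  goto SM1LeafLImm;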
+ + case cJU_JPIMMED_1_02: SM1IMM(j__udySearchLeaf1, 2); + case cJU_JPIMMED_1_03: SM1IMM(j__udySearchLeaf1, 3); +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_1_04: SM1IMM(j__udySearchLeaf1, 4); + case cJU_JPIMMED_1_05: SM1IMM(j__udySearchLeaf1, 5); + case cJU_JPIMMED_1_06: SM1IMM(j__udySearchLeaf1, 6); + case cJU_JPIMMED_1_07: SM1IMM(j__udySearchLeaf1, 7); +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_1_08: SM1IMM(j__udySearchLeaf1, 8); + case cJ1_JPIMMED_1_09: SM1IMM(j__udySearchLeaf1, 9); + case cJ1_JPIMMED_1_10: SM1IMM(j__udySearchLeaf1, 10); + case cJ1_JPIMMED_1_11: SM1IMM(j__udySearchLeaf1, 11); + case cJ1_JPIMMED_1_12: SM1IMM(j__udySearchLeaf1, 12); + case cJ1_JPIMMED_1_13: SM1IMM(j__udySearchLeaf1, 13); + case cJ1_JPIMMED_1_14: SM1IMM(j__udySearchLeaf1, 14); + case cJ1_JPIMMED_1_15: SM1IMM(j__udySearchLeaf1, 15); +#endif + +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_2_02: SM1IMM(j__udySearchLeaf2, 2); + case cJU_JPIMMED_2_03: SM1IMM(j__udySearchLeaf2, 3); +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_2_04: SM1IMM(j__udySearchLeaf2, 4); + case cJ1_JPIMMED_2_05: SM1IMM(j__udySearchLeaf2, 5); + case cJ1_JPIMMED_2_06: SM1IMM(j__udySearchLeaf2, 6); + case cJ1_JPIMMED_2_07: SM1IMM(j__udySearchLeaf2, 7); +#endif + +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_3_02: SM1IMM(j__udySearchLeaf3, 2); +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_3_03: SM1IMM(j__udySearchLeaf3, 3); + case cJ1_JPIMMED_3_04: SM1IMM(j__udySearchLeaf3, 4); + case cJ1_JPIMMED_3_05: SM1IMM(j__udySearchLeaf3, 5); + + case cJ1_JPIMMED_4_02: SM1IMM(j__udySearchLeaf4, 2); + case cJ1_JPIMMED_4_03: SM1IMM(j__udySearchLeaf4, 3); + + case cJ1_JPIMMED_5_02: SM1IMM(j__udySearchLeaf5, 2); + case cJ1_JPIMMED_5_03: SM1IMM(j__udySearchLeaf5, 3); + + case cJ1_JPIMMED_6_02: SM1IMM(j__udySearchLeaf6, 2); + + case cJ1_JPIMMED_7_02: SM1IMM(j__udySearchLeaf7, 2); +#endif + + +// ---------------------------------------------------------------------------- +// INVALID JP TYPE: + + default: JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + + } // SM1Get switch. + + /*NOTREACHED*/ + + +// ============================================================================ +// STATE MACHINE 2 -- BACKTRACK BRANCH TO PREVIOUS JP: +// +// Look for the next-left/right JP in a branch, backing up the history list as +// necessary. Upon finding a next-left/right JP, modify the corresponding +// digit in *PIndex before passing control to SM3Findlimit. +// +// Note: As described earlier, only branch JPs are expected here; other types +// fall into the default case. +// +// Note: If a found JP contains needed Dcd bytes, thats OK, theyre copied to +// *PIndex in SM3Findlimit. +// +// TBD: This code has a lot in common with similar code in the shortcut cases +// in SM1Get. Can combine this code somehow? +// +// ENTRY: List, possibly empty, of JPs and offsets in APjphist[] and +// Aoffhist[]; see earlier comments. +// +// EXIT: Execute JU_RET_NOTFOUND if no previous/next JP; otherwise jump to +// SM3Findlimit to resume a new but different downward search. + +SM2Backtrack: // come or return here for first/next sideways search. 
+ + HISTPOP(Pjp, offset); + + switch (JU_JPTYPE(Pjp)) + { + + +// ---------------------------------------------------------------------------- +// LINEAR BRANCH: + + case cJU_JPBRANCH_L2: state = 2; goto SM2BranchL; + case cJU_JPBRANCH_L3: state = 3; goto SM2BranchL; +#ifdef JU_64BIT + case cJU_JPBRANCH_L4: state = 4; goto SM2BranchL; + case cJU_JPBRANCH_L5: state = 5; goto SM2BranchL; + case cJU_JPBRANCH_L6: state = 6; goto SM2BranchL; + case cJU_JPBRANCH_L7: state = 7; goto SM2BranchL; +#endif + case cJU_JPBRANCH_L: state = cJU_ROOTSTATE; goto SM2BranchL; + +SM2BranchL: +#ifdef JUDYPREV + if (--offset < 0) goto SM2Backtrack; // no next-left JP in BranchL. +#endif + Pjbl = P_JBL(Pjp->jp_Addr); +#ifdef JUDYNEXT + if (++offset >= (Pjbl->jbl_NumJPs)) goto SM2Backtrack; + // no next-right JP in BranchL. +#endif + +// Theres a next-left/right JP in the current BranchL; save its digit in +// *PIndex and continue with SM3Findlimit: + + JU_SETDIGIT(*PIndex, Pjbl->jbl_Expanse[offset], state); + Pjp = (Pjbl->jbl_jp) + offset; + goto SM3Findlimit; + + +// ---------------------------------------------------------------------------- +// BITMAP BRANCH: + + case cJU_JPBRANCH_B2: state = 2; goto SM2BranchB; + case cJU_JPBRANCH_B3: state = 3; goto SM2BranchB; +#ifdef JU_64BIT + case cJU_JPBRANCH_B4: state = 4; goto SM2BranchB; + case cJU_JPBRANCH_B5: state = 5; goto SM2BranchB; + case cJU_JPBRANCH_B6: state = 6; goto SM2BranchB; + case cJU_JPBRANCH_B7: state = 7; goto SM2BranchB; +#endif + case cJU_JPBRANCH_B: state = cJU_ROOTSTATE; goto SM2BranchB; + +SM2BranchB: + Pjbb = P_JBB(Pjp->jp_Addr); + HISTPOPBOFF(subexp, offset, digit); // unpack values. + +// If theres a next-left/right JP in the current BranchB, which for +// Judy*Next() is true if any bits are set for higher Indexes, continue to +// SM3Findlimit: +// +// Note: offset is set to the JP previously traversed; go one to the +// left/right. + +#ifdef JUDYPREV + if (offset > 0) // next-left JP is in this subexpanse. + { + --offset; + goto SM2BranchBFindlimit; + } + + while (--subexp >= 0) // search next-left subexpanses. +#else + if (JU_JBB_BITMAP(Pjbb, subexp) + & JU_MASKHIGHEREXC(JU_BITPOSMASKB(digit))) + { + ++offset; // next-left => next-right. + goto SM2BranchBFindlimit; + } + + while (++subexp < cJU_NUMSUBEXPB) // search next-right subexps. +#endif + { + if (! JU_JBB_PJP(Pjbb, subexp)) continue; // empty subexpanse. 
+ +#ifdef JUDYPREV + offset = SEARCHBITMAPMAXB(JU_JBB_BITMAP(Pjbb, subexp)); + // expected range: + assert((offset >= 0) && (offset < cJU_BITSPERSUBEXPB)); +#else + offset = 0; +#endif + +// Save the next-left/right JPs digit in *PIndex: + +SM2BranchBFindlimit: + JU_BITMAPDIGITB(digit, subexp, JU_JBB_BITMAP(Pjbb, subexp), + offset); + JU_SETDIGIT(*PIndex, digit, state); + + if ((Pjp = P_JP(JU_JBB_PJP(Pjbb, subexp))) == (Pjp_t) NULL) + { + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + + Pjp += offset; + goto SM3Findlimit; + } + +// Theres no next-left/right JP in the BranchB: + + goto SM2Backtrack; + + +// ---------------------------------------------------------------------------- +// UNCOMPRESSED BRANCH: + + case cJU_JPBRANCH_U2: state = 2; goto SM2BranchU; + case cJU_JPBRANCH_U3: state = 3; goto SM2BranchU; +#ifdef JU_64BIT + case cJU_JPBRANCH_U4: state = 4; goto SM2BranchU; + case cJU_JPBRANCH_U5: state = 5; goto SM2BranchU; + case cJU_JPBRANCH_U6: state = 6; goto SM2BranchU; + case cJU_JPBRANCH_U7: state = 7; goto SM2BranchU; +#endif + case cJU_JPBRANCH_U: state = cJU_ROOTSTATE; goto SM2BranchU; + +SM2BranchU: + +// Search for a next-left/right JP in the current BranchU, and if one is found, +// save its digit in *PIndex and continue to SM3Findlimit: + + Pjbu = P_JBU(Pjp->jp_Addr); + digit = offset; + +#ifdef JUDYPREV + while (digit >= 1) + { + Pjp = (Pjbu->jbu_jp) + (--digit); +#else + while (digit < cJU_BRANCHUNUMJPS - 1) + { + Pjp = (Pjbu->jbu_jp) + (++digit); +#endif + if (JPNULL(JU_JPTYPE(Pjp))) continue; + + JU_SETDIGIT(*PIndex, digit, state); + goto SM3Findlimit; + } + +// Theres no next-left/right JP in the BranchU: + + goto SM2Backtrack; + + +// ---------------------------------------------------------------------------- +// INVALID JP TYPE: + + default: JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + + } // SM2Backtrack switch. + + /*NOTREACHED*/ + + +// ============================================================================ +// STATE MACHINE 3 -- FIND LIMIT JP/INDEX: +// +// Look for the highest/lowest (right/left-most) JP in each branch and the +// highest/lowest Index in a leaf or immediate, and return it. While +// traversing, modify appropriate digit(s) in *PIndex to reflect the path +// taken, including Dcd bytes in each JP (which could hold critical missing +// digits for skipped branches). +// +// ENTRY: Pjp set to a JP under which to find max/min JPs (if a branch JP) or +// a max/min Index and return (if a leaf or immediate JP). +// +// EXIT: Execute JU_RET_FOUND* upon reaching a leaf or immediate. Should be +// impossible to fail, unless the Judy array is corrupt. + +SM3Findlimit: // come or return here for first/next branch/leaf. + + switch (JU_JPTYPE(Pjp)) + { +// ---------------------------------------------------------------------------- +// LINEAR BRANCH: +// +// Simply use the highest/lowest (right/left-most) JP in the BranchL, but first +// copy the Dcd bytes to *PIndex if there are any (only if state < +// cJU_ROOTSTATE - 1). 
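+//
+// (Worked example of the digit patching used throughout SM3, assuming the
+// usual JU_SETDIGIT() definition in JudyPrivate.h: with state == 3,
+// JU_SETDIGIT(*PIndex, 0xAB, 3) replaces only byte 3 of the Index, counting
+// from 1 at the least significant byte, so 0x11223344 becomes 0x11AB3344.)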
+ + case cJU_JPBRANCH_L2: SM3PREPB_DCD(2, SM3BranchL); +#ifndef JU_64BIT + case cJU_JPBRANCH_L3: SM3PREPB( 3, SM3BranchL); +#else + case cJU_JPBRANCH_L3: SM3PREPB_DCD(3, SM3BranchL); + case cJU_JPBRANCH_L4: SM3PREPB_DCD(4, SM3BranchL); + case cJU_JPBRANCH_L5: SM3PREPB_DCD(5, SM3BranchL); + case cJU_JPBRANCH_L6: SM3PREPB_DCD(6, SM3BranchL); + case cJU_JPBRANCH_L7: SM3PREPB( 7, SM3BranchL); +#endif + case cJU_JPBRANCH_L: SM3PREPB( cJU_ROOTSTATE, SM3BranchL); + +SM3BranchL: + Pjbl = P_JBL(Pjp->jp_Addr); + +#ifdef JUDYPREV + if ((offset = (Pjbl->jbl_NumJPs) - 1) < 0) +#else + offset = 0; if ((Pjbl->jbl_NumJPs) == 0) +#endif + { + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + + JU_SETDIGIT(*PIndex, Pjbl->jbl_Expanse[offset], state); + Pjp = (Pjbl->jbl_jp) + offset; + goto SM3Findlimit; + + +// ---------------------------------------------------------------------------- +// BITMAP BRANCH: +// +// Look for the highest/lowest (right/left-most) non-null subexpanse, then use +// the highest/lowest JP in that subexpanse, but first copy Dcd bytes, if there +// are any (only if state < cJU_ROOTSTATE - 1), to *PIndex. + + case cJU_JPBRANCH_B2: SM3PREPB_DCD(2, SM3BranchB); +#ifndef JU_64BIT + case cJU_JPBRANCH_B3: SM3PREPB( 3, SM3BranchB); +#else + case cJU_JPBRANCH_B3: SM3PREPB_DCD(3, SM3BranchB); + case cJU_JPBRANCH_B4: SM3PREPB_DCD(4, SM3BranchB); + case cJU_JPBRANCH_B5: SM3PREPB_DCD(5, SM3BranchB); + case cJU_JPBRANCH_B6: SM3PREPB_DCD(6, SM3BranchB); + case cJU_JPBRANCH_B7: SM3PREPB( 7, SM3BranchB); +#endif + case cJU_JPBRANCH_B: SM3PREPB( cJU_ROOTSTATE, SM3BranchB); + +SM3BranchB: + Pjbb = P_JBB(Pjp->jp_Addr); +#ifdef JUDYPREV + subexp = cJU_NUMSUBEXPB; + + while (! (JU_JBB_BITMAP(Pjbb, --subexp))) // find non-empty subexp. + { + if (subexp <= 0) // wholly empty bitmap. + { + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + } + + offset = SEARCHBITMAPMAXB(JU_JBB_BITMAP(Pjbb, subexp)); + // expected range: + assert((offset >= 0) && (offset < cJU_BITSPERSUBEXPB)); +#else + subexp = -1; + + while (! (JU_JBB_BITMAP(Pjbb, ++subexp))) // find non-empty subexp. + { + if (subexp >= cJU_NUMSUBEXPB - 1) // didnt find one. + { + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + } + + offset = 0; +#endif + + JU_BITMAPDIGITB(digit, subexp, JU_JBB_BITMAP(Pjbb, subexp), offset); + JU_SETDIGIT(*PIndex, digit, state); + + if ((Pjp = P_JP(JU_JBB_PJP(Pjbb, subexp))) == (Pjp_t) NULL) + { + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + + Pjp += offset; + goto SM3Findlimit; + + +// ---------------------------------------------------------------------------- +// UNCOMPRESSED BRANCH: +// +// Look for the highest/lowest (right/left-most) non-null JP, and use it, but +// first copy Dcd bytes to *PIndex if there are any (only if state < +// cJU_ROOTSTATE - 1). 
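+//
+// (The DCD/no-DCD case split below mirrors SM3BranchL/SM3BranchB above: a JP
+// carries Dcd bytes only when state < cJU_ROOTSTATE - 1, so the top two
+// states -- 3 and 4 on 32-bit, 7 and 8 on 64-bit -- use SM3PREPB() without
+// the Dcd copy.)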
+ + case cJU_JPBRANCH_U2: SM3PREPB_DCD(2, SM3BranchU); +#ifndef JU_64BIT + case cJU_JPBRANCH_U3: SM3PREPB( 3, SM3BranchU); +#else + case cJU_JPBRANCH_U3: SM3PREPB_DCD(3, SM3BranchU); + case cJU_JPBRANCH_U4: SM3PREPB_DCD(4, SM3BranchU); + case cJU_JPBRANCH_U5: SM3PREPB_DCD(5, SM3BranchU); + case cJU_JPBRANCH_U6: SM3PREPB_DCD(6, SM3BranchU); + case cJU_JPBRANCH_U7: SM3PREPB( 7, SM3BranchU); +#endif + case cJU_JPBRANCH_U: SM3PREPB( cJU_ROOTSTATE, SM3BranchU); + +SM3BranchU: + Pjbu = P_JBU(Pjp->jp_Addr); +#ifdef JUDYPREV + digit = cJU_BRANCHUNUMJPS; + + while (digit >= 1) + { + Pjp = (Pjbu->jbu_jp) + (--digit); +#else + + for (digit = 0; digit < cJU_BRANCHUNUMJPS; ++digit) + { + Pjp = (Pjbu->jbu_jp) + digit; +#endif + if (JPNULL(JU_JPTYPE(Pjp))) continue; + + JU_SETDIGIT(*PIndex, digit, state); + goto SM3Findlimit; + } + +// No non-null JPs in BranchU: + + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + + +// ---------------------------------------------------------------------------- +// LINEAR LEAF: +// +// Simply use the highest/lowest (right/left-most) Index in the LeafL, but the +// details vary depending on leaf Index Size. First copy Dcd bytes, if there +// are any (only if state < cJU_ROOTSTATE - 1), to *PIndex. + +#define SM3LEAFLDCD(cState) \ + JU_SETDCD(*PIndex, Pjp, cState); \ + SM3LEAFLNODCD + +#ifdef JUDY1 +#define SM3LEAFL_SETPOP1 // not needed in any cases. +#else +#define SM3LEAFL_SETPOP1 pop1 = JU_JPLEAF_POP0(Pjp) + 1 +#endif + +#ifdef JUDYPREV +#define SM3LEAFLNODCD \ + Pjll = P_JLL(Pjp->jp_Addr); \ + SM3LEAFL_SETPOP1; \ + offset = JU_JPLEAF_POP0(Pjp); assert(offset >= 0) +#else +#define SM3LEAFLNODCD \ + Pjll = P_JLL(Pjp->jp_Addr); \ + SM3LEAFL_SETPOP1; \ + offset = 0; assert(JU_JPLEAF_POP0(Pjp) >= 0); +#endif + +#if (defined(JUDYL) || (! 
defined(JU_64BIT))) + case cJU_JPLEAF1: + + SM3LEAFLDCD(1); + JU_SETDIGIT1(*PIndex, ((uint8_t *) Pjll)[offset]); + JU_RET_FOUND_LEAF1(Pjll, pop1, offset); +#endif + + case cJU_JPLEAF2: + + SM3LEAFLDCD(2); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(2))) + | ((uint16_t *) Pjll)[offset]; + JU_RET_FOUND_LEAF2(Pjll, pop1, offset); + +#ifndef JU_64BIT + case cJU_JPLEAF3: + { + Word_t lsb; + SM3LEAFLNODCD; + JU_COPY3_PINDEX_TO_LONG(lsb, ((uint8_t *) Pjll) + (3 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(3))) | lsb; + JU_RET_FOUND_LEAF3(Pjll, pop1, offset); + } + +#else + case cJU_JPLEAF3: + { + Word_t lsb; + SM3LEAFLDCD(3); + JU_COPY3_PINDEX_TO_LONG(lsb, ((uint8_t *) Pjll) + (3 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(3))) | lsb; + JU_RET_FOUND_LEAF3(Pjll, pop1, offset); + } + + case cJU_JPLEAF4: + + SM3LEAFLDCD(4); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(4))) + | ((uint32_t *) Pjll)[offset]; + JU_RET_FOUND_LEAF4(Pjll, pop1, offset); + + case cJU_JPLEAF5: + { + Word_t lsb; + SM3LEAFLDCD(5); + JU_COPY5_PINDEX_TO_LONG(lsb, ((uint8_t *) Pjll) + (5 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(5))) | lsb; + JU_RET_FOUND_LEAF5(Pjll, pop1, offset); + } + + case cJU_JPLEAF6: + { + Word_t lsb; + SM3LEAFLDCD(6); + JU_COPY6_PINDEX_TO_LONG(lsb, ((uint8_t *) Pjll) + (6 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(6))) | lsb; + JU_RET_FOUND_LEAF6(Pjll, pop1, offset); + } + + case cJU_JPLEAF7: + { + Word_t lsb; + SM3LEAFLNODCD; + JU_COPY7_PINDEX_TO_LONG(lsb, ((uint8_t *) Pjll) + (7 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(7))) | lsb; + JU_RET_FOUND_LEAF7(Pjll, pop1, offset); + } +#endif + + +// ---------------------------------------------------------------------------- +// BITMAP LEAF: +// +// Look for the highest/lowest (right/left-most) non-null subexpanse, then use +// the highest/lowest Index in that subexpanse, but first copy Dcd bytes +// (always present since state 1 < cJU_ROOTSTATE) to *PIndex. + + case cJU_JPLEAF_B1: + { + Pjlb_t Pjlb; + + JU_SETDCD(*PIndex, Pjp, 1); + + Pjlb = P_JLB(Pjp->jp_Addr); +#ifdef JUDYPREV + subexp = cJU_NUMSUBEXPL; + + while (! JU_JLB_BITMAP(Pjlb, --subexp)) // find non-empty subexp. + { + if (subexp <= 0) // wholly empty bitmap. + { + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + } + +// TBD: Might it be faster to just use a variant of BITMAPDIGIT*() that yields +// the digit for the right-most Index with a bit set? + + offset = SEARCHBITMAPMAXL(JU_JLB_BITMAP(Pjlb, subexp)); + // expected range: + assert((offset >= 0) && (offset < cJU_BITSPERSUBEXPL)); +#else + subexp = -1; + + while (! JU_JLB_BITMAP(Pjlb, ++subexp)) // find non-empty subexp. + { + if (subexp >= cJU_NUMSUBEXPL - 1) // didnt find one. + { + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + } + } + + offset = 0; +#endif + + JU_BITMAPDIGITL(digit, subexp, JU_JLB_BITMAP(Pjlb, subexp), offset); + JU_SETDIGIT1(*PIndex, digit); + JU_RET_FOUND_LEAF_B1(Pjlb, subexp, offset); +// == return((PPvoid_t) (P_JV(JL_JLB_PVALUE(Pjlb, subexp)) + (offset))); + + } // case cJU_JPLEAF_B1 + +#ifdef JUDY1 +// ---------------------------------------------------------------------------- +// FULL POPULATION: +// +// Copy Dcd bytes to *PIndex (always present since state 1 < cJU_ROOTSTATE), +// then set the highest/lowest possible digit as the LSB in *PIndex. 
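+//
+// (Concretely: a bitmap leaf spans one byte of expanse, so assuming the
+// usual cJU_BITSPERBITMAP value of 256, Judy*Prev() sets the LSB to 0xff and
+// Judy*Next() sets it to 0x00 below.)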
+ + case cJ1_JPFULLPOPU1: + + JU_SETDCD( *PIndex, Pjp, 1); +#ifdef JUDYPREV + JU_SETDIGIT1(*PIndex, cJU_BITSPERBITMAP - 1); +#else + JU_SETDIGIT1(*PIndex, 0); +#endif + JU_RET_FOUND_FULLPOPU1; +#endif // JUDY1 + + +// ---------------------------------------------------------------------------- +// IMMEDIATE: +// +// Simply use the highest/lowest (right/left-most) Index in the Imm, but the +// details vary depending on leaf Index Size and pop1. Note: There are no Dcd +// bytes in an Immediate JP, but in a cJU_JPIMMED_*_01 JP, the field holds the +// least bytes of the immediate Index. + + case cJU_JPIMMED_1_01: SET_01(1); goto SM3Imm_01; + case cJU_JPIMMED_2_01: SET_01(2); goto SM3Imm_01; + case cJU_JPIMMED_3_01: SET_01(3); goto SM3Imm_01; +#ifdef JU_64BIT + case cJU_JPIMMED_4_01: SET_01(4); goto SM3Imm_01; + case cJU_JPIMMED_5_01: SET_01(5); goto SM3Imm_01; + case cJU_JPIMMED_6_01: SET_01(6); goto SM3Imm_01; + case cJU_JPIMMED_7_01: SET_01(7); goto SM3Imm_01; +#endif +SM3Imm_01: JU_RET_FOUND_IMM_01(Pjp); + +#ifdef JUDYPREV +#define SM3IMM_OFFSET(cPop1) (cPop1) - 1 // highest. +#else +#define SM3IMM_OFFSET(cPop1) 0 // lowest. +#endif + +#define SM3IMM(cPop1,Next) \ + offset = SM3IMM_OFFSET(cPop1); \ + goto Next + + case cJU_JPIMMED_1_02: SM3IMM( 2, SM3Imm1); + case cJU_JPIMMED_1_03: SM3IMM( 3, SM3Imm1); +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_1_04: SM3IMM( 4, SM3Imm1); + case cJU_JPIMMED_1_05: SM3IMM( 5, SM3Imm1); + case cJU_JPIMMED_1_06: SM3IMM( 6, SM3Imm1); + case cJU_JPIMMED_1_07: SM3IMM( 7, SM3Imm1); +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_1_08: SM3IMM( 8, SM3Imm1); + case cJ1_JPIMMED_1_09: SM3IMM( 9, SM3Imm1); + case cJ1_JPIMMED_1_10: SM3IMM(10, SM3Imm1); + case cJ1_JPIMMED_1_11: SM3IMM(11, SM3Imm1); + case cJ1_JPIMMED_1_12: SM3IMM(12, SM3Imm1); + case cJ1_JPIMMED_1_13: SM3IMM(13, SM3Imm1); + case cJ1_JPIMMED_1_14: SM3IMM(14, SM3Imm1); + case cJ1_JPIMMED_1_15: SM3IMM(15, SM3Imm1); +#endif + +SM3Imm1: JU_SETDIGIT1(*PIndex, ((uint8_t *) PJI)[offset]); + JU_RET_FOUND_IMM(Pjp, offset); + +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_2_02: SM3IMM(2, SM3Imm2); + case cJU_JPIMMED_2_03: SM3IMM(3, SM3Imm2); +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_2_04: SM3IMM(4, SM3Imm2); + case cJ1_JPIMMED_2_05: SM3IMM(5, SM3Imm2); + case cJ1_JPIMMED_2_06: SM3IMM(6, SM3Imm2); + case cJ1_JPIMMED_2_07: SM3IMM(7, SM3Imm2); +#endif + +#if (defined(JUDY1) || defined(JU_64BIT)) +SM3Imm2: *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(2))) + | ((uint16_t *) PJI)[offset]; + JU_RET_FOUND_IMM(Pjp, offset); +#endif + +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_3_02: SM3IMM(2, SM3Imm3); +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_3_03: SM3IMM(3, SM3Imm3); + case cJ1_JPIMMED_3_04: SM3IMM(4, SM3Imm3); + case cJ1_JPIMMED_3_05: SM3IMM(5, SM3Imm3); +#endif + +#if (defined(JUDY1) || defined(JU_64BIT)) +SM3Imm3: + { + Word_t lsb; + JU_COPY3_PINDEX_TO_LONG(lsb, ((uint8_t *) PJI) + (3 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(3))) | lsb; + JU_RET_FOUND_IMM(Pjp, offset); + } +#endif + +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_4_02: SM3IMM(2, SM3Imm4); + case cJ1_JPIMMED_4_03: SM3IMM(3, SM3Imm4); + +SM3Imm4: *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(4))) + | ((uint32_t *) PJI)[offset]; + JU_RET_FOUND_IMM(Pjp, offset); + + case cJ1_JPIMMED_5_02: SM3IMM(2, SM3Imm5); + case cJ1_JPIMMED_5_03: SM3IMM(3, SM3Imm5); + +SM3Imm5: + { + Word_t lsb; + JU_COPY5_PINDEX_TO_LONG(lsb, 
((uint8_t *) PJI) + (5 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(5))) | lsb; + JU_RET_FOUND_IMM(Pjp, offset); + } + + case cJ1_JPIMMED_6_02: SM3IMM(2, SM3Imm6); + +SM3Imm6: + { + Word_t lsb; + JU_COPY6_PINDEX_TO_LONG(lsb, ((uint8_t *) PJI) + (6 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(6))) | lsb; + JU_RET_FOUND_IMM(Pjp, offset); + } + + case cJ1_JPIMMED_7_02: SM3IMM(2, SM3Imm7); + +SM3Imm7: + { + Word_t lsb; + JU_COPY7_PINDEX_TO_LONG(lsb, ((uint8_t *) PJI) + (7 * offset)); + *PIndex = (*PIndex & (~JU_LEASTBYTESMASK(7))) | lsb; + JU_RET_FOUND_IMM(Pjp, offset); + } +#endif // (JUDY1 && JU_64BIT) + + +// ---------------------------------------------------------------------------- +// OTHER CASES: + + default: JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + + } // SM3Findlimit switch. + + /*NOTREACHED*/ + +} // Judy1Prev() / Judy1Next() / JudyLPrev() / JudyLNext() diff --git a/src/libnetdata/libjudy/src/JudyL/JudyLPrevEmpty.c b/src/libnetdata/libjudy/src/JudyL/JudyLPrevEmpty.c new file mode 100644 index 00000000..4da43565 --- /dev/null +++ b/src/libnetdata/libjudy/src/JudyL/JudyLPrevEmpty.c @@ -0,0 +1,1390 @@ +// Copyright (C) 2000 - 2002 Hewlett-Packard Company +// +// This program is free software; you can redistribute it and/or modify it +// under the term of the GNU Lesser General Public License as published by the +// Free Software Foundation; either version 2 of the License, or (at your +// option) any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License +// for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this program; if not, write to the Free Software Foundation, +// Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// _________________ + +// @(#) $Revision: 4.32 $ $Source: /judy/src/JudyCommon/JudyPrevNextEmpty.c $ +// +// Judy*PrevEmpty() and Judy*NextEmpty() functions for Judy1 and JudyL. +// Compile with one of -DJUDY1 or -DJUDYL. +// +// Compile with -DJUDYNEXT for the Judy*NextEmpty() function; otherwise +// defaults to Judy*PrevEmpty(). +// +// Compile with -DTRACEJPSE to trace JP traversals. +// +// This file is separate from JudyPrevNext.c because it differs too greatly for +// ifdefs. This might be a bit surprising, but there are two reasons: +// +// - First, down in the details, searching for an empty index (SearchEmpty) is +// remarkably asymmetric with searching for a valid index (SearchValid), +// mainly with respect to: No return of a value area for JudyL; partially- +// full versus totally-full JPs; and handling of narrow pointers. +// +// - Second, we chose to implement SearchEmpty without a backtrack stack or +// backtrack engine, partly as an experiment, and partly because we think +// restarting from the top of the tree is less likely for SearchEmpty than +// for SearchValid, because empty indexes are more likely than valid indexes. +// +// A word about naming: A prior version of this feature (see 4.13) was named +// Judy*Free(), but there were concerns about that being read as a verb rather +// than an adjective. After prolonged debate and based on user input, we +// changed "Free" to "Empty". + +#if (! (defined(JUDY1) || defined(JUDYL))) +#error: One of -DJUDY1 or -DJUDYL must be specified. 
+#endif + +#ifndef JUDYNEXT +#ifndef JUDYPREV +#define JUDYPREV 1 // neither set => use default. +#endif +#endif + +#ifdef JUDY1 +#include "Judy1.h" +#else +#include "JudyL.h" +#endif + +#include "JudyPrivate1L.h" + +#ifdef TRACEJPSE +#include "JudyPrintJP.c" +#endif + + +// **************************************************************************** +// J U D Y 1 P R E V E M P T Y +// J U D Y 1 N E X T E M P T Y +// J U D Y L P R E V E M P T Y +// J U D Y L N E X T E M P T Y +// +// See the manual entry for the API. +// +// OVERVIEW OF Judy*PrevEmpty() / Judy*NextEmpty(): +// +// See also for comparison the equivalent comments in JudyPrevNext.c. +// +// Take the callers *PIndex and subtract/add 1, but watch out for +// underflow/overflow, which means "no previous/next empty index found." Use a +// reentrant switch statement (state machine, see SMGetRestart and +// SMGetContinue) to decode Index, starting with the JRP (PArray), through a +// JPM and branches, if any, down to an immediate or a leaf. Look for Index in +// that immediate or leaf, and if not found (invalid index), return success +// (Index is empty). +// +// This search can result in a dead end where taking a different path is +// required. There are four kinds of dead ends: +// +// BRANCH PRIMARY dead end: Encountering a fully-populated JP for the +// appropriate digit in Index. Search sideways in the branch for the +// previous/next absent/null/non-full JP, and if one is found, set Index to the +// highest/lowest index possible in that JPs expanse. Then if the JP is an +// absent or null JP, return success; otherwise for a non-full JP, traverse +// through the partially populated JP. +// +// BRANCH SECONDARY dead end: Reaching the end of a branch during a sideways +// search after a branch primary dead end. Set Index to the lowest/highest +// index possible in the whole branchs expanse (one higher/lower than the +// previous/next branchs expanse), then restart at the top of the tree, which +// includes pre-decrementing/incrementing Index (again) and watching for +// underflow/overflow (again). +// +// LEAF PRIMARY dead end: Finding a valid (non-empty) index in an immediate or +// leaf matching Index. Search sideways in the immediate/leaf for the +// previous/next empty index; if found, set *PIndex to match and return success. +// +// LEAF SECONDARY dead end: Reaching the end of an immediate or leaf during a +// sideways search after a leaf primary dead end. Just as for a branch +// secondary dead end, restart at the top of the tree with Index set to the +// lowest/highest index possible in the whole immediate/leafs expanse. +// TBD: If leaf secondary dead end occurs, could shortcut and treat it as a +// branch primary dead end; but this would require remembering the parent +// branchs type and offset (a "one-deep stack"), and also wrestling with +// narrow pointers, at least for leaves (but not for immediates). +// +// Note some ASYMMETRIES between SearchValid and SearchEmpty: +// +// - The SearchValid code, upon descending through a narrow pointer, if Index +// is outside the expanse of the subsidiary node (effectively a secondary +// dead end), must decide whether to backtrack or findlimit. But the +// SearchEmpty code simply returns success (Index is empty). +// +// - Similarly, the SearchValid code, upon finding no previous/next index in +// the expanse of a narrow pointer (again, a secondary dead end), can simply +// start to backtrack at the parent JP. 
But the SearchEmpty code would have +// to first determine whether or not the parent JPs narrow expanse contains +// a previous/next empty index outside the subexpanse. Rather than keeping a +// parent state stack and backtracking this way, upon a secondary dead end, +// the SearchEmpty code simply restarts at the top of the tree, whether or +// not a narrow pointer is involved. Again, see the equivalent comments in +// JudyPrevNext.c for comparison. +// +// This function is written iteratively for speed, rather than recursively. +// +// TBD: Wed like to enhance this function to make successive searches faster. +// This would require saving some previous state, including the previous Index +// returned, and in which leaf it was found. If the next call is for the same +// Index and the array has not been modified, start at the same leaf. This +// should be much easier to implement since this is iterative rather than +// recursive code. + +#ifdef JUDY1 +#ifdef JUDYPREV +FUNCTION int Judy1PrevEmpty +#else +FUNCTION int Judy1NextEmpty +#endif +#else +#ifdef JUDYPREV +FUNCTION int JudyLPrevEmpty +#else +FUNCTION int JudyLNextEmpty +#endif +#endif + ( + Pcvoid_t PArray, // Judy array to search. + Word_t * PIndex, // starting point and result. + PJError_t PJError // optional, for returning error info. + ) +{ + Word_t Index; // fast copy, in a register. + Pjp_t Pjp; // current JP. + Pjbl_t Pjbl; // Pjp->jp_Addr masked and cast to types: + Pjbb_t Pjbb; + Pjbu_t Pjbu; + Pjlb_t Pjlb; + PWord_t Pword; // alternate name for use by GET* macros. + + Word_t digit; // next digit to decode from Index. + Word_t digits; // current state in SM = digits left to decode. + Word_t pop0; // in a leaf. + Word_t pop0mask; // precalculated to avoid variable shifts. + long offset; // within a branch or leaf (can be large). + int subexp; // subexpanse in a bitmap branch. + BITMAPB_t bitposmaskB; // bit in bitmap for bitmap branch. + BITMAPL_t bitposmaskL; // bit in bitmap for bitmap leaf. + Word_t possfullJP1; // JP types for possibly full subexpanses: + Word_t possfullJP2; + Word_t possfullJP3; + + +// ---------------------------------------------------------------------------- +// M A C R O S +// +// These are intended to make the code a bit more readable and less redundant. + + +// CHECK FOR NULL JP: +// +// TBD: In principle this can be reduced (here and in other *.c files) to just +// the latter clause since no Type should ever be below cJU_JPNULL1, but in +// fact some root pointer types can be lower, so for safety do both checks. + +#define JPNULL(Type) (((Type) >= cJU_JPNULL1) && ((Type) <= cJU_JPNULLMAX)) + + +// CHECK FOR A FULL JP: +// +// Given a JP, indicate if it is fully populated. Use digits, pop0mask, and +// possfullJP1..3 in the context. +// +// This is a difficult problem because it requires checking the Pop0 bits for +// all-ones, but the number of bytes depends on the JP type, which is not +// directly related to the parent branchs type or level -- the JPs child +// could be under a narrow pointer (hence not full). The simple answer +// requires switching on or otherwise calculating the JP type, which could be +// slow. Instead, in SMPREPB* precalculate pop0mask and also record in +// possfullJP1..3 the child JP (branch) types that could possibly be full (one +// level down), and use them here. For level-2 branches (with digits == 2), +// the test for a full child depends on Judy1/JudyL. +// +// Note: This cannot be applied to the JP in a JPM because it doesnt have +// enough pop0 digits. 
+// +// TBD: JPFULL_BRANCH diligently checks for BranchL or BranchB, where neither +// of those can ever be full as it turns out. Could just check for a BranchU +// at the right level. Also, pop0mask might be overkill, its not used much, +// so perhaps just call cJU_POP0MASK(digits - 1) here? +// +// First, JPFULL_BRANCH checks for a full expanse for a JP whose child can be a +// branch, that is, a JP in a branch at level 3 or higher: + +#define JPFULL_BRANCH(Pjp) \ + ((((JU_JPDCDPOP0(Pjp) ^ cJU_ALLONES) & pop0mask) == 0) \ + && ((JU_JPTYPE(Pjp) == possfullJP1) \ + || (JU_JPTYPE(Pjp) == possfullJP2) \ + || (JU_JPTYPE(Pjp) == possfullJP3))) + +#ifdef JUDY1 +#define JPFULL(Pjp) \ + ((digits == 2) ? \ + (JU_JPTYPE(Pjp) == cJ1_JPFULLPOPU1) : JPFULL_BRANCH(Pjp)) +#else +#define JPFULL(Pjp) \ + ((digits == 2) ? \ + (JU_JPTYPE(Pjp) == cJU_JPLEAF_B1) \ + && (((JU_JPDCDPOP0(Pjp) & cJU_POP0MASK(1)) == cJU_POP0MASK(1))) : \ + JPFULL_BRANCH(Pjp)) +#endif + + +// RETURN SUCCESS: +// +// This hides the need to set *PIndex back to the local value of Index -- use a +// local value for faster operation. Note that the callers *PIndex is ALWAYS +// modified upon success, at least decremented/incremented. + +#define RET_SUCCESS { *PIndex = Index; return(1); } + + +// RETURN A CORRUPTION: + +#define RET_CORRUPT { JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); return(JERRI); } + + +// SEARCH A BITMAP BRANCH: +// +// This is a weak analog of j__udySearchLeaf*() for bitmap branches. Return +// the actual or next-left position, base 0, of Digit in a BITMAPB_t bitmap +// (subexpanse of a full bitmap), also given a Bitposmask for Digit. The +// position is the offset within the set bits. +// +// Unlike j__udySearchLeaf*(), the offset is not returned bit-complemented if +// Digits bit is unset, because the caller can check the bitmap themselves to +// determine that. Also, if Digits bit is unset, the returned offset is to +// the next-left JP or index (including -1), not to the "ideal" position for +// the index = next-right JP or index. +// +// Shortcut and skip calling j__udyCountBitsB() if the bitmap is full, in which +// case (Digit % cJU_BITSPERSUBEXPB) itself is the base-0 offset. + +#define SEARCHBITMAPB(Bitmap,Digit,Bitposmask) \ + (((Bitmap) == cJU_FULLBITMAPB) ? (Digit % cJU_BITSPERSUBEXPB) : \ + j__udyCountBitsB((Bitmap) & JU_MASKLOWERINC(Bitposmask)) - 1) + +#ifdef JUDYPREV +// Equivalent to search for the highest offset in Bitmap, that is, one less +// than the number of bits set: + +#define SEARCHBITMAPMAXB(Bitmap) \ + (((Bitmap) == cJU_FULLBITMAPB) ? cJU_BITSPERSUBEXPB - 1 : \ + j__udyCountBitsB(Bitmap) - 1) +#endif + + +// CHECK DECODE BYTES: +// +// Check Decode bytes in a JP against the equivalent portion of Index. If they +// dont match, Index is outside the subexpanse of a narrow pointer, hence is +// empty. + +#define CHECKDCD(cDigits) \ + if (JU_DCDNOTMATCHINDEX(Index, Pjp, cDigits)) RET_SUCCESS + + +// REVISE REMAINDER OF INDEX: +// +// Put one digit in place in Index and clear/set the lower digits, if any, so +// the resulting Index is at the start/end of an expanse, or just clear/set the +// least digits. +// +// Actually, to make simple use of JU_LEASTBYTESMASK, first clear/set all least +// digits of Index including the digit to be overridden, then set the value of +// that one digit. If Digits == 1 the first operation is redundant, but either +// very fast or even removed by the optimizer. 
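+
+// For example (an illustrative note, not part of the original source), on a
+// 32-bit build the macros defined just below behave as follows:
+//
+//      Index = 0x12345678;
+//      SETLEASTDIGITS_D(  0xAB, 3);    // Index is now 0x12ABFFFF
+//      CLEARLEASTDIGITS_D(0xAB, 3);    // Index is now 0x12AB0000
+//
+// that is, the least 3 bytes are first set/cleared wholesale, then byte 3
+// (1-based from the LSB) is overwritten with the digit 0xAB.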
+ +#define CLEARLEASTDIGITS(Digits) Index &= ~JU_LEASTBYTESMASK(Digits) +#define SETLEASTDIGITS( Digits) Index |= JU_LEASTBYTESMASK(Digits) + +#define CLEARLEASTDIGITS_D(Digit,Digits) \ + { \ + CLEARLEASTDIGITS(Digits); \ + JU_SETDIGIT(Index, Digit, Digits); \ + } + +#define SETLEASTDIGITS_D(Digit,Digits) \ + { \ + SETLEASTDIGITS(Digits); \ + JU_SETDIGIT(Index, Digit, Digits); \ + } + + +// SET REMAINDER OF INDEX AND THEN RETURN OR CONTINUE: + +#define SET_AND_RETURN(OpLeastDigits,Digit,Digits) \ + { \ + OpLeastDigits(Digit, Digits); \ + RET_SUCCESS; \ + } + +#define SET_AND_CONTINUE(OpLeastDigits,Digit,Digits) \ + { \ + OpLeastDigits(Digit, Digits); \ + goto SMGetContinue; \ + } + + +// PREPARE TO HANDLE A LEAFW OR JP BRANCH IN THE STATE MACHINE: +// +// Extract a state-dependent digit from Index in a "constant" way, then jump to +// common code for multiple cases. +// +// TBD: Should this macro do more, such as preparing variable-shift masks for +// use in CLEARLEASTDIGITS and SETLEASTDIGITS? + +#define SMPREPB(cDigits,Next,PossFullJP1,PossFullJP2,PossFullJP3) \ + digits = (cDigits); \ + digit = JU_DIGITATSTATE(Index, cDigits); \ + pop0mask = cJU_POP0MASK((cDigits) - 1); /* for branchs JPs */ \ + possfullJP1 = (PossFullJP1); \ + possfullJP2 = (PossFullJP2); \ + possfullJP3 = (PossFullJP3); \ + goto Next + +// Variations for specific-level branches and for shorthands: +// +// Note: SMPREPB2 need not initialize possfullJP* because JPFULL does not use +// them for digits == 2, but gcc -Wall isnt quite smart enough to see this, so +// waste a bit of time and space to get rid of the warning: + +#define SMPREPB2(Next) \ + digits = 2; \ + digit = JU_DIGITATSTATE(Index, 2); \ + pop0mask = cJU_POP0MASK(1); /* for branchs JPs */ \ + possfullJP1 = possfullJP2 = possfullJP3 = 0; \ + goto Next + +#define SMPREPB3(Next) SMPREPB(3, Next, cJU_JPBRANCH_L2, \ + cJU_JPBRANCH_B2, \ + cJU_JPBRANCH_U2) +#ifndef JU_64BIT +#define SMPREPBL(Next) SMPREPB(cJU_ROOTSTATE, Next, cJU_JPBRANCH_L3, \ + cJU_JPBRANCH_B3, \ + cJU_JPBRANCH_U3) +#else +#define SMPREPB4(Next) SMPREPB(4, Next, cJU_JPBRANCH_L3, \ + cJU_JPBRANCH_B3, \ + cJU_JPBRANCH_U3) +#define SMPREPB5(Next) SMPREPB(5, Next, cJU_JPBRANCH_L4, \ + cJU_JPBRANCH_B4, \ + cJU_JPBRANCH_U4) +#define SMPREPB6(Next) SMPREPB(6, Next, cJU_JPBRANCH_L5, \ + cJU_JPBRANCH_B5, \ + cJU_JPBRANCH_U5) +#define SMPREPB7(Next) SMPREPB(7, Next, cJU_JPBRANCH_L6, \ + cJU_JPBRANCH_B6, \ + cJU_JPBRANCH_U6) +#define SMPREPBL(Next) SMPREPB(cJU_ROOTSTATE, Next, cJU_JPBRANCH_L7, \ + cJU_JPBRANCH_B7, \ + cJU_JPBRANCH_U7) +#endif + + +// RESTART AFTER SECONDARY DEAD END: +// +// Set Index to the first/last index in the branch or leaf subexpanse and start +// over at the top of the tree. + +#ifdef JUDYPREV +#define SMRESTART(Digits) { CLEARLEASTDIGITS(Digits); goto SMGetRestart; } +#else +#define SMRESTART(Digits) { SETLEASTDIGITS( Digits); goto SMGetRestart; } +#endif + + +// CHECK EDGE OF LEAFS EXPANSE: +// +// Given the LSBs of the lowest/highest valid index in a leaf (or equivalently +// in an immediate JP), the level (index size) of the leaf, and the full index +// to return (as Index in the context) already set to the full index matching +// the lowest/highest one, determine if there is an empty index in the leafs +// expanse below/above the lowest/highest index, which is true if the +// lowest/highest index is not at the "edge" of the leafs expanse based on its +// LSBs. If so, return Index decremented/incremented; otherwise restart at the +// top of the tree. 
+// +// Note: In many cases Index is already at the right spot and calling +// SMRESTART instead of just going directly to SMGetRestart is a bit of +// overkill. +// +// Note: Variable shift occurs if Digits is not a constant. + +#ifdef JUDYPREV +#define LEAF_EDGE(MinIndex,Digits) \ + { \ + if (MinIndex) { --Index; RET_SUCCESS; } \ + SMRESTART(Digits); \ + } +#else +#define LEAF_EDGE(MaxIndex,Digits) \ + { \ + if ((MaxIndex) != JU_LEASTBYTES(cJU_ALLONES, Digits)) \ + { ++Index; RET_SUCCESS; } \ + SMRESTART(Digits); \ + } +#endif + +// Same as above except Index is not already set to match the lowest/highest +// index, so do that before decrementing/incrementing it: + +#ifdef JUDYPREV +#define LEAF_EDGE_SET(MinIndex,Digits) \ + { \ + if (MinIndex) \ + { JU_SETDIGITS(Index, MinIndex, Digits); --Index; RET_SUCCESS; } \ + SMRESTART(Digits); \ + } +#else +#define LEAF_EDGE_SET(MaxIndex,Digits) \ + { \ + if ((MaxIndex) != JU_LEASTBYTES(cJU_ALLONES, Digits)) \ + { JU_SETDIGITS(Index, MaxIndex, Digits); ++Index; RET_SUCCESS; } \ + SMRESTART(Digits); \ + } +#endif + + +// FIND A HOLE (EMPTY INDEX) IN AN IMMEDIATE OR LEAF: +// +// Given an index location in a leaf (or equivalently an immediate JP) known to +// contain a usable hole (an empty index less/greater than Index), and the LSBs +// of a minimum/maximum index to locate, find the previous/next empty index and +// return it. +// +// Note: "Even" index sizes (1,2,4[,8] bytes) have corresponding native C +// types; "odd" index sizes dont, but they are not represented here because +// they are handled completely differently; see elsewhere. + +#ifdef JUDYPREV + +#define LEAF_HOLE_EVEN(cDigits,Pjll,IndexLSB) \ + { \ + while (*(Pjll) > (IndexLSB)) --(Pjll); /* too high */ \ + if (*(Pjll) < (IndexLSB)) RET_SUCCESS /* Index is empty */ \ + while (*(--(Pjll)) == --(IndexLSB)) /* null, find a hole */;\ + JU_SETDIGITS(Index, IndexLSB, cDigits); \ + RET_SUCCESS; \ + } +#else +#define LEAF_HOLE_EVEN(cDigits,Pjll,IndexLSB) \ + { \ + while (*(Pjll) < (IndexLSB)) ++(Pjll); /* too low */ \ + if (*(Pjll) > (IndexLSB)) RET_SUCCESS /* Index is empty */ \ + while (*(++(Pjll)) == ++(IndexLSB)) /* null, find a hole */;\ + JU_SETDIGITS(Index, IndexLSB, cDigits); \ + RET_SUCCESS; \ + } +#endif + + +// SEARCH FOR AN EMPTY INDEX IN AN IMMEDIATE OR LEAF: +// +// Given a pointer to the first index in a leaf (or equivalently an immediate +// JP), the population of the leaf, and a first empty Index to find (inclusive, +// as Index in the context), where Index is known to fall within the expanse of +// the leaf to search, efficiently find the previous/next empty index in the +// leaf, if any. For simplicity the following overview is stated in terms of +// Judy*NextEmpty() only, but the same concepts apply symmetrically for +// Judy*PrevEmpty(). Also, in each case the comparisons are for the LSBs of +// Index and leaf indexes, according to the leafs level. +// +// 1. If Index is GREATER than the last (highest) index in the leaf +// (maxindex), return success, Index is empty. (Remember, Index is known +// to be in the leafs expanse.) +// +// 2. If Index is EQUAL to maxindex: If maxindex is not at the edge of the +// leafs expanse, increment Index and return success, there is an empty +// Index one higher than any in the leaf; otherwise restart with Index +// reset to the upper edge of the leafs expanse. Note: This might cause +// an extra cache line fill, but this is OK for repeatedly-called search +// code, and it saves CPU time. +// +// 3. 
If Index is LESS than maxindex, check for "dense to end of leaf": +// Subtract Index from maxindex, and back up that many slots in the leaf. +// If the resulting offset is not before the start of the leaf then compare +// the index at this offset (baseindex) with Index: +// +// 3a. If GREATER, the leaf must be corrupt, since indexes are sorted and +// there are no duplicates. +// +// 3b. If EQUAL, the leaf is "dense" from Index to maxindex, meaning there is +// no reason to search it. "Slide right" to the high end of the leaf +// (modify Index to maxindex) and continue with step 2 above. +// +// 3c. If LESS, continue with step 4. +// +// 4. If the offset based on maxindex minus Index falls BEFORE the start of +// the leaf, or if, per 3c above, baseindex is LESS than Index, the leaf is +// guaranteed "not dense to the end" and a usable empty Index must exist. +// This supports a more efficient search loop. Start at the FIRST index in +// the leaf, or one BEYOND baseindex, respectively, and search the leaf as +// follows, comparing each current index (currindex) with Index: +// +// 4a. If LESS, keep going to next index. Note: This is certain to terminate +// because maxindex is known to be greater than Index, hence the loop can +// be small and fast. +// +// 4b. If EQUAL, loop and increment Index until finding currindex greater than +// Index, and return success with the modified Index. +// +// 4c. If GREATER, return success, Index (unmodified) is empty. +// +// Note: These are macros rather than functions for speed. + +#ifdef JUDYPREV + +#define JSLE_EVEN(Addr,Pop0,cDigits,LeafType) \ + { \ + LeafType * PjllLSB = (LeafType *) (Addr); \ + LeafType IndexLSB = Index; /* auto-masking */ \ + \ + /* Index before or at start of leaf: */ \ + \ + if (*PjllLSB >= IndexLSB) /* no need to search */ \ + { \ + if (*PjllLSB > IndexLSB) RET_SUCCESS; /* Index empty */ \ + LEAF_EDGE(*PjllLSB, cDigits); \ + } \ + \ + /* Index in or after leaf: */ \ + \ + offset = IndexLSB - *PjllLSB; /* tentative offset */ \ + if (offset <= (Pop0)) /* can check density */ \ + { \ + PjllLSB += offset; /* move to slot */ \ + \ + if (*PjllLSB <= IndexLSB) /* dense or corrupt */ \ + { \ + if (*PjllLSB == IndexLSB) /* dense, check edge */ \ + LEAF_EDGE_SET(PjllLSB[-offset], cDigits); \ + RET_CORRUPT; \ + } \ + --PjllLSB; /* not dense, start at previous */ \ + } \ + else PjllLSB = ((LeafType *) (Addr)) + (Pop0); /* start at max */ \ + \ + LEAF_HOLE_EVEN(cDigits, PjllLSB, IndexLSB); \ + } + +// JSLE_ODD is completely different from JSLE_EVEN because its important to +// minimize copying odd indexes to compare them (see 4.14). Furthermore, a +// very complex version (4.17, but abandoned before fully debugged) that +// avoided calling j__udySearchLeaf*() ran twice as fast as 4.14, but still +// half as fast as SearchValid. Doug suggested that to minimize complexity and +// share common code we should use j__udySearchLeaf*() for the initial search +// to establish if Index is empty, which should be common. If Index is valid +// in a leaf or immediate indexes, odds are good that an empty Index is nearby, +// so for simplicity just use a *COPY* function to linearly search the +// remainder. +// +// TBD: Pathological case? Average performance should be good, but worst-case +// might suffer. When Search says the initial Index is valid, so a linear +// copy-and-compare is begun, if the caller builds fairly large leaves with +// dense clusters AND frequently does a SearchEmpty at one end of such a +// cluster, performance wont be very good. 
Might a dense-check help? This +// means checking offset against the index at offset, and then against the +// first/last index in the leaf. We doubt the pathological case will appear +// much in real applications because they will probably alternate SearchValid +// and SearchEmpty calls. + +#define JSLE_ODD(cDigits,Pjll,Pop0,Search,Copy) \ + { \ + Word_t IndexLSB; /* least bytes only */ \ + Word_t IndexFound; /* in leaf */ \ + \ + if ((offset = Search(Pjll, (Pop0) + 1, Index)) < 0) \ + RET_SUCCESS; /* Index is empty */ \ + \ + IndexLSB = JU_LEASTBYTES(Index, cDigits); \ + offset *= (cDigits); \ + \ + while ((offset -= (cDigits)) >= 0) \ + { /* skip until empty or start */ \ + Copy(IndexFound, ((uint8_t *) (Pjll)) + offset); \ + if (IndexFound != (--IndexLSB)) /* found an empty */ \ + { JU_SETDIGITS(Index, IndexLSB, cDigits); RET_SUCCESS; }\ + } \ + LEAF_EDGE_SET(IndexLSB, cDigits); \ + } + +#else // JUDYNEXT + +#define JSLE_EVEN(Addr,Pop0,cDigits,LeafType) \ + { \ + LeafType * PjllLSB = ((LeafType *) (Addr)) + (Pop0); \ + LeafType IndexLSB = Index; /* auto-masking */ \ + \ + /* Index at or after end of leaf: */ \ + \ + if (*PjllLSB <= IndexLSB) /* no need to search */ \ + { \ + if (*PjllLSB < IndexLSB) RET_SUCCESS; /* Index empty */\ + LEAF_EDGE(*PjllLSB, cDigits); \ + } \ + \ + /* Index before or in leaf: */ \ + \ + offset = *PjllLSB - IndexLSB; /* tentative offset */ \ + if (offset <= (Pop0)) /* can check density */ \ + { \ + PjllLSB -= offset; /* move to slot */ \ + \ + if (*PjllLSB >= IndexLSB) /* dense or corrupt */ \ + { \ + if (*PjllLSB == IndexLSB) /* dense, check edge */ \ + LEAF_EDGE_SET(PjllLSB[offset], cDigits); \ + RET_CORRUPT; \ + } \ + ++PjllLSB; /* not dense, start at next */ \ + } \ + else PjllLSB = (LeafType *) (Addr); /* start at minimum */ \ + \ + LEAF_HOLE_EVEN(cDigits, PjllLSB, IndexLSB); \ + } + +#define JSLE_ODD(cDigits,Pjll,Pop0,Search,Copy) \ + { \ + Word_t IndexLSB; /* least bytes only */ \ + Word_t IndexFound; /* in leaf */ \ + int offsetmax; /* in bytes */ \ + \ + if ((offset = Search(Pjll, (Pop0) + 1, Index)) < 0) \ + RET_SUCCESS; /* Index is empty */ \ + \ + IndexLSB = JU_LEASTBYTES(Index, cDigits); \ + offset *= (cDigits); \ + offsetmax = (Pop0) * (cDigits); /* single multiply */ \ + \ + while ((offset += (cDigits)) <= offsetmax) \ + { /* skip until empty or end */ \ + Copy(IndexFound, ((uint8_t *) (Pjll)) + offset); \ + if (IndexFound != (++IndexLSB)) /* found an empty */ \ + { JU_SETDIGITS(Index, IndexLSB, cDigits); RET_SUCCESS; } \ + } \ + LEAF_EDGE_SET(IndexLSB, cDigits); \ + } + +#endif // JUDYNEXT + +// Note: Immediate indexes never fill a single index group, so for odd index +// sizes, save time by calling JSLE_ODD_IMM instead of JSLE_ODD. 
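+
+// The guarded-out sketch below (illustrative only, not part of the original
+// source) restates the SearchEmpty idea from steps 1-4 above for a sorted,
+// duplicate-free uint8_t leaf, deliberately without the density shortcut or
+// the edge/restart handling that the macros above add; SketchNextEmpty8() is
+// a hypothetical name:
+
+#if 0 // illustration only; never compiled.
+static int
+SketchNextEmpty8(const uint8_t *Leaf, long Pop1, unsigned Key, unsigned *PHole)
+{
+    long i = 0;
+
+    while ((i < Pop1) && (Leaf[i] < Key)) ++i;  // skip indexes below Key.
+
+    while ((i < Pop1) && (Leaf[i] == Key))      // walk a dense run upward.
+    {
+        if (Key == 0xff) return(0);             // at the leaf-expanse edge;
+        ++Key; ++i;                             // the real code restarts here.
+    }
+    *PHole = Key;                               // Key is absent, hence empty.
+    return(1);
+}
+#endif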
+ +#define j__udySearchLeafEmpty1(Addr,Pop0) \ + JSLE_EVEN(Addr, Pop0, 1, uint8_t) + +#define j__udySearchLeafEmpty2(Addr,Pop0) \ + JSLE_EVEN(Addr, Pop0, 2, uint16_t) + +#define j__udySearchLeafEmpty3(Addr,Pop0) \ + JSLE_ODD(3, Addr, Pop0, j__udySearchLeaf3, JU_COPY3_PINDEX_TO_LONG) + +#ifndef JU_64BIT + +#define j__udySearchLeafEmptyL(Addr,Pop0) \ + JSLE_EVEN(Addr, Pop0, 4, Word_t) + +#else + +#define j__udySearchLeafEmpty4(Addr,Pop0) \ + JSLE_EVEN(Addr, Pop0, 4, uint32_t) + +#define j__udySearchLeafEmpty5(Addr,Pop0) \ + JSLE_ODD(5, Addr, Pop0, j__udySearchLeaf5, JU_COPY5_PINDEX_TO_LONG) + +#define j__udySearchLeafEmpty6(Addr,Pop0) \ + JSLE_ODD(6, Addr, Pop0, j__udySearchLeaf6, JU_COPY6_PINDEX_TO_LONG) + +#define j__udySearchLeafEmpty7(Addr,Pop0) \ + JSLE_ODD(7, Addr, Pop0, j__udySearchLeaf7, JU_COPY7_PINDEX_TO_LONG) + +#define j__udySearchLeafEmptyL(Addr,Pop0) \ + JSLE_EVEN(Addr, Pop0, 8, Word_t) + +#endif // JU_64BIT + + +// ---------------------------------------------------------------------------- +// START OF CODE: +// +// CHECK FOR SHORTCUTS: +// +// Error out if PIndex is null. + + if (PIndex == (PWord_t) NULL) + { + JU_SET_ERRNO(PJError, JU_ERRNO_NULLPINDEX); + return(JERRI); + } + + Index = *PIndex; // fast local copy. + +// Set and pre-decrement/increment Index, watching for underflow/overflow: +// +// An out-of-bounds Index means failure: No previous/next empty index. + +SMGetRestart: // return here with revised Index. + +#ifdef JUDYPREV + if (Index-- == 0) return(0); +#else + if (++Index == 0) return(0); +#endif + +// An empty array with an in-bounds (not underflowed/overflowed) Index means +// success: +// +// Note: This check is redundant after restarting at SMGetRestart, but should +// take insignificant time. + + if (PArray == (Pvoid_t) NULL) RET_SUCCESS; + +// ---------------------------------------------------------------------------- +// ROOT-LEVEL LEAF that starts with a Pop0 word; just look within the leaf: +// +// If Index is not in the leaf, return success; otherwise return the first +// empty Index, if any, below/above where it would belong. + + if (JU_LEAFW_POP0(PArray) < cJU_LEAFW_MAXPOP1) // must be a LEAFW + { + Pjlw_t Pjlw = P_JLW(PArray); // first word of leaf. + pop0 = Pjlw[0]; + +#ifdef JUDY1 + if (pop0 == 0) // special case. + { +#ifdef JUDYPREV + if ((Index != Pjlw[1]) || (Index-- != 0)) RET_SUCCESS; +#else + if ((Index != Pjlw[1]) || (++Index != 0)) RET_SUCCESS; +#endif + return(0); // no previous/next empty index. + } +#endif // JUDY1 + + j__udySearchLeafEmptyL(Pjlw + 1, pop0); + +// No return -- thanks ALAN + + } + else + +// ---------------------------------------------------------------------------- +// HANDLE JRP Branch: +// +// For JRP branches, traverse the JPM; handle LEAFW +// directly; but look for the most common cases first. + + { + Pjpm_t Pjpm = P_JPM(PArray); + Pjp = &(Pjpm->jpm_JP); + +// goto SMGetContinue; + } + + +// ============================================================================ +// STATE MACHINE -- GET INDEX: +// +// Search for Index (already decremented/incremented so as to be an inclusive +// search). If not found (empty index), return success. Otherwise do a +// previous/next search, and if successful modify Index to the empty index +// found. See function header comments. +// +// ENTRY: Pjp points to next JP to interpret, whose Decode bytes have not yet +// been checked. +// +// Note: Check Decode bytes at the start of each loop, not after looking up a +// new JP, so its easy to do constant shifts/masks. 
+// +// EXIT: Return, or branch to SMGetRestart with modified Index, or branch to +// SMGetContinue with a modified Pjp, as described elsewhere. +// +// WARNING: For run-time efficiency the following cases replicate code with +// varying constants, rather than using common code with variable values! + +SMGetContinue: // return here for next branch/leaf. + +#ifdef TRACEJPSE + JudyPrintJP(Pjp, "sf", __LINE__); +#endif + + switch (JU_JPTYPE(Pjp)) + { + + +// ---------------------------------------------------------------------------- +// LINEAR BRANCH: +// +// Check Decode bytes, if any, in the current JP, then search for a JP for the +// next digit in Index. + + case cJU_JPBRANCH_L2: CHECKDCD(2); SMPREPB2(SMBranchL); + case cJU_JPBRANCH_L3: CHECKDCD(3); SMPREPB3(SMBranchL); +#ifdef JU_64BIT + case cJU_JPBRANCH_L4: CHECKDCD(4); SMPREPB4(SMBranchL); + case cJU_JPBRANCH_L5: CHECKDCD(5); SMPREPB5(SMBranchL); + case cJU_JPBRANCH_L6: CHECKDCD(6); SMPREPB6(SMBranchL); + case cJU_JPBRANCH_L7: CHECKDCD(7); SMPREPB7(SMBranchL); +#endif + case cJU_JPBRANCH_L: SMPREPBL(SMBranchL); + +// Common code (state-independent) for all cases of linear branches: + +SMBranchL: + Pjbl = P_JBL(Pjp->jp_Addr); + +// First, check if Indexs expanse (digit) is below/above the first/last +// populated expanse in the BranchL, in which case Index is empty; otherwise +// find the offset of the lowest/highest populated expanse at or above/below +// digit, if any: +// +// Note: The for-loop is guaranteed to exit eventually because the first/last +// expanse is known to be a terminator. +// +// Note: Cannot use j__udySearchLeaf*Empty1() here because it only applies to +// leaves and does not know about partial versus full JPs, unlike the use of +// j__udySearchLeaf1() for BranchLs in SearchValid code. Also, since linear +// leaf expanse lists are small, dont waste time calling j__udySearchLeaf1(), +// just scan the expanse list. + +#ifdef JUDYPREV + if ((Pjbl->jbl_Expanse[0]) > digit) RET_SUCCESS; + + for (offset = (Pjbl->jbl_NumJPs) - 1; /* null */; --offset) +#else + if ((Pjbl->jbl_Expanse[(Pjbl->jbl_NumJPs) - 1]) < digit) + RET_SUCCESS; + + for (offset = 0; /* null */; ++offset) +#endif + { + +// Too low/high, keep going; or too high/low, meaning the loop passed a hole +// and the initial Index is empty: + +#ifdef JUDYPREV + if ((Pjbl->jbl_Expanse[offset]) > digit) continue; + if ((Pjbl->jbl_Expanse[offset]) < digit) RET_SUCCESS; +#else + if ((Pjbl->jbl_Expanse[offset]) < digit) continue; + if ((Pjbl->jbl_Expanse[offset]) > digit) RET_SUCCESS; +#endif + +// Found expanse matching digit; if its not full, traverse through it: + + if (! JPFULL((Pjbl->jbl_jp) + offset)) + { + Pjp = (Pjbl->jbl_jp) + offset; + goto SMGetContinue; + } + +// Common code: While searching for a lower/higher hole or a non-full JP, upon +// finding a lower/higher hole, adjust Index using the revised digit and +// return; or upon finding a consecutive lower/higher expanse, if the expanses +// JP is non-full, modify Index and traverse through the JP: + +#define BRANCHL_CHECK(OpIncDec,OpLeastDigits,Digit,Digits) \ + { \ + if ((Pjbl->jbl_Expanse[offset]) != OpIncDec digit) \ + SET_AND_RETURN(OpLeastDigits, Digit, Digits); \ + \ + if (! 
JPFULL((Pjbl->jbl_jp) + offset)) \ + { \ + Pjp = (Pjbl->jbl_jp) + offset; \ + SET_AND_CONTINUE(OpLeastDigits, Digit, Digits); \ + } \ + } + +// BranchL primary dead end: Expanse matching Index/digit is full (rare except +// for dense/sequential indexes): +// +// Search for a lower/higher hole, a non-full JP, or the end of the expanse +// list, while decrementing/incrementing digit. + +#ifdef JUDYPREV + while (--offset >= 0) + BRANCHL_CHECK(--, SETLEASTDIGITS_D, digit, digits) +#else + while (++offset < Pjbl->jbl_NumJPs) + BRANCHL_CHECK(++, CLEARLEASTDIGITS_D, digit, digits) +#endif + +// Passed end of BranchL expanse list after finding a matching but full +// expanse: +// +// Digit now matches the lowest/highest expanse, which is a full expanse; if +// digit is at the end of BranchLs expanse (no hole before/after), break out +// of the loop; otherwise modify Index to the next lower/higher digit and +// return success: + +#ifdef JUDYPREV + if (digit == 0) break; + --digit; SET_AND_RETURN(SETLEASTDIGITS_D, digit, digits); +#else + if (digit == JU_LEASTBYTES(cJU_ALLONES, 1)) break; + ++digit; SET_AND_RETURN(CLEARLEASTDIGITS_D, digit, digits); +#endif + } // for-loop + +// BranchL secondary dead end, no non-full previous/next JP: + + SMRESTART(digits); + + +// ---------------------------------------------------------------------------- +// BITMAP BRANCH: +// +// Check Decode bytes, if any, in the current JP, then search for a JP for the +// next digit in Index. + + case cJU_JPBRANCH_B2: CHECKDCD(2); SMPREPB2(SMBranchB); + case cJU_JPBRANCH_B3: CHECKDCD(3); SMPREPB3(SMBranchB); +#ifdef JU_64BIT + case cJU_JPBRANCH_B4: CHECKDCD(4); SMPREPB4(SMBranchB); + case cJU_JPBRANCH_B5: CHECKDCD(5); SMPREPB5(SMBranchB); + case cJU_JPBRANCH_B6: CHECKDCD(6); SMPREPB6(SMBranchB); + case cJU_JPBRANCH_B7: CHECKDCD(7); SMPREPB7(SMBranchB); +#endif + case cJU_JPBRANCH_B: SMPREPBL(SMBranchB); + +// Common code (state-independent) for all cases of bitmap branches: + +SMBranchB: + Pjbb = P_JBB(Pjp->jp_Addr); + +// Locate the digits JP in the subexpanse list, if present: + + subexp = digit / cJU_BITSPERSUBEXPB; + assert(subexp < cJU_NUMSUBEXPB); // falls in expected range. + bitposmaskB = JU_BITPOSMASKB(digit); + +// Absent JP = no JP matches current digit in Index: + +// if (! JU_BITMAPTESTB(Pjbb, digit)) // slower. + if (! (JU_JBB_BITMAP(Pjbb, subexp) & bitposmaskB)) // faster. + RET_SUCCESS; + +// Non-full JP matches current digit in Index: +// +// Iterate to the subsidiary non-full JP. + + offset = SEARCHBITMAPB(JU_JBB_BITMAP(Pjbb, subexp), digit, + bitposmaskB); + // not negative since at least one bit is set: + assert(offset >= 0); + assert(offset < (int) cJU_BITSPERSUBEXPB); + +// Watch for null JP subarray pointer with non-null bitmap (a corruption): + + if ((Pjp = P_JP(JU_JBB_PJP(Pjbb, subexp))) + == (Pjp_t) NULL) RET_CORRUPT; + + Pjp += offset; + if (! JPFULL(Pjp)) goto SMGetContinue; + +// BranchB primary dead end: +// +// Upon hitting a full JP in a BranchB for the next digit in Index, search +// sideways for a previous/next absent JP (unset bit) or non-full JP (set bit +// with non-full JP); first in the current bitmap subexpanse, then in +// lower/higher subexpanses. Upon entry, Pjp points to a known-unusable JP, +// ready to decrement/increment. +// +// Note: The preceding code is separate from this loop because Index does not +// need revising (see SET_AND_*()) if the initial index is an empty index. 
+// +// TBD: For speed, shift bitposmaskB instead of using JU_BITMAPTESTB or +// JU_BITPOSMASKB, but this shift has knowledge of bit order that really should +// be encapsulated in a header file. + +#define BRANCHB_CHECKBIT(OpLeastDigits) \ + if (! (JU_JBB_BITMAP(Pjbb, subexp) & bitposmaskB)) /* absent JP */ \ + SET_AND_RETURN(OpLeastDigits, digit, digits) + +#define BRANCHB_CHECKJPFULL(OpLeastDigits) \ + if (! JPFULL(Pjp)) \ + SET_AND_CONTINUE(OpLeastDigits, digit, digits) + +#define BRANCHB_STARTSUBEXP(OpLeastDigits) \ + if (! JU_JBB_BITMAP(Pjbb, subexp)) /* empty subexpanse, shortcut */ \ + SET_AND_RETURN(OpLeastDigits, digit, digits) \ + if ((Pjp = P_JP(JU_JBB_PJP(Pjbb, subexp))) == (Pjp_t) NULL) RET_CORRUPT + +#ifdef JUDYPREV + + --digit; // skip initial digit. + bitposmaskB >>= 1; // see TBD above. + +BranchBNextSubexp: // return here to check next bitmap subexpanse. + + while (bitposmaskB) // more bits to check in subexp. + { + BRANCHB_CHECKBIT(SETLEASTDIGITS_D); + --Pjp; // previous in subarray. + BRANCHB_CHECKJPFULL(SETLEASTDIGITS_D); + assert(digit >= 0); + --digit; + bitposmaskB >>= 1; + } + + if (subexp-- > 0) // more subexpanses. + { + BRANCHB_STARTSUBEXP(SETLEASTDIGITS_D); + Pjp += SEARCHBITMAPMAXB(JU_JBB_BITMAP(Pjbb, subexp)) + 1; + bitposmaskB = (1U << (cJU_BITSPERSUBEXPB - 1)); + goto BranchBNextSubexp; + } + +#else // JUDYNEXT + + ++digit; // skip initial digit. + bitposmaskB <<= 1; // note: BITMAPB_t. + +BranchBNextSubexp: // return here to check next bitmap subexpanse. + + while (bitposmaskB) // more bits to check in subexp. + { + BRANCHB_CHECKBIT(CLEARLEASTDIGITS_D); + ++Pjp; // previous in subarray. + BRANCHB_CHECKJPFULL(CLEARLEASTDIGITS_D); + assert(digit < cJU_SUBEXPPERSTATE); + ++digit; + bitposmaskB <<= 1; // note: BITMAPB_t. + } + + if (++subexp < cJU_NUMSUBEXPB) // more subexpanses. + { + BRANCHB_STARTSUBEXP(CLEARLEASTDIGITS_D); + --Pjp; // pre-decrement. + bitposmaskB = 1; + goto BranchBNextSubexp; + } + +#endif // JUDYNEXT + +// BranchB secondary dead end, no non-full previous/next JP: + + SMRESTART(digits); + + +// ---------------------------------------------------------------------------- +// UNCOMPRESSED BRANCH: +// +// Check Decode bytes, if any, in the current JP, then search for a JP for the +// next digit in Index. + + case cJU_JPBRANCH_U2: CHECKDCD(2); SMPREPB2(SMBranchU); + case cJU_JPBRANCH_U3: CHECKDCD(3); SMPREPB3(SMBranchU); +#ifdef JU_64BIT + case cJU_JPBRANCH_U4: CHECKDCD(4); SMPREPB4(SMBranchU); + case cJU_JPBRANCH_U5: CHECKDCD(5); SMPREPB5(SMBranchU); + case cJU_JPBRANCH_U6: CHECKDCD(6); SMPREPB6(SMBranchU); + case cJU_JPBRANCH_U7: CHECKDCD(7); SMPREPB7(SMBranchU); +#endif + case cJU_JPBRANCH_U: SMPREPBL(SMBranchU); + +// Common code (state-independent) for all cases of uncompressed branches: + +SMBranchU: + Pjbu = P_JBU(Pjp->jp_Addr); + Pjp = (Pjbu->jbu_jp) + digit; + +// Absent JP = null JP for current digit in Index: + + if (JPNULL(JU_JPTYPE(Pjp))) RET_SUCCESS; + +// Non-full JP matches current digit in Index: +// +// Iterate to the subsidiary JP. + + if (! JPFULL(Pjp)) goto SMGetContinue; + +// BranchU primary dead end: +// +// Upon hitting a full JP in a BranchU for the next digit in Index, search +// sideways for a previous/next null or non-full JP. BRANCHU_CHECKJP() is +// shorthand for common code. +// +// Note: The preceding code is separate from this loop because Index does not +// need revising (see SET_AND_*()) if the initial index is an empty index. 
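+
+// As a guarded-out sketch (illustrative only, not part of the original
+// source), the Judy*NextEmpty() sideways scan described above amounts to the
+// following, where IsNull()/IsFull() stand in for the JPNULL()/JPFULL()
+// tests done by BRANCHU_CHECKJP() below:
+
+#if 0 // illustration only; never compiled.
+    for (++digit; digit < cJU_BRANCHUNUMJPS; ++digit)
+    {
+        Pjp_t Pjp2 = (Pjbu->jbu_jp) + digit;
+
+        if (IsNull(Pjp2))       // whole expanse is empty:
+            { /* clear least digits, set digit, return success */ }
+
+        if (! IsFull(Pjp2))     // a hole exists somewhere inside:
+            { /* clear least digits, set digit, descend into Pjp2 */ }
+    }
+    // Fell off the branch: secondary dead end, restart at the tree top.
+#endif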
+ +#define BRANCHU_CHECKJP(OpIncDec,OpLeastDigits) \ + { \ + OpIncDec Pjp; \ + \ + if (JPNULL(JU_JPTYPE(Pjp))) \ + SET_AND_RETURN(OpLeastDigits, digit, digits) \ + \ + if (! JPFULL(Pjp)) \ + SET_AND_CONTINUE(OpLeastDigits, digit, digits) \ + } + +#ifdef JUDYPREV + while (digit-- > 0) + BRANCHU_CHECKJP(--, SETLEASTDIGITS_D); +#else + while (++digit < cJU_BRANCHUNUMJPS) + BRANCHU_CHECKJP(++, CLEARLEASTDIGITS_D); +#endif + +// BranchU secondary dead end, no non-full previous/next JP: + + SMRESTART(digits); + + +// ---------------------------------------------------------------------------- +// LINEAR LEAF: +// +// Check Decode bytes, if any, in the current JP, then search the leaf for the +// previous/next empty index starting at Index. Primary leaf dead end is +// hidden within j__udySearchLeaf*Empty*(). In case of secondary leaf dead +// end, restart at the top of the tree. +// +// Note: Pword is the name known to GET*; think of it as Pjlw. + +#define SMLEAFL(cDigits,Func) \ + Pword = (PWord_t) P_JLW(Pjp->jp_Addr); \ + pop0 = JU_JPLEAF_POP0(Pjp); \ + Func(Pword, pop0) + +#if (defined(JUDYL) || (! defined(JU_64BIT))) + case cJU_JPLEAF1: CHECKDCD(1); SMLEAFL(1, j__udySearchLeafEmpty1); +#endif + case cJU_JPLEAF2: CHECKDCD(2); SMLEAFL(2, j__udySearchLeafEmpty2); + case cJU_JPLEAF3: CHECKDCD(3); SMLEAFL(3, j__udySearchLeafEmpty3); + +#ifdef JU_64BIT + case cJU_JPLEAF4: CHECKDCD(4); SMLEAFL(4, j__udySearchLeafEmpty4); + case cJU_JPLEAF5: CHECKDCD(5); SMLEAFL(5, j__udySearchLeafEmpty5); + case cJU_JPLEAF6: CHECKDCD(6); SMLEAFL(6, j__udySearchLeafEmpty6); + case cJU_JPLEAF7: CHECKDCD(7); SMLEAFL(7, j__udySearchLeafEmpty7); +#endif + + +// ---------------------------------------------------------------------------- +// BITMAP LEAF: +// +// Check Decode bytes, if any, in the current JP, then search the leaf for the +// previous/next empty index starting at Index. + + case cJU_JPLEAF_B1: + + CHECKDCD(1); + + Pjlb = P_JLB(Pjp->jp_Addr); + digit = JU_DIGITATSTATE(Index, 1); + subexp = digit / cJU_BITSPERSUBEXPL; + bitposmaskL = JU_BITPOSMASKL(digit); + assert(subexp < cJU_NUMSUBEXPL); // falls in expected range. + +// Absent index = no index matches current digit in Index: + +// if (! JU_BITMAPTESTL(Pjlb, digit)) // slower. + if (! (JU_JLB_BITMAP(Pjlb, subexp) & bitposmaskL)) // faster. + RET_SUCCESS; + +// LeafB1 primary dead end: +// +// Upon hitting a valid (non-empty) index in a LeafB1 for the last digit in +// Index, search sideways for a previous/next absent index, first in the +// current bitmap subexpanse, then in lower/higher subexpanses. +// LEAFB1_CHECKBIT() is shorthand for common code to handle one bit in one +// bitmap subexpanse. +// +// Note: The preceding code is separate from this loop because Index does not +// need revising (see SET_AND_*()) if the initial index is an empty index. +// +// TBD: For speed, shift bitposmaskL instead of using JU_BITMAPTESTL or +// JU_BITPOSMASKL, but this shift has knowledge of bit order that really should +// be encapsulated in a header file. + +#define LEAFB1_CHECKBIT(OpLeastDigits) \ + if (! (JU_JLB_BITMAP(Pjlb, subexp) & bitposmaskL)) \ + SET_AND_RETURN(OpLeastDigits, digit, 1) + +#define LEAFB1_STARTSUBEXP(OpLeastDigits) \ + if (! JU_JLB_BITMAP(Pjlb, subexp)) /* empty subexp */ \ + SET_AND_RETURN(OpLeastDigits, digit, 1) + +#ifdef JUDYPREV + + --digit; // skip initial digit. + bitposmaskL >>= 1; // see TBD above. + +LeafB1NextSubexp: // return here to check next bitmap subexpanse. + + while (bitposmaskL) // more bits to check in subexp. 
+ { + LEAFB1_CHECKBIT(SETLEASTDIGITS_D); + assert(digit >= 0); + --digit; + bitposmaskL >>= 1; + } + + if (subexp-- > 0) // more subexpanses. + { + LEAFB1_STARTSUBEXP(SETLEASTDIGITS_D); + bitposmaskL = (1UL << (cJU_BITSPERSUBEXPL - 1)); + goto LeafB1NextSubexp; + } + +#else // JUDYNEXT + + ++digit; // skip initial digit. + bitposmaskL <<= 1; // note: BITMAPL_t. + +LeafB1NextSubexp: // return here to check next bitmap subexpanse. + + while (bitposmaskL) // more bits to check in subexp. + { + LEAFB1_CHECKBIT(CLEARLEASTDIGITS_D); + assert(digit < cJU_SUBEXPPERSTATE); + ++digit; + bitposmaskL <<= 1; // note: BITMAPL_t. + } + + if (++subexp < cJU_NUMSUBEXPL) // more subexpanses. + { + LEAFB1_STARTSUBEXP(CLEARLEASTDIGITS_D); + bitposmaskL = 1; + goto LeafB1NextSubexp; + } + +#endif // JUDYNEXT + +// LeafB1 secondary dead end, no empty index: + + SMRESTART(1); + + +#ifdef JUDY1 +// ---------------------------------------------------------------------------- +// FULL POPULATION: +// +// If the Decode bytes do not match, Index is empty (without modification); +// otherwise restart. + + case cJ1_JPFULLPOPU1: + + CHECKDCD(1); + SMRESTART(1); +#endif + + +// ---------------------------------------------------------------------------- +// IMMEDIATE: +// +// Pop1 = 1 Immediate JPs: +// +// If Index is not in the immediate JP, return success; otherwise check if +// there is an empty index below/above the immediate JPs index, and if so, +// return success with modified Index, else restart. +// +// Note: Doug says its fast enough to calculate the index size (digits) in +// the following; no need to set it separately for each case. + + case cJU_JPIMMED_1_01: + case cJU_JPIMMED_2_01: + case cJU_JPIMMED_3_01: +#ifdef JU_64BIT + case cJU_JPIMMED_4_01: + case cJU_JPIMMED_5_01: + case cJU_JPIMMED_6_01: + case cJU_JPIMMED_7_01: +#endif + if (JU_JPDCDPOP0(Pjp) != JU_TRIMTODCDSIZE(Index)) RET_SUCCESS; + digits = JU_JPTYPE(Pjp) - cJU_JPIMMED_1_01 + 1; + LEAF_EDGE(JU_LEASTBYTES(JU_JPDCDPOP0(Pjp), digits), digits); + +// Immediate JPs with Pop1 > 1: + +#define IMM_MULTI(Func,BaseJPType) \ + JUDY1CODE(Pword = (PWord_t) (Pjp->jp_1Index);) \ + JUDYLCODE(Pword = (PWord_t) (Pjp->jp_LIndex);) \ + Func(Pword, JU_JPTYPE(Pjp) - (BaseJPType) + 1) + + case cJU_JPIMMED_1_02: + case cJU_JPIMMED_1_03: +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_1_04: + case cJU_JPIMMED_1_05: + case cJU_JPIMMED_1_06: + case cJU_JPIMMED_1_07: +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_1_08: + case cJ1_JPIMMED_1_09: + case cJ1_JPIMMED_1_10: + case cJ1_JPIMMED_1_11: + case cJ1_JPIMMED_1_12: + case cJ1_JPIMMED_1_13: + case cJ1_JPIMMED_1_14: + case cJ1_JPIMMED_1_15: +#endif + IMM_MULTI(j__udySearchLeafEmpty1, cJU_JPIMMED_1_02); + +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_2_02: + case cJU_JPIMMED_2_03: +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_2_04: + case cJ1_JPIMMED_2_05: + case cJ1_JPIMMED_2_06: + case cJ1_JPIMMED_2_07: +#endif +#if (defined(JUDY1) || defined(JU_64BIT)) + IMM_MULTI(j__udySearchLeafEmpty2, cJU_JPIMMED_2_02); +#endif + +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_3_02: +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_3_03: + case cJ1_JPIMMED_3_04: + case cJ1_JPIMMED_3_05: +#endif +#if (defined(JUDY1) || defined(JU_64BIT)) + IMM_MULTI(j__udySearchLeafEmpty3, cJU_JPIMMED_3_02); +#endif + +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_4_02: + case cJ1_JPIMMED_4_03: + 
IMM_MULTI(j__udySearchLeafEmpty4, cJ1_JPIMMED_4_02); + + case cJ1_JPIMMED_5_02: + case cJ1_JPIMMED_5_03: + IMM_MULTI(j__udySearchLeafEmpty5, cJ1_JPIMMED_5_02); + + case cJ1_JPIMMED_6_02: + IMM_MULTI(j__udySearchLeafEmpty6, cJ1_JPIMMED_6_02); + + case cJ1_JPIMMED_7_02: + IMM_MULTI(j__udySearchLeafEmpty7, cJ1_JPIMMED_7_02); +#endif + + +// ---------------------------------------------------------------------------- +// INVALID JP TYPE: + + default: RET_CORRUPT; + + } // SMGet switch. + +} // Judy1PrevEmpty() / Judy1NextEmpty() / JudyLPrevEmpty() / JudyLNextEmpty() diff --git a/src/libnetdata/libjudy/src/JudyL/JudyLTables.c b/src/libnetdata/libjudy/src/JudyL/JudyLTables.c new file mode 100644 index 00000000..21c97498 --- /dev/null +++ b/src/libnetdata/libjudy/src/JudyL/JudyLTables.c @@ -0,0 +1,338 @@ +// @(#) From generation tool: $Revision: 4.37 $ $Source: /judy/src/JudyCommon/JudyTables.c $ +// Pregenerated and modified by hand. Do not overwrite! + +#include "JudyL.h" +// Leave the malloc() sizes readable in the binary (via strings(1)): +#ifdef JU_64BIT +const char * JudyLMallocSizes = "JudyLMallocSizes = 3, 5, 7, 11, 15, 23, 32, 47, 64, Leaf1 = 13"; +#else // JU_32BIT +const char * JudyLMallocSizes = "JudyLMallocSizes = 3, 5, 7, 11, 15, 23, 32, 47, 64, Leaf1 = 25"; +#endif // JU_64BIT + +#ifdef JU_64BIT +// object uses 64 words +// cJU_BITSPERSUBEXPB = 32 +const uint8_t +j__L_BranchBJPPopToWords[cJU_BITSPERSUBEXPB + 1] = +{ + 0, + 3, 5, 7, 11, 11, 15, 15, 23, + 23, 23, 23, 32, 32, 32, 32, 32, + 47, 47, 47, 47, 47, 47, 47, 64, + 64, 64, 64, 64, 64, 64, 64, 64 +}; + +// object uses 15 words +// cJL_LEAF1_MAXPOP1 = 13 +const uint8_t +j__L_Leaf1PopToWords[cJL_LEAF1_MAXPOP1 + 1] = +{ + 0, + 3, 3, 5, 5, 7, 7, 11, 11, + 11, 15, 15, 15, 15 +}; +const uint8_t +j__L_Leaf1Offset[cJL_LEAF1_MAXPOP1 + 1] = +{ + 0, + 1, 1, 1, 1, 1, 1, 2, 2, + 2, 2, 2, 2, 2 +}; + +// object uses 64 words +// cJL_LEAF2_MAXPOP1 = 51 +const uint8_t +j__L_Leaf2PopToWords[cJL_LEAF2_MAXPOP1 + 1] = +{ + 0, + 3, 3, 5, 5, 7, 11, 11, 11, + 15, 15, 15, 15, 23, 23, 23, 23, + 23, 23, 32, 32, 32, 32, 32, 32, + 32, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64 +}; +const uint8_t +j__L_Leaf2Offset[cJL_LEAF2_MAXPOP1 + 1] = +{ + 0, + 1, 1, 1, 1, 2, 3, 3, 3, + 3, 3, 3, 3, 5, 5, 5, 5, + 5, 5, 7, 7, 7, 7, 7, 7, + 7, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13 +}; + +// object uses 64 words +// cJL_LEAF3_MAXPOP1 = 46 +const uint8_t +j__L_Leaf3PopToWords[cJL_LEAF3_MAXPOP1 + 1] = +{ + 0, + 3, 3, 5, 7, 7, 11, 11, 11, + 15, 15, 23, 23, 23, 23, 23, 23, + 32, 32, 32, 32, 32, 32, 32, 47, + 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64 +}; +const uint8_t +j__L_Leaf3Offset[cJL_LEAF3_MAXPOP1 + 1] = +{ + 0, + 1, 1, 2, 2, 2, 3, 3, 3, + 4, 4, 6, 6, 6, 6, 6, 6, + 9, 9, 9, 9, 9, 9, 9, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18 +}; + +// object uses 63 words +// cJL_LEAF4_MAXPOP1 = 42 +const uint8_t +j__L_Leaf4PopToWords[cJL_LEAF4_MAXPOP1 + 1] = +{ + 0, + 3, 3, 5, 7, 11, 11, 11, 15, + 15, 15, 23, 23, 23, 23, 23, 32, + 32, 32, 32, 32, 32, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63 +}; +const uint8_t +j__L_Leaf4Offset[cJL_LEAF4_MAXPOP1 + 1] = +{ + 0, + 1, 1, 2, 2, 4, 4, 4, 5, + 5, 5, 8, 8, 8, 8, 8, 11, + 11, 11, 11, 11, 11, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 21, + 21, 21, 21, 21, 21, 
21, 21, 21, + 21, 21 +}; + +// object uses 64 words +// cJL_LEAF5_MAXPOP1 = 39 +const uint8_t +j__L_Leaf5PopToWords[cJL_LEAF5_MAXPOP1 + 1] = +{ + 0, + 3, 5, 5, 7, 11, 11, 15, 15, + 15, 23, 23, 23, 23, 23, 32, 32, + 32, 32, 32, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64 +}; +const uint8_t +j__L_Leaf5Offset[cJL_LEAF5_MAXPOP1 + 1] = +{ + 0, + 2, 2, 2, 3, 4, 4, 6, 6, + 6, 9, 9, 9, 9, 9, 12, 12, + 12, 12, 12, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25 +}; + +// object uses 63 words +// cJL_LEAF6_MAXPOP1 = 36 +const uint8_t +j__L_Leaf6PopToWords[cJL_LEAF6_MAXPOP1 + 1] = +{ + 0, + 3, 5, 7, 7, 11, 11, 15, 15, + 23, 23, 23, 23, 23, 32, 32, 32, + 32, 32, 47, 47, 47, 47, 47, 47, + 47, 47, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63 +}; +const uint8_t +j__L_Leaf6Offset[cJL_LEAF6_MAXPOP1 + 1] = +{ + 0, + 1, 3, 3, 3, 5, 5, 6, 6, + 10, 10, 10, 10, 10, 14, 14, 14, + 14, 14, 20, 20, 20, 20, 20, 20, + 20, 20, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27 +}; + +// object uses 64 words +// cJL_LEAF7_MAXPOP1 = 34 +const uint8_t +j__L_Leaf7PopToWords[cJL_LEAF7_MAXPOP1 + 1] = +{ + 0, + 3, 5, 7, 11, 11, 15, 15, 15, + 23, 23, 23, 23, 32, 32, 32, 32, + 32, 47, 47, 47, 47, 47, 47, 47, + 47, 64, 64, 64, 64, 64, 64, 64, + 64, 64 +}; +const uint8_t +j__L_Leaf7Offset[cJL_LEAF7_MAXPOP1 + 1] = +{ + 0, + 1, 3, 3, 5, 5, 7, 7, 7, + 11, 11, 11, 11, 15, 15, 15, 15, + 15, 22, 22, 22, 22, 22, 22, 22, + 22, 30, 30, 30, 30, 30, 30, 30, + 30, 30 +}; + +// object uses 63 words +// cJL_LEAFW_MAXPOP1 = 31 +const uint8_t +j__L_LeafWPopToWords[cJL_LEAFW_MAXPOP1 + 1] = +{ + 0, + 3, 5, 7, 11, 11, 15, 15, 23, + 23, 23, 23, 32, 32, 32, 32, 47, + 47, 47, 47, 47, 47, 47, 47, 63, + 63, 63, 63, 63, 63, 63, 63 +}; +const uint8_t +j__L_LeafWOffset[cJL_LEAFW_MAXPOP1 + 1] = +{ + 0, + 2, 3, 4, 6, 6, 8, 8, 12, + 12, 12, 12, 16, 16, 16, 16, 24, + 24, 24, 24, 24, 24, 24, 24, 32, + 32, 32, 32, 32, 32, 32, 32 +}; + +// object uses 64 words +// cJU_BITSPERSUBEXPL = 64 +const uint8_t +j__L_LeafVPopToWords[cJU_BITSPERSUBEXPL + 1] = +{ + 0, + 3, 3, 3, 5, 5, 7, 7, 11, + 11, 11, 11, 15, 15, 15, 15, 23, + 23, 23, 23, 23, 23, 23, 23, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64 +}; +#else // JU_32BIT +// object uses 64 words +// cJU_BITSPERSUBEXPB = 32 +const uint8_t +j__L_BranchBJPPopToWords[cJU_BITSPERSUBEXPB + 1] = +{ + 0, + 3, 5, 7, 11, 11, 15, 15, 23, + 23, 23, 23, 32, 32, 32, 32, 32, + 47, 47, 47, 47, 47, 47, 47, 64, + 64, 64, 64, 64, 64, 64, 64, 64 +}; + +// object uses 32 words +// cJL_LEAF1_MAXPOP1 = 25 +const uint8_t +j__L_Leaf1PopToWords[cJL_LEAF1_MAXPOP1 + 1] = +{ + 0, + 3, 3, 5, 5, 7, 11, 11, 11, + 15, 15, 15, 15, 23, 23, 23, 23, + 23, 23, 32, 32, 32, 32, 32, 32, + 32 +}; +const uint8_t +j__L_Leaf1Offset[cJL_LEAF1_MAXPOP1 + 1] = +{ + 0, + 1, 1, 1, 1, 2, 3, 3, 3, + 3, 3, 3, 3, 5, 5, 5, 5, + 5, 5, 7, 7, 7, 7, 7, 7, + 7 +}; + +// object uses 63 words +// cJL_LEAF2_MAXPOP1 = 42 +const uint8_t +j__L_Leaf2PopToWords[cJL_LEAF2_MAXPOP1 + 1] = +{ + 0, + 3, 3, 5, 7, 11, 11, 11, 15, + 15, 15, 23, 23, 23, 23, 23, 32, + 32, 32, 32, 32, 32, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63 +}; +const uint8_t +j__L_Leaf2Offset[cJL_LEAF2_MAXPOP1 + 1] = +{ + 0, + 1, 1, 2, 2, 4, 4, 4, 5, + 5, 5, 8, 8, 8, 8, 8, 11, + 11, 11, 11, 11, 11, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21 +}; + 
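+// Illustrative helper (a sketch, not part of the generated tables): each
+// *PopToWords table rounds a leaf population up to the nearest supported
+// malloc() size in words, and the matching *Offset table records how many
+// words into that object the JudyL value area begins (the index list sits at
+// the object base). The hypothetical accessor below shows the intended use,
+// assuming 'pos' is the offset of a key already present in a Leaf2:
+
+#if 0 // illustration only; never compiled.
+static Pjv_t
+SketchLeaf2Value(Pjll_t Pjll, Word_t Pop1, int pos)
+{
+    assert(Pop1 && (Pop1 <= cJL_LEAF2_MAXPOP1));
+
+    // uint16_t keys start at the object base; the Word_t value area starts
+    // j__L_Leaf2Offset[Pop1] words from the base:
+    return(((Pjv_t) (((PWord_t) Pjll) + j__L_Leaf2Offset[Pop1])) + pos);
+}
+#endif
+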
+// object uses 63 words +// cJL_LEAF3_MAXPOP1 = 36 +const uint8_t +j__L_Leaf3PopToWords[cJL_LEAF3_MAXPOP1 + 1] = +{ + 0, + 3, 5, 7, 7, 11, 11, 15, 15, + 23, 23, 23, 23, 23, 32, 32, 32, + 32, 32, 47, 47, 47, 47, 47, 47, + 47, 47, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63 +}; +const uint8_t +j__L_Leaf3Offset[cJL_LEAF3_MAXPOP1 + 1] = +{ + 0, + 1, 3, 3, 3, 5, 5, 6, 6, + 10, 10, 10, 10, 10, 14, 14, 14, + 14, 14, 20, 20, 20, 20, 20, 20, + 20, 20, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27 +}; + +// object uses 63 words +// cJL_LEAFW_MAXPOP1 = 31 +const uint8_t +j__L_LeafWPopToWords[cJL_LEAFW_MAXPOP1 + 1] = +{ + 0, + 3, 5, 7, 11, 11, 15, 15, 23, + 23, 23, 23, 32, 32, 32, 32, 47, + 47, 47, 47, 47, 47, 47, 47, 63, + 63, 63, 63, 63, 63, 63, 63 +}; +const uint8_t +j__L_LeafWOffset[cJL_LEAFW_MAXPOP1 + 1] = +{ + 0, + 2, 3, 4, 6, 6, 8, 8, 12, + 12, 12, 12, 16, 16, 16, 16, 24, + 24, 24, 24, 24, 24, 24, 24, 32, + 32, 32, 32, 32, 32, 32, 32 +}; + +// object uses 32 words +// cJU_BITSPERSUBEXPL = 32 +const uint8_t +j__L_LeafVPopToWords[cJU_BITSPERSUBEXPL + 1] = +{ + 0, + 3, 3, 3, 5, 5, 7, 7, 11, + 11, 11, 11, 15, 15, 15, 15, 23, + 23, 23, 23, 23, 23, 23, 23, 32, + 32, 32, 32, 32, 32, 32, 32, 32 +}; +#endif // JU_64BIT diff --git a/src/libnetdata/libjudy/src/JudyL/j__udyLGet.c b/src/libnetdata/libjudy/src/JudyL/j__udyLGet.c new file mode 100644 index 00000000..0bb9971c --- /dev/null +++ b/src/libnetdata/libjudy/src/JudyL/j__udyLGet.c @@ -0,0 +1,1094 @@ +// Copyright (C) 2000 - 2002 Hewlett-Packard Company +// +// This program is free software; you can redistribute it and/or modify it +// under the term of the GNU Lesser General Public License as published by the +// Free Software Foundation; either version 2 of the License, or (at your +// option) any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License +// for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this program; if not, write to the Free Software Foundation, +// Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// _________________ + +// @(#) $Revision: 4.43 $ $Source: /judy/src/JudyCommon/JudyGet.c $ +// +// Judy1Test() and JudyLGet() functions for Judy1 and JudyL. +// Compile with one of -DJUDY1 or -DJUDYL. + +#if (! (defined(JUDY1) || defined(JUDYL))) +#error: One of -DJUDY1 or -DJUDYL must be specified. +#endif + +#ifdef JUDY1 +#include "Judy1.h" +#else +#include "JudyL.h" +#endif + +#include "JudyPrivate1L.h" + +#ifdef TRACEJPR // different macro name, for "retrieval" only. +#include "JudyPrintJP.c" +#endif + + +// **************************************************************************** +// J U D Y 1 T E S T +// J U D Y L G E T +// +// See the manual entry for details. Note support for "shortcut" entries to +// trees known to start with a JPM. + +#ifdef JUDY1 + +#ifdef JUDYGETINLINE +FUNCTION int j__udy1Test +#else +FUNCTION int Judy1Test +#endif + +#else // JUDYL + +#ifdef JUDYGETINLINE +FUNCTION PPvoid_t j__udyLGet +#else +FUNCTION PPvoid_t JudyLGet +#endif + +#endif // JUDYL + ( +#ifdef JUDYGETINLINE + Pvoid_t PArray, // from which to retrieve. + Word_t Index // to retrieve. +#else + Pcvoid_t PArray, // from which to retrieve. + Word_t Index, // to retrieve. + PJError_t PJError // optional, for returning error info. 
+#endif + ) +{ + Pjp_t Pjp; // current JP while walking the tree. + Pjpm_t Pjpm; // for global accounting. + uint8_t Digit; // byte just decoded from Index. + Word_t Pop1; // leaf population (number of indexes). + Pjll_t Pjll; // pointer to LeafL. + DBGCODE(uint8_t ParentJPType;) + +#ifndef JUDYGETINLINE + + if (PArray == (Pcvoid_t) NULL) // empty array. + { + JUDY1CODE(return(0);) + JUDYLCODE(return((PPvoid_t) NULL);) + } + +// **************************************************************************** +// PROCESS TOP LEVEL BRANCHES AND LEAF: + + if (JU_LEAFW_POP0(PArray) < cJU_LEAFW_MAXPOP1) // must be a LEAFW + { + Pjlw_t Pjlw = P_JLW(PArray); // first word of leaf. + int posidx; // signed offset in leaf. + + Pop1 = Pjlw[0] + 1; + posidx = j__udySearchLeafW(Pjlw + 1, Pop1, Index); + + if (posidx >= 0) + { + JUDY1CODE(return(1);) + JUDYLCODE(return((PPvoid_t) (JL_LEAFWVALUEAREA(Pjlw, Pop1) + posidx));) + } + JUDY1CODE(return(0);) + JUDYLCODE(return((PPvoid_t) NULL);) + } + +#endif // ! JUDYGETINLINE + + Pjpm = P_JPM(PArray); + Pjp = &(Pjpm->jpm_JP); // top branch is below JPM. + +// **************************************************************************** +// WALK THE JUDY TREE USING A STATE MACHINE: + +ContinueWalk: // for going down one level; come here with Pjp set. + +#ifdef TRACEJPR + JudyPrintJP(Pjp, "g", __LINE__); +#endif + switch (JU_JPTYPE(Pjp)) + { + +// Ensure the switch table starts at 0 for speed; otherwise more code is +// executed: + + case 0: goto ReturnCorrupt; // save a little code. + + +// **************************************************************************** +// JPNULL*: +// +// Note: These are legitimate in a BranchU (only) and do not constitute a +// fault. + + case cJU_JPNULL1: + case cJU_JPNULL2: + case cJU_JPNULL3: +#ifdef JU_64BIT + case cJU_JPNULL4: + case cJU_JPNULL5: + case cJU_JPNULL6: + case cJU_JPNULL7: +#endif + assert(ParentJPType >= cJU_JPBRANCH_U2); + assert(ParentJPType <= cJU_JPBRANCH_U); + JUDY1CODE(return(0);) + JUDYLCODE(return((PPvoid_t) NULL);) + + +// **************************************************************************** +// JPBRANCH_L*: +// +// Note: The use of JU_DCDNOTMATCHINDEX() in branches is not strictly +// required,since this can be done at leaf level, but it costs nothing to do it +// sooner, and it aborts an unnecessary traversal sooner. + + case cJU_JPBRANCH_L2: + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 2)) break; + Digit = JU_DIGITATSTATE(Index, 2); + goto JudyBranchL; + + case cJU_JPBRANCH_L3: + +#ifdef JU_64BIT // otherwise its a no-op: + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 3)) break; +#endif + Digit = JU_DIGITATSTATE(Index, 3); + goto JudyBranchL; + +#ifdef JU_64BIT + case cJU_JPBRANCH_L4: + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 4)) break; + Digit = JU_DIGITATSTATE(Index, 4); + goto JudyBranchL; + + case cJU_JPBRANCH_L5: + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 5)) break; + Digit = JU_DIGITATSTATE(Index, 5); + goto JudyBranchL; + + case cJU_JPBRANCH_L6: + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 6)) break; + Digit = JU_DIGITATSTATE(Index, 6); + goto JudyBranchL; + + case cJU_JPBRANCH_L7: + + // JU_DCDNOTMATCHINDEX() would be a no-op. 
+ Digit = JU_DIGITATSTATE(Index, 7); + goto JudyBranchL; + +#endif // JU_64BIT + + case cJU_JPBRANCH_L: + { + Pjbl_t Pjbl; + int posidx; + + Digit = JU_DIGITATSTATE(Index, cJU_ROOTSTATE); + +// Common code for all BranchLs; come here with Digit set: + +JudyBranchL: + Pjbl = P_JBL(Pjp->jp_Addr); + + posidx = 0; + + do { + if (Pjbl->jbl_Expanse[posidx] == Digit) + { // found Digit; continue traversal: + DBGCODE(ParentJPType = JU_JPTYPE(Pjp);) + Pjp = Pjbl->jbl_jp + posidx; + goto ContinueWalk; + } + } while (++posidx != Pjbl->jbl_NumJPs); + + break; + } + + +// **************************************************************************** +// JPBRANCH_B*: + + case cJU_JPBRANCH_B2: + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 2)) break; + Digit = JU_DIGITATSTATE(Index, 2); + goto JudyBranchB; + + case cJU_JPBRANCH_B3: + +#ifdef JU_64BIT // otherwise its a no-op: + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 3)) break; +#endif + Digit = JU_DIGITATSTATE(Index, 3); + goto JudyBranchB; + + +#ifdef JU_64BIT + case cJU_JPBRANCH_B4: + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 4)) break; + Digit = JU_DIGITATSTATE(Index, 4); + goto JudyBranchB; + + case cJU_JPBRANCH_B5: + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 5)) break; + Digit = JU_DIGITATSTATE(Index, 5); + goto JudyBranchB; + + case cJU_JPBRANCH_B6: + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 6)) break; + Digit = JU_DIGITATSTATE(Index, 6); + goto JudyBranchB; + + case cJU_JPBRANCH_B7: + + // JU_DCDNOTMATCHINDEX() would be a no-op. + Digit = JU_DIGITATSTATE(Index, 7); + goto JudyBranchB; + +#endif // JU_64BIT + + case cJU_JPBRANCH_B: + { + Pjbb_t Pjbb; + Word_t subexp; // in bitmap, 0..7. + BITMAPB_t BitMap; // for one subexpanse. + BITMAPB_t BitMask; // bit in BitMap for Indexs Digit. + + Digit = JU_DIGITATSTATE(Index, cJU_ROOTSTATE); + +// Common code for all BranchBs; come here with Digit set: + +JudyBranchB: + DBGCODE(ParentJPType = JU_JPTYPE(Pjp);) + Pjbb = P_JBB(Pjp->jp_Addr); + subexp = Digit / cJU_BITSPERSUBEXPB; + + BitMap = JU_JBB_BITMAP(Pjbb, subexp); + Pjp = P_JP(JU_JBB_PJP(Pjbb, subexp)); + + BitMask = JU_BITPOSMASKB(Digit); + +// No JP in subexpanse for Index => Index not found: + + if (! (BitMap & BitMask)) break; + +// Count JPs in the subexpanse below the one for Index: + + Pjp += j__udyCountBitsB(BitMap & (BitMask - 1)); + + goto ContinueWalk; + + } // case cJU_JPBRANCH_B* + + +// **************************************************************************** +// JPBRANCH_U*: +// +// Notice the reverse order of the cases, and falling through to the next case, +// for performance. + + case cJU_JPBRANCH_U: + + DBGCODE(ParentJPType = JU_JPTYPE(Pjp);) + Pjp = JU_JBU_PJP(Pjp, Index, cJU_ROOTSTATE); + +// If not a BranchU, traverse; otherwise fall into the next case, which makes +// this very fast code for a large Judy array (mainly BranchUs), especially +// when branches are already in the cache, such as for prev/next: + +#ifndef JU_64BIT + if (JU_JPTYPE(Pjp) != cJU_JPBRANCH_U3) goto ContinueWalk; +#else + if (JU_JPTYPE(Pjp) != cJU_JPBRANCH_U7) goto ContinueWalk; +#endif + +#ifdef JU_64BIT + case cJU_JPBRANCH_U7: + + // JU_DCDNOTMATCHINDEX() would be a no-op. + DBGCODE(ParentJPType = JU_JPTYPE(Pjp);) + Pjp = JU_JBU_PJP(Pjp, Index, 7); + + if (JU_JPTYPE(Pjp) != cJU_JPBRANCH_U6) goto ContinueWalk; + // and fall through. + + case cJU_JPBRANCH_U6: + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 6)) break; + DBGCODE(ParentJPType = JU_JPTYPE(Pjp);) + Pjp = JU_JBU_PJP(Pjp, Index, 6); + + if (JU_JPTYPE(Pjp) != cJU_JPBRANCH_U5) goto ContinueWalk; + // and fall through. 
+ + case cJU_JPBRANCH_U5: + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 5)) break; + DBGCODE(ParentJPType = JU_JPTYPE(Pjp);) + Pjp = JU_JBU_PJP(Pjp, Index, 5); + + if (JU_JPTYPE(Pjp) != cJU_JPBRANCH_U4) goto ContinueWalk; + // and fall through. + + case cJU_JPBRANCH_U4: + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 4)) break; + DBGCODE(ParentJPType = JU_JPTYPE(Pjp);) + Pjp = JU_JBU_PJP(Pjp, Index, 4); + + if (JU_JPTYPE(Pjp) != cJU_JPBRANCH_U3) goto ContinueWalk; + // and fall through. + +#endif // JU_64BIT + + case cJU_JPBRANCH_U3: + +#ifdef JU_64BIT // otherwise its a no-op: + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 3)) break; +#endif + DBGCODE(ParentJPType = JU_JPTYPE(Pjp);) + Pjp = JU_JBU_PJP(Pjp, Index, 3); + + if (JU_JPTYPE(Pjp) != cJU_JPBRANCH_U2) goto ContinueWalk; + // and fall through. + + case cJU_JPBRANCH_U2: + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 2)) break; + DBGCODE(ParentJPType = JU_JPTYPE(Pjp);) + Pjp = JU_JBU_PJP(Pjp, Index, 2); + +// Note: BranchU2 is a special case that must continue traversal to a leaf, +// immed, full, or null type: + + goto ContinueWalk; + + +// **************************************************************************** +// JPLEAF*: +// +// Note: Here the calls of JU_DCDNOTMATCHINDEX() are necessary and check +// whether Index is out of the expanse of a narrow pointer. + +#if (defined(JUDYL) || (! defined(JU_64BIT))) + + case cJU_JPLEAF1: + { + int posidx; // signed offset in leaf. + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 1)) break; + + Pop1 = JU_JPLEAF_POP0(Pjp) + 1; + Pjll = P_JLL(Pjp->jp_Addr); + + if ((posidx = j__udySearchLeaf1(Pjll, Pop1, Index)) < 0) break; + + JUDY1CODE(return(1);) + JUDYLCODE(return((PPvoid_t) (JL_LEAF1VALUEAREA(Pjll, Pop1) + posidx));) + } + +#endif // (JUDYL || (! JU_64BIT)) + + case cJU_JPLEAF2: + { + int posidx; // signed offset in leaf. + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 2)) break; + + Pop1 = JU_JPLEAF_POP0(Pjp) + 1; + Pjll = P_JLL(Pjp->jp_Addr); + + if ((posidx = j__udySearchLeaf2(Pjll, Pop1, Index)) < 0) break; + + JUDY1CODE(return(1);) + JUDYLCODE(return((PPvoid_t) (JL_LEAF2VALUEAREA(Pjll, Pop1) + posidx));) + } + case cJU_JPLEAF3: + { + int posidx; // signed offset in leaf. + +#ifdef JU_64BIT // otherwise its a no-op: + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 3)) break; +#endif + + Pop1 = JU_JPLEAF_POP0(Pjp) + 1; + Pjll = P_JLL(Pjp->jp_Addr); + + if ((posidx = j__udySearchLeaf3(Pjll, Pop1, Index)) < 0) break; + + JUDY1CODE(return(1);) + JUDYLCODE(return((PPvoid_t) (JL_LEAF3VALUEAREA(Pjll, Pop1) + posidx));) + } +#ifdef JU_64BIT + case cJU_JPLEAF4: + { + int posidx; // signed offset in leaf. + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 4)) break; + + Pop1 = JU_JPLEAF_POP0(Pjp) + 1; + Pjll = P_JLL(Pjp->jp_Addr); + + if ((posidx = j__udySearchLeaf4(Pjll, Pop1, Index)) < 0) break; + + JUDY1CODE(return(1);) + JUDYLCODE(return((PPvoid_t) (JL_LEAF4VALUEAREA(Pjll, Pop1) + posidx));) + } + case cJU_JPLEAF5: + { + int posidx; // signed offset in leaf. + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 5)) break; + + Pop1 = JU_JPLEAF_POP0(Pjp) + 1; + Pjll = P_JLL(Pjp->jp_Addr); + + if ((posidx = j__udySearchLeaf5(Pjll, Pop1, Index)) < 0) break; + + JUDY1CODE(return(1);) + JUDYLCODE(return((PPvoid_t) (JL_LEAF5VALUEAREA(Pjll, Pop1) + posidx));) + } + + case cJU_JPLEAF6: + { + int posidx; // signed offset in leaf. 
+ + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 6)) break; + + Pop1 = JU_JPLEAF_POP0(Pjp) + 1; + Pjll = P_JLL(Pjp->jp_Addr); + + if ((posidx = j__udySearchLeaf6(Pjll, Pop1, Index)) < 0) break; + + JUDY1CODE(return(1);) + JUDYLCODE(return((PPvoid_t) (JL_LEAF6VALUEAREA(Pjll, Pop1) + posidx));) + } + case cJU_JPLEAF7: + { + int posidx; // signed offset in leaf. + + // JU_DCDNOTMATCHINDEX() would be a no-op. + Pop1 = JU_JPLEAF_POP0(Pjp) + 1; + Pjll = P_JLL(Pjp->jp_Addr); + + if ((posidx = j__udySearchLeaf7(Pjll, Pop1, Index)) < 0) break; + + JUDY1CODE(return(1);) + JUDYLCODE(return((PPvoid_t) (JL_LEAF7VALUEAREA(Pjll, Pop1) + posidx));) + } +#endif // JU_64BIT + + +// **************************************************************************** +// JPLEAF_B1: + + case cJU_JPLEAF_B1: + { + Pjlb_t Pjlb; +#ifdef JUDYL + int posidx; + Word_t subexp; // in bitmap, 0..7. + BITMAPL_t BitMap; // for one subexpanse. + BITMAPL_t BitMask; // bit in BitMap for Indexs Digit. + Pjv_t Pjv; +#endif + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 1)) break; + + Pjlb = P_JLB(Pjp->jp_Addr); + +#ifdef JUDY1 + +// Simply check if Indexs bit is set in the bitmap: + + if (JU_BITMAPTESTL(Pjlb, Index)) return(1); + break; + +#else // JUDYL + +// JudyL is much more complicated because of value area subarrays: + + Digit = JU_DIGITATSTATE(Index, 1); + subexp = Digit / cJU_BITSPERSUBEXPL; + BitMap = JU_JLB_BITMAP(Pjlb, subexp); + BitMask = JU_BITPOSMASKL(Digit); + +// No value in subexpanse for Index => Index not found: + + if (! (BitMap & BitMask)) break; + +// Count value areas in the subexpanse below the one for Index: + + Pjv = P_JV(JL_JLB_PVALUE(Pjlb, subexp)); + assert(Pjv != (Pjv_t) NULL); + posidx = j__udyCountBitsL(BitMap & (BitMask - 1)); + + return((PPvoid_t) (Pjv + posidx)); + +#endif // JUDYL + + } // case cJU_JPLEAF_B1 + +#ifdef JUDY1 + +// **************************************************************************** +// JPFULLPOPU1: +// +// If the Index is in the expanse, it is necessarily valid (found). + + case cJ1_JPFULLPOPU1: + + if (JU_DCDNOTMATCHINDEX(Index, Pjp, 1)) break; + return(1); + +#ifdef notdef // for future enhancements +#ifdef JU_64BIT + +// Note: Need ? 
if (JU_DCDNOTMATCHINDEX(Index, Pjp, 1)) break; + + case cJ1_JPFULLPOPU1m15: + if (Pjp->jp_1Index[14] == (uint8_t)Index) break; + case cJ1_JPFULLPOPU1m14: + if (Pjp->jp_1Index[13] == (uint8_t)Index) break; + case cJ1_JPFULLPOPU1m13: + if (Pjp->jp_1Index[12] == (uint8_t)Index) break; + case cJ1_JPFULLPOPU1m12: + if (Pjp->jp_1Index[11] == (uint8_t)Index) break; + case cJ1_JPFULLPOPU1m11: + if (Pjp->jp_1Index[10] == (uint8_t)Index) break; + case cJ1_JPFULLPOPU1m10: + if (Pjp->jp_1Index[9] == (uint8_t)Index) break; + case cJ1_JPFULLPOPU1m9: + if (Pjp->jp_1Index[8] == (uint8_t)Index) break; + case cJ1_JPFULLPOPU1m8: + if (Pjp->jp_1Index[7] == (uint8_t)Index) break; +#endif + case cJ1_JPFULLPOPU1m7: + if (Pjp->jp_1Index[6] == (uint8_t)Index) break; + case cJ1_JPFULLPOPU1m6: + if (Pjp->jp_1Index[5] == (uint8_t)Index) break; + case cJ1_JPFULLPOPU1m5: + if (Pjp->jp_1Index[4] == (uint8_t)Index) break; + case cJ1_JPFULLPOPU1m4: + if (Pjp->jp_1Index[3] == (uint8_t)Index) break; + case cJ1_JPFULLPOPU1m3: + if (Pjp->jp_1Index[2] == (uint8_t)Index) break; + case cJ1_JPFULLPOPU1m2: + if (Pjp->jp_1Index[1] == (uint8_t)Index) break; + case cJ1_JPFULLPOPU1m1: + if (Pjp->jp_1Index[0] == (uint8_t)Index) break; + + return(1); // found, not in exclusion list + +#endif // JUDY1 +#endif // notdef + +// **************************************************************************** +// JPIMMED*: +// +// Note that the contents of jp_DcdPopO are different for cJU_JPIMMED_*_01: + + case cJU_JPIMMED_1_01: + case cJU_JPIMMED_2_01: + case cJU_JPIMMED_3_01: +#ifdef JU_64BIT + case cJU_JPIMMED_4_01: + case cJU_JPIMMED_5_01: + case cJU_JPIMMED_6_01: + case cJU_JPIMMED_7_01: +#endif + if (JU_JPDCDPOP0(Pjp) != JU_TRIMTODCDSIZE(Index)) break; + + JUDY1CODE(return(1);) + JUDYLCODE(return((PPvoid_t) &(Pjp->jp_Addr));) // immediate value area. 
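+
+// An illustrative note (hypothetical helper, not Judy API): a 1-index
+// immediate JP needs no pointer dereference at all -- the remaining index
+// bytes live in the JP itself, so the test above reduces to one masked
+// comparison:
+//
+//     static int immed_01_matches(Pjp_t Pjp, Word_t Index)
+//     {
+//         return JU_JPDCDPOP0(Pjp) == JU_TRIMTODCDSIZE(Index);
+//     }
+//
+// For JudyL the value is stored in the JP's own jp_Addr word, which is why
+// the case above returns &(Pjp->jp_Addr) rather than a pointer into a leaf.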
+ + +// Macros to make code more readable and avoid dup errors + +#ifdef JUDY1 + +#define CHECKINDEXNATIVE(LEAF_T, PJP, IDX, INDEX) \ +if (((LEAF_T *)((PJP)->jp_1Index))[(IDX) - 1] == (LEAF_T)(INDEX)) \ + return(1) + +#define CHECKLEAFNONNAT(LFBTS, PJP, INDEX, IDX, COPY) \ +{ \ + Word_t i_ndex; \ + uint8_t *a_ddr; \ + a_ddr = (PJP)->jp_1Index + (((IDX) - 1) * (LFBTS)); \ + COPY(i_ndex, a_ddr); \ + if (i_ndex == JU_LEASTBYTES((INDEX), (LFBTS))) \ + return(1); \ +} +#endif + +#ifdef JUDYL + +#define CHECKINDEXNATIVE(LEAF_T, PJP, IDX, INDEX) \ +if (((LEAF_T *)((PJP)->jp_LIndex))[(IDX) - 1] == (LEAF_T)(INDEX)) \ + return((PPvoid_t)(P_JV((PJP)->jp_Addr) + (IDX) - 1)) + +#define CHECKLEAFNONNAT(LFBTS, PJP, INDEX, IDX, COPY) \ +{ \ + Word_t i_ndex; \ + uint8_t *a_ddr; \ + a_ddr = (PJP)->jp_LIndex + (((IDX) - 1) * (LFBTS)); \ + COPY(i_ndex, a_ddr); \ + if (i_ndex == JU_LEASTBYTES((INDEX), (LFBTS))) \ + return((PPvoid_t)(P_JV((PJP)->jp_Addr) + (IDX) - 1)); \ +} +#endif + +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_1_15: CHECKINDEXNATIVE(uint8_t, Pjp, 15, Index); + case cJ1_JPIMMED_1_14: CHECKINDEXNATIVE(uint8_t, Pjp, 14, Index); + case cJ1_JPIMMED_1_13: CHECKINDEXNATIVE(uint8_t, Pjp, 13, Index); + case cJ1_JPIMMED_1_12: CHECKINDEXNATIVE(uint8_t, Pjp, 12, Index); + case cJ1_JPIMMED_1_11: CHECKINDEXNATIVE(uint8_t, Pjp, 11, Index); + case cJ1_JPIMMED_1_10: CHECKINDEXNATIVE(uint8_t, Pjp, 10, Index); + case cJ1_JPIMMED_1_09: CHECKINDEXNATIVE(uint8_t, Pjp, 9, Index); + case cJ1_JPIMMED_1_08: CHECKINDEXNATIVE(uint8_t, Pjp, 8, Index); +#endif +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_1_07: CHECKINDEXNATIVE(uint8_t, Pjp, 7, Index); + case cJU_JPIMMED_1_06: CHECKINDEXNATIVE(uint8_t, Pjp, 6, Index); + case cJU_JPIMMED_1_05: CHECKINDEXNATIVE(uint8_t, Pjp, 5, Index); + case cJU_JPIMMED_1_04: CHECKINDEXNATIVE(uint8_t, Pjp, 4, Index); +#endif + case cJU_JPIMMED_1_03: CHECKINDEXNATIVE(uint8_t, Pjp, 3, Index); + case cJU_JPIMMED_1_02: CHECKINDEXNATIVE(uint8_t, Pjp, 2, Index); + CHECKINDEXNATIVE(uint8_t, Pjp, 1, Index); + break; + +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_2_07: CHECKINDEXNATIVE(uint16_t, Pjp, 7, Index); + case cJ1_JPIMMED_2_06: CHECKINDEXNATIVE(uint16_t, Pjp, 6, Index); + case cJ1_JPIMMED_2_05: CHECKINDEXNATIVE(uint16_t, Pjp, 5, Index); + case cJ1_JPIMMED_2_04: CHECKINDEXNATIVE(uint16_t, Pjp, 4, Index); +#endif +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_2_03: CHECKINDEXNATIVE(uint16_t, Pjp, 3, Index); + case cJU_JPIMMED_2_02: CHECKINDEXNATIVE(uint16_t, Pjp, 2, Index); + CHECKINDEXNATIVE(uint16_t, Pjp, 1, Index); + break; +#endif + +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_3_05: + CHECKLEAFNONNAT(3, Pjp, Index, 5, JU_COPY3_PINDEX_TO_LONG); + case cJ1_JPIMMED_3_04: + CHECKLEAFNONNAT(3, Pjp, Index, 4, JU_COPY3_PINDEX_TO_LONG); + case cJ1_JPIMMED_3_03: + CHECKLEAFNONNAT(3, Pjp, Index, 3, JU_COPY3_PINDEX_TO_LONG); +#endif +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_3_02: + CHECKLEAFNONNAT(3, Pjp, Index, 2, JU_COPY3_PINDEX_TO_LONG); + CHECKLEAFNONNAT(3, Pjp, Index, 1, JU_COPY3_PINDEX_TO_LONG); + break; +#endif + +#if (defined(JUDY1) && defined(JU_64BIT)) + + case cJ1_JPIMMED_4_03: CHECKINDEXNATIVE(uint32_t, Pjp, 3, Index); + case cJ1_JPIMMED_4_02: CHECKINDEXNATIVE(uint32_t, Pjp, 2, Index); + CHECKINDEXNATIVE(uint32_t, Pjp, 1, Index); + break; + + case cJ1_JPIMMED_5_03: + CHECKLEAFNONNAT(5, Pjp, Index, 3, JU_COPY5_PINDEX_TO_LONG); + case cJ1_JPIMMED_5_02: + CHECKLEAFNONNAT(5, Pjp, Index, 2, 
JU_COPY5_PINDEX_TO_LONG); + CHECKLEAFNONNAT(5, Pjp, Index, 1, JU_COPY5_PINDEX_TO_LONG); + break; + + case cJ1_JPIMMED_6_02: + CHECKLEAFNONNAT(6, Pjp, Index, 2, JU_COPY6_PINDEX_TO_LONG); + CHECKLEAFNONNAT(6, Pjp, Index, 1, JU_COPY6_PINDEX_TO_LONG); + break; + + case cJ1_JPIMMED_7_02: + CHECKLEAFNONNAT(7, Pjp, Index, 2, JU_COPY7_PINDEX_TO_LONG); + CHECKLEAFNONNAT(7, Pjp, Index, 1, JU_COPY7_PINDEX_TO_LONG); + break; + +#endif // (JUDY1 && JU_64BIT) + + +// **************************************************************************** +// INVALID JP TYPE: + + default: + +ReturnCorrupt: + +#ifdef JUDYGETINLINE // Pjpm is known to be non-null: + JU_SET_ERRNO_NONNULL(Pjpm, JU_ERRNO_CORRUPT); +#else + JU_SET_ERRNO(PJError, JU_ERRNO_CORRUPT); +#endif + JUDY1CODE(return(JERRI );) + JUDYLCODE(return(PPJERR);) + + } // switch on JP type + +JUDY1CODE(return(0);) +JUDYLCODE(return((PPvoid_t) NULL);) + +} // Judy1Test() / JudyLGet() + + +#ifndef JUDYGETINLINE // only compile the following function once: +#ifdef DEBUG + +// **************************************************************************** +// J U D Y C H E C K P O P +// +// Given a pointer to a Judy array, traverse the entire array to ensure +// population counts add up correctly. This can catch various coding errors. +// +// Since walking the entire tree is probably time-consuming, enable this +// function by setting env parameter $CHECKPOP to first call at which to start +// checking. Note: This function is called both from insert and delete code. +// +// Note: Even though this function does nothing useful for LEAFW leaves, its +// good practice to call it anyway, and cheap too. +// +// TBD: This is a debug-only check function similar to JudyCheckSorted(), but +// since it walks the tree it is Judy1/JudyL-specific and must live in a source +// file that is built both ways. +// +// TBD: As feared, enabling this code for every insert/delete makes Judy +// deathly slow, even for a small tree (10K indexes). Its not so bad if +// present but disabled (<1% slowdown measured). Still, should it be ifdefd +// other than DEBUG and/or called less often? +// +// TBD: Should this "population checker" be expanded to a comprehensive tree +// checker? It currently detects invalid LEAFW/JP types as well as inconsistent +// pop1s. Other possible checks, all based on essentially redundant data in +// the Judy tree, include: +// +// - Zero LS bits in jp_Addr field. +// +// - Correct Dcd bits. +// +// - Consistent JP types (always descending down the tree). +// +// - Sorted linear lists in BranchLs and leaves (using JudyCheckSorted(), but +// ideally that function is already called wherever appropriate after any +// linear list is modified). +// +// - Any others possible? + +#include <stdlib.h> // for getenv() and atol(). + +static Word_t JudyCheckPopSM(Pjp_t Pjp, Word_t RootPop1); + +FUNCTION void JudyCheckPop( + Pvoid_t PArray) +{ +static bool_t checked = FALSE; // already checked env parameter. +static bool_t enabled = FALSE; // env parameter set. +static bool_t active = FALSE; // calls >= callsmin. +static Word_t callsmin; // start point from $CHECKPOP. +static Word_t calls = 0; // times called so far. + + +// CHECK FOR EXTERNAL ENABLING: + + if (! checked) // only check once. + { + char * value; // for getenv(). 
+ + checked = TRUE; + + if ((value = getenv("CHECKPOP")) == (char *) NULL) + { +#ifdef notdef +// Take this out because nightly tests want to be flavor-independent; its not +// OK to emit special non-error output from the debug flavor: + + (void) puts("JudyCheckPop() present but not enabled by " + "$CHECKPOP env parameter; set it to the number of " + "calls at which to begin checking"); +#endif + return; + } + + callsmin = atol(value); // note: non-number evaluates to 0. + enabled = TRUE; + + (void) printf("JudyCheckPop() present and enabled; callsmin = " + "%lu\n", callsmin); + } + else if (! enabled) return; + +// Previously or just now enabled; check if non-active or newly active: + + if (! active) + { + if (++calls < callsmin) return; + + (void) printf("JudyCheckPop() activated at call %lu\n", calls); + active = TRUE; + } + +// IGNORE LEAFW AT TOP OF TREE: + + if (JU_LEAFW_POP0(PArray) < cJU_LEAFW_MAXPOP1) // must be a LEAFW + return; + +// Check JPM pop0 against tree, recursively: +// +// Note: The traversal code in JudyCheckPopSM() is simplest when the case +// statement for each JP type compares the pop1 for that JP to its subtree (if +// any) after traversing the subtree (thats the hard part) and adding up +// actual pop1s. A top branchs JP in the JPM does not have room for a +// full-word pop1, so pass it in as a special case. + + { + Pjpm_t Pjpm = P_JPM(PArray); + (void) JudyCheckPopSM(&(Pjpm->jpm_JP), Pjpm->jpm_Pop0 + 1); + return; + } + +} // JudyCheckPop() + + +// **************************************************************************** +// J U D Y C H E C K P O P S M +// +// Recursive state machine (subroutine) for JudyCheckPop(): Given a Pjp (other +// than JPNULL*; caller should shortcut) and the root population for top-level +// branches, check the subtrees actual pop1 against its nominal value, and +// return the total pop1 for the subtree. +// +// Note: Expect RootPop1 to be ignored at lower levels, so pass down 0, which +// should pop an assertion if this expectation is violated. + +FUNCTION static Word_t JudyCheckPopSM( + Pjp_t Pjp, // top of subtree. + Word_t RootPop1) // whole array, for top-level branches only. +{ + Word_t pop1_jp; // nominal population from the JP. + Word_t pop1 = 0; // actual population at this level. + Word_t offset; // in a branch. 
+ +#define PREPBRANCH(cPopBytes,Next) \ + pop1_jp = JU_JPBRANCH_POP0(Pjp, cPopBytes) + 1; goto Next + +assert((((Word_t) (Pjp->jp_Addr)) & 7) == 3); + switch (JU_JPTYPE(Pjp)) + { + + case cJU_JPBRANCH_L2: PREPBRANCH(2, BranchL); + case cJU_JPBRANCH_L3: PREPBRANCH(3, BranchL); +#ifdef JU_64BIT + case cJU_JPBRANCH_L4: PREPBRANCH(4, BranchL); + case cJU_JPBRANCH_L5: PREPBRANCH(5, BranchL); + case cJU_JPBRANCH_L6: PREPBRANCH(6, BranchL); + case cJU_JPBRANCH_L7: PREPBRANCH(7, BranchL); +#endif + case cJU_JPBRANCH_L: pop1_jp = RootPop1; + { + Pjbl_t Pjbl; +BranchL: + Pjbl = P_JBL(Pjp->jp_Addr); + + for (offset = 0; offset < (Pjbl->jbl_NumJPs); ++offset) + pop1 += JudyCheckPopSM((Pjbl->jbl_jp) + offset, 0); + + assert(pop1_jp == pop1); + return(pop1); + } + + case cJU_JPBRANCH_B2: PREPBRANCH(2, BranchB); + case cJU_JPBRANCH_B3: PREPBRANCH(3, BranchB); +#ifdef JU_64BIT + case cJU_JPBRANCH_B4: PREPBRANCH(4, BranchB); + case cJU_JPBRANCH_B5: PREPBRANCH(5, BranchB); + case cJU_JPBRANCH_B6: PREPBRANCH(6, BranchB); + case cJU_JPBRANCH_B7: PREPBRANCH(7, BranchB); +#endif + case cJU_JPBRANCH_B: pop1_jp = RootPop1; + { + Word_t subexp; + Word_t jpcount; + Pjbb_t Pjbb; +BranchB: + Pjbb = P_JBB(Pjp->jp_Addr); + + for (subexp = 0; subexp < cJU_NUMSUBEXPB; ++subexp) + { + jpcount = j__udyCountBitsB(JU_JBB_BITMAP(Pjbb, subexp)); + + for (offset = 0; offset < jpcount; ++offset) + { + pop1 += JudyCheckPopSM(P_JP(JU_JBB_PJP(Pjbb, subexp)) + + offset, 0); + } + } + + assert(pop1_jp == pop1); + return(pop1); + } + + case cJU_JPBRANCH_U2: PREPBRANCH(2, BranchU); + case cJU_JPBRANCH_U3: PREPBRANCH(3, BranchU); +#ifdef JU_64BIT + case cJU_JPBRANCH_U4: PREPBRANCH(4, BranchU); + case cJU_JPBRANCH_U5: PREPBRANCH(5, BranchU); + case cJU_JPBRANCH_U6: PREPBRANCH(6, BranchU); + case cJU_JPBRANCH_U7: PREPBRANCH(7, BranchU); +#endif + case cJU_JPBRANCH_U: pop1_jp = RootPop1; + { + Pjbu_t Pjbu; +BranchU: + Pjbu = P_JBU(Pjp->jp_Addr); + + for (offset = 0; offset < cJU_BRANCHUNUMJPS; ++offset) + { + if (((Pjbu->jbu_jp[offset].jp_Type) >= cJU_JPNULL1) + && ((Pjbu->jbu_jp[offset].jp_Type) <= cJU_JPNULLMAX)) + { + continue; // skip null JP to save time. + } + + pop1 += JudyCheckPopSM((Pjbu->jbu_jp) + offset, 0); + } + + assert(pop1_jp == pop1); + return(pop1); + } + + +// -- Cases below here terminate and do not recurse. -- +// +// For all of these cases except JPLEAF_B1, there is no way to check the JPs +// pop1 against the object itself; just return the pop1; but for linear leaves, +// a bounds check is possible. + +#define CHECKLEAF(MaxPop1) \ + pop1 = JU_JPLEAF_POP0(Pjp) + 1; \ + assert(pop1 >= 1); \ + assert(pop1 <= (MaxPop1)); \ + return(pop1) + +#if (defined(JUDYL) || (! 
defined(JU_64BIT))) + case cJU_JPLEAF1: CHECKLEAF(cJU_LEAF1_MAXPOP1); +#endif + case cJU_JPLEAF2: CHECKLEAF(cJU_LEAF2_MAXPOP1); + case cJU_JPLEAF3: CHECKLEAF(cJU_LEAF3_MAXPOP1); +#ifdef JU_64BIT + case cJU_JPLEAF4: CHECKLEAF(cJU_LEAF4_MAXPOP1); + case cJU_JPLEAF5: CHECKLEAF(cJU_LEAF5_MAXPOP1); + case cJU_JPLEAF6: CHECKLEAF(cJU_LEAF6_MAXPOP1); + case cJU_JPLEAF7: CHECKLEAF(cJU_LEAF7_MAXPOP1); +#endif + + case cJU_JPLEAF_B1: + { + Word_t subexp; + Pjlb_t Pjlb; + + pop1_jp = JU_JPLEAF_POP0(Pjp) + 1; + + Pjlb = P_JLB(Pjp->jp_Addr); + + for (subexp = 0; subexp < cJU_NUMSUBEXPL; ++subexp) + pop1 += j__udyCountBitsL(JU_JLB_BITMAP(Pjlb, subexp)); + + assert(pop1_jp == pop1); + return(pop1); + } + + JUDY1CODE(case cJ1_JPFULLPOPU1: return(cJU_JPFULLPOPU1_POP0);) + + case cJU_JPIMMED_1_01: return(1); + case cJU_JPIMMED_2_01: return(1); + case cJU_JPIMMED_3_01: return(1); +#ifdef JU_64BIT + case cJU_JPIMMED_4_01: return(1); + case cJU_JPIMMED_5_01: return(1); + case cJU_JPIMMED_6_01: return(1); + case cJU_JPIMMED_7_01: return(1); +#endif + + case cJU_JPIMMED_1_02: return(2); + case cJU_JPIMMED_1_03: return(3); +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_1_04: return(4); + case cJU_JPIMMED_1_05: return(5); + case cJU_JPIMMED_1_06: return(6); + case cJU_JPIMMED_1_07: return(7); +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_1_08: return(8); + case cJ1_JPIMMED_1_09: return(9); + case cJ1_JPIMMED_1_10: return(10); + case cJ1_JPIMMED_1_11: return(11); + case cJ1_JPIMMED_1_12: return(12); + case cJ1_JPIMMED_1_13: return(13); + case cJ1_JPIMMED_1_14: return(14); + case cJ1_JPIMMED_1_15: return(15); +#endif + +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_2_02: return(2); + case cJU_JPIMMED_2_03: return(3); +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_2_04: return(4); + case cJ1_JPIMMED_2_05: return(5); + case cJ1_JPIMMED_2_06: return(6); + case cJ1_JPIMMED_2_07: return(7); +#endif + +#if (defined(JUDY1) || defined(JU_64BIT)) + case cJU_JPIMMED_3_02: return(2); +#endif +#if (defined(JUDY1) && defined(JU_64BIT)) + case cJ1_JPIMMED_3_03: return(3); + case cJ1_JPIMMED_3_04: return(4); + case cJ1_JPIMMED_3_05: return(5); + + case cJ1_JPIMMED_4_02: return(2); + case cJ1_JPIMMED_4_03: return(3); + case cJ1_JPIMMED_5_02: return(2); + case cJ1_JPIMMED_5_03: return(3); + case cJ1_JPIMMED_6_02: return(2); + case cJ1_JPIMMED_7_02: return(2); +#endif + + } // switch (JU_JPTYPE(Pjp)) + + assert(FALSE); // unrecognized JP type => corruption. + return(0); // to make some compilers happy. + +} // JudyCheckPopSM() + +#endif // DEBUG +#endif // ! 
JUDYGETINLINE diff --git a/src/libnetdata/libnetdata.c b/src/libnetdata/libnetdata.c new file mode 100644 index 00000000..909bb71d --- /dev/null +++ b/src/libnetdata/libnetdata.c @@ -0,0 +1,2082 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "libnetdata.h" + +#if !defined(MADV_DONTFORK) +#define MADV_DONTFORK 0 +#endif + +#if !defined(O_NOATIME) +#define O_NOATIME 0 +#endif + +struct rlimit rlimit_nofile = { .rlim_cur = 1024, .rlim_max = 1024 }; + +#if defined(MADV_MERGEABLE) +int enable_ksm = 1; +#else +int enable_ksm = 0; +#endif + +volatile sig_atomic_t netdata_exit = 0; + +#define MAX_JUDY_SIZE_TO_ARAL 24 +static bool judy_sizes_config[MAX_JUDY_SIZE_TO_ARAL + 1] = { + [3] = true, + [4] = true, + [5] = true, + [6] = true, + [7] = true, + [8] = true, + [10] = true, + [11] = true, + [15] = true, + [23] = true, +}; +static ARAL *judy_sizes_aral[MAX_JUDY_SIZE_TO_ARAL + 1] = {}; + +struct aral_statistics judy_sizes_aral_statistics = {}; + +void aral_judy_init(void) { + for(size_t Words = 0; Words <= MAX_JUDY_SIZE_TO_ARAL; Words++) + if(judy_sizes_config[Words]) { + char buf[30+1]; + snprintfz(buf, sizeof(buf) - 1, "judy-%zu", Words * sizeof(Word_t)); + judy_sizes_aral[Words] = aral_create( + buf, + Words * sizeof(Word_t), + 0, + 65536, + &judy_sizes_aral_statistics, + NULL, NULL, false, false); + } +} + +size_t judy_aral_overhead(void) { + return aral_overhead_from_stats(&judy_sizes_aral_statistics); +} + +size_t judy_aral_structures(void) { + return aral_structures_from_stats(&judy_sizes_aral_statistics); +} + +static ARAL *judy_size_aral(Word_t Words) { + if(Words <= MAX_JUDY_SIZE_TO_ARAL && judy_sizes_aral[Words]) + return judy_sizes_aral[Words]; + + return NULL; +} + +inline Word_t JudyMalloc(Word_t Words) { + Word_t Addr; + + ARAL *ar = judy_size_aral(Words); + if(ar) + Addr = (Word_t) aral_mallocz(ar); + else + Addr = (Word_t) mallocz(Words * sizeof(Word_t)); + + return(Addr); +} + +inline void JudyFree(void * PWord, Word_t Words) { + ARAL *ar = judy_size_aral(Words); + if(ar) + aral_freez(ar, PWord); + else + freez(PWord); +} + +Word_t JudyMallocVirtual(Word_t Words) { + return JudyMalloc(Words); +} + +void JudyFreeVirtual(void * PWord, Word_t Words) { + JudyFree(PWord, Words); +} + +// ---------------------------------------------------------------------------- +// memory allocation functions that handle failures + +// although netdata does not use memory allocations too often (netdata tries to +// maintain its memory footprint stable during runtime, i.e. all buffers are +// allocated during initialization and are adapted to current use throughout +// its lifetime), these can be used to override the default system allocation +// routines. 
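+
+// Usage sketch (the struct and sizes below are made up for illustration):
+// because the z-suffixed wrappers call fatal() on allocation failure,
+// callers never need to check for NULL:
+//
+//     struct row *rows = callocz(1024, sizeof(struct row)); // never NULL
+//     char *name = strdupz("cpu.cpu0");                     // never NULL
+//     ...
+//     freez(name); // like free(), but safe to call with NULL
+//     freez(rows);
+//
+// This centralizes out-of-memory handling in one place instead of
+// scattering error paths across every call site.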
+ +#ifdef NETDATA_TRACE_ALLOCATIONS +#warning NETDATA_TRACE_ALLOCATIONS ENABLED +#include "Judy.h" + +#if defined(HAVE_DLSYM) && defined(ENABLE_DLSYM) +#include <dlfcn.h> + +typedef void (*libc_function_t)(void); + +static void *malloc_first_run(size_t size); +static void *(*libc_malloc)(size_t) = malloc_first_run; + +static void *calloc_first_run(size_t n, size_t size); +static void *(*libc_calloc)(size_t, size_t) = calloc_first_run; + +static void *realloc_first_run(void *ptr, size_t size); +static void *(*libc_realloc)(void *, size_t) = realloc_first_run; + +static void free_first_run(void *ptr); +static void (*libc_free)(void *) = free_first_run; + +static char *strdup_first_run(const char *s); +static char *(*libc_strdup)(const char *) = strdup_first_run; + +static char *strndup_first_run(const char *s, size_t len); +static char *(*libc_strndup)(const char *, size_t) = strndup_first_run; + +static size_t malloc_usable_size_first_run(void *ptr); +#ifdef HAVE_MALLOC_USABLE_SIZE +static size_t (*libc_malloc_usable_size)(void *) = malloc_usable_size_first_run; +#else +static size_t (*libc_malloc_usable_size)(void *) = NULL; +#endif + +static void link_system_library_function(libc_function_t *func_pptr, const char *name, bool required) { + *func_pptr = dlsym(RTLD_NEXT, name); + if(!*func_pptr && required) { + fprintf(stderr, "FATAL: Cannot find system's %s() function.\n", name); + abort(); + } +} + +static void *malloc_first_run(size_t size) { + link_system_library_function((libc_function_t *) &libc_malloc, "malloc", true); + return libc_malloc(size); +} + +static void *calloc_first_run(size_t n, size_t size) { + link_system_library_function((libc_function_t *) &libc_calloc, "calloc", true); + return libc_calloc(n, size); +} + +static void *realloc_first_run(void *ptr, size_t size) { + link_system_library_function((libc_function_t *) &libc_realloc, "realloc", true); + return libc_realloc(ptr, size); +} + +static void free_first_run(void *ptr) { + link_system_library_function((libc_function_t *) &libc_free, "free", true); + libc_free(ptr); +} + +static char *strdup_first_run(const char *s) { + link_system_library_function((libc_function_t *) &libc_strdup, "strdup", true); + return libc_strdup(s); +} + +static char *strndup_first_run(const char *s, size_t len) { + link_system_library_function((libc_function_t *) &libc_strndup, "strndup", true); + return libc_strndup(s, len); +} + +static size_t malloc_usable_size_first_run(void *ptr) { + link_system_library_function((libc_function_t *) &libc_malloc_usable_size, "malloc_usable_size", false); + + if(libc_malloc_usable_size) + return libc_malloc_usable_size(ptr); + else + return 0; +} + +void *malloc(size_t size) { + return mallocz(size); +} + +void *calloc(size_t n, size_t size) { + return callocz(n, size); +} + +void *realloc(void *ptr, size_t size) { + return reallocz(ptr, size); +} + +void *reallocarray(void *ptr, size_t n, size_t size) { + return reallocz(ptr, n * size); +} + +void free(void *ptr) { + freez(ptr); +} + +char *strdup(const char *s) { + return strdupz(s); +} + +char *strndup(const char *s, size_t len) { + return strndupz(s, len); +} + +size_t malloc_usable_size(void *ptr) { + return mallocz_usable_size(ptr); +} +#else // !HAVE_DLSYM + +static void *(*libc_malloc)(size_t) = malloc; +static void *(*libc_calloc)(size_t, size_t) = calloc; +static void *(*libc_realloc)(void *, size_t) = realloc; +static void (*libc_free)(void *) = free; + +#ifdef HAVE_MALLOC_USABLE_SIZE +static size_t (*libc_malloc_usable_size)(void *) = 
malloc_usable_size; +#else +static size_t (*libc_malloc_usable_size)(void *) = NULL; +#endif + +#endif // HAVE_DLSYM + + +void posix_memfree(void *ptr) { + libc_free(ptr); +} + +struct malloc_header_signature { + uint32_t magic; + uint32_t size; + struct malloc_trace *trace; +}; + +struct malloc_header { + struct malloc_header_signature signature; + uint8_t padding[(sizeof(struct malloc_header_signature) % MALLOC_ALIGNMENT) ? MALLOC_ALIGNMENT - (sizeof(struct malloc_header_signature) % MALLOC_ALIGNMENT) : 0]; + uint8_t data[]; +}; + +static size_t malloc_header_size = sizeof(struct malloc_header); + +int malloc_trace_compare(void *A, void *B) { + struct malloc_trace *a = A; + struct malloc_trace *b = B; + return strcmp(a->function, b->function); +} + +static avl_tree_lock malloc_trace_index = { + .avl_tree = { + .root = NULL, + .compar = malloc_trace_compare}, + .rwlock = AVL_LOCK_INITIALIZER +}; + +int malloc_trace_walkthrough(int (*callback)(void *item, void *data), void *data) { + return avl_traverse_lock(&malloc_trace_index, callback, data); +} + +NEVERNULL WARNUNUSED +static struct malloc_trace *malloc_trace_find_or_create(const char *file, const char *function, size_t line) { + struct malloc_trace tmp = { + .line = line, + .function = function, + .file = file, + }; + + struct malloc_trace *t = (struct malloc_trace *)avl_search_lock(&malloc_trace_index, (avl_t *)&tmp); + if(!t) { + t = libc_calloc(1, sizeof(struct malloc_trace)); + if(!t) fatal("No memory"); + t->line = line; + t->function = function; + t->file = file; + + struct malloc_trace *t2 = (struct malloc_trace *)avl_insert_lock(&malloc_trace_index, (avl_t *)t); + if(t2 != t) + free(t); + + t = t2; + } + + if(!t) + fatal("Cannot insert to AVL"); + + return t; +} + +void malloc_trace_mmap(size_t size) { + struct malloc_trace *p = malloc_trace_find_or_create("unknown", "netdata_mmap", 1); + size_t_atomic_count(add, p->mmap_calls, 1); + size_t_atomic_count(add, p->allocations, 1); + size_t_atomic_bytes(add, p->bytes, size); +} + +void malloc_trace_munmap(size_t size) { + struct malloc_trace *p = malloc_trace_find_or_create("unknown", "netdata_mmap", 1); + size_t_atomic_count(add, p->munmap_calls, 1); + size_t_atomic_count(sub, p->allocations, 1); + size_t_atomic_bytes(sub, p->bytes, size); +} + +void *mallocz_int(size_t size, const char *file, const char *function, size_t line) { + struct malloc_trace *p = malloc_trace_find_or_create(file, function, line); + + size_t_atomic_count(add, p->malloc_calls, 1); + size_t_atomic_count(add, p->allocations, 1); + size_t_atomic_bytes(add, p->bytes, size); + + struct malloc_header *t = (struct malloc_header *)libc_malloc(malloc_header_size + size); + if (unlikely(!t)) fatal("mallocz() cannot allocate %zu bytes of memory (%zu with header).", size, malloc_header_size + size); + t->signature.magic = 0x0BADCAFE; + t->signature.trace = p; + t->signature.size = size; + +#ifdef NETDATA_INTERNAL_CHECKS + for(ssize_t i = 0; i < (ssize_t)sizeof(t->padding) ;i++) // signed to avoid compiler warning when zero-padded + t->padding[i] = 0xFF; +#endif + + return (void *)&t->data; +} + +void *callocz_int(size_t nmemb, size_t size, const char *file, const char *function, size_t line) { + struct malloc_trace *p = malloc_trace_find_or_create(file, function, line); + size = nmemb * size; + + size_t_atomic_count(add, p->calloc_calls, 1); + size_t_atomic_count(add, p->allocations, 1); + size_t_atomic_bytes(add, p->bytes, size); + + struct malloc_header *t = (struct malloc_header *)libc_calloc(1, malloc_header_size 
+ size); + if (unlikely(!t)) fatal("mallocz() cannot allocate %zu bytes of memory (%zu with header).", size, malloc_header_size + size); + t->signature.magic = 0x0BADCAFE; + t->signature.trace = p; + t->signature.size = size; + +#ifdef NETDATA_INTERNAL_CHECKS + for(ssize_t i = 0; i < (ssize_t)sizeof(t->padding) ;i++) // signed to avoid compiler warning when zero-padded + t->padding[i] = 0xFF; +#endif + + return &t->data; +} + +char *strdupz_int(const char *s, const char *file, const char *function, size_t line) { + struct malloc_trace *p = malloc_trace_find_or_create(file, function, line); + size_t size = strlen(s) + 1; + + size_t_atomic_count(add, p->strdup_calls, 1); + size_t_atomic_count(add, p->allocations, 1); + size_t_atomic_bytes(add, p->bytes, size); + + struct malloc_header *t = (struct malloc_header *)libc_malloc(malloc_header_size + size); + if (unlikely(!t)) fatal("strdupz() cannot allocate %zu bytes of memory (%zu with header).", size, malloc_header_size + size); + t->signature.magic = 0x0BADCAFE; + t->signature.trace = p; + t->signature.size = size; + +#ifdef NETDATA_INTERNAL_CHECKS + for(ssize_t i = 0; i < (ssize_t)sizeof(t->padding) ;i++) // signed to avoid compiler warning when zero-padded + t->padding[i] = 0xFF; +#endif + + memcpy(&t->data, s, size); + return (char *)&t->data; +} + +char *strndupz_int(const char *s, size_t len, const char *file, const char *function, size_t line) { + struct malloc_trace *p = malloc_trace_find_or_create(file, function, line); + size_t size = len + 1; + + size_t_atomic_count(add, p->strdup_calls, 1); + size_t_atomic_count(add, p->allocations, 1); + size_t_atomic_bytes(add, p->bytes, size); + + struct malloc_header *t = (struct malloc_header *)libc_malloc(malloc_header_size + size); + if (unlikely(!t)) fatal("strndupz() cannot allocate %zu bytes of memory (%zu with header).", size, malloc_header_size + size); + t->signature.magic = 0x0BADCAFE; + t->signature.trace = p; + t->signature.size = size; + +#ifdef NETDATA_INTERNAL_CHECKS + for(ssize_t i = 0; i < (ssize_t)sizeof(t->padding) ;i++) // signed to avoid compiler warning when zero-padded + t->padding[i] = 0xFF; +#endif + + memcpy(&t->data, s, size); + t->data[len] = '\0'; + return (char *)&t->data; +} + +static struct malloc_header *malloc_get_header(void *ptr, const char *caller, const char *file, const char *function, size_t line) { + uint8_t *ret = (uint8_t *)ptr - malloc_header_size; + struct malloc_header *t = (struct malloc_header *)ret; + + if(t->signature.magic != 0x0BADCAFE) { + netdata_log_error("pointer %p is not our pointer (called %s() from %zu@%s, %s()).", ptr, caller, line, file, function); + return NULL; + } + + return t; +} + +void *reallocz_int(void *ptr, size_t size, const char *file, const char *function, size_t line) { + if(!ptr) return mallocz_int(size, file, function, line); + + struct malloc_header *t = malloc_get_header(ptr, __FUNCTION__, file, function, line); + if(!t) + return libc_realloc(ptr, size); + + if(t->signature.size == size) return ptr; + size_t_atomic_count(add, t->signature.trace->free_calls, 1); + size_t_atomic_count(sub, t->signature.trace->allocations, 1); + size_t_atomic_bytes(sub, t->signature.trace->bytes, t->signature.size); + + struct malloc_trace *p = malloc_trace_find_or_create(file, function, line); + size_t_atomic_count(add, p->realloc_calls, 1); + size_t_atomic_count(add, p->allocations, 1); + size_t_atomic_bytes(add, p->bytes, size); + + t = (struct malloc_header *)libc_realloc(t, malloc_header_size + size); + if (unlikely(!t)) 
fatal("reallocz() cannot allocate %zu bytes of memory (%zu with header).", size, malloc_header_size + size); + t->signature.magic = 0x0BADCAFE; + t->signature.trace = p; + t->signature.size = size; + +#ifdef NETDATA_INTERNAL_CHECKS + for(ssize_t i = 0; i < (ssize_t)sizeof(t->padding) ;i++) // signed to avoid compiler warning when zero-padded + t->padding[i] = 0xFF; +#endif + + return (void *)&t->data; +} + +size_t mallocz_usable_size_int(void *ptr, const char *file, const char *function, size_t line) { + if(unlikely(!ptr)) return 0; + + struct malloc_header *t = malloc_get_header(ptr, __FUNCTION__, file, function, line); + if(!t) { + if(libc_malloc_usable_size) + return libc_malloc_usable_size(ptr); + else + return 0; + } + + return t->signature.size; +} + +void freez_int(void *ptr, const char *file, const char *function, size_t line) { + if(unlikely(!ptr)) return; + + struct malloc_header *t = malloc_get_header(ptr, __FUNCTION__, file, function, line); + if(!t) { + libc_free(ptr); + return; + } + + size_t_atomic_count(add, t->signature.trace->free_calls, 1); + size_t_atomic_count(sub, t->signature.trace->allocations, 1); + size_t_atomic_bytes(sub, t->signature.trace->bytes, t->signature.size); + +#ifdef NETDATA_INTERNAL_CHECKS + // it should crash if it is used after freeing it + memset(t, 0, malloc_header_size + t->signature.size); +#endif + + libc_free(t); +} +#else + +char *strdupz(const char *s) { + char *t = strdup(s); + if (unlikely(!t)) fatal("Cannot strdup() string '%s'", s); + return t; +} + +char *strndupz(const char *s, size_t len) { + char *t = strndup(s, len); + if (unlikely(!t)) fatal("Cannot strndup() string '%s' of len %zu", s, len); + return t; +} + +// If ptr is NULL, no operation is performed. +void freez(void *ptr) { + free(ptr); +} + +void *mallocz(size_t size) { + void *p = malloc(size); + if (unlikely(!p)) fatal("Cannot allocate %zu bytes of memory.", size); + return p; +} + +void *callocz(size_t nmemb, size_t size) { + void *p = calloc(nmemb, size); + if (unlikely(!p)) fatal("Cannot allocate %zu bytes of memory.", nmemb * size); + return p; +} + +void *reallocz(void *ptr, size_t size) { + void *p = realloc(ptr, size); + if (unlikely(!p)) fatal("Cannot re-allocate memory to %zu bytes.", size); + return p; +} + +void posix_memfree(void *ptr) { + free(ptr); +} + +#endif + +// -------------------------------------------------------------------------------------------------------------------- + +void json_escape_string(char *dst, const char *src, size_t size) { + const char *t; + char *d = dst, *e = &dst[size - 1]; + + for(t = src; *t && d < e ;t++) { + if(unlikely(*t == '\\' || *t == '"')) { + if(unlikely(d + 1 >= e)) break; + *d++ = '\\'; + } + *d++ = *t; + } + + *d = '\0'; +} + +void json_fix_string(char *s) { + unsigned char c; + while((c = (unsigned char)*s)) { + if(unlikely(c == '\\')) + *s++ = '/'; + else if(unlikely(c == '"')) + *s++ = '\''; + else if(unlikely(isspace(c) || iscntrl(c))) + *s++ = ' '; + else if(unlikely(!isprint(c) || c > 127)) + *s++ = '_'; + else + s++; + } +} + +unsigned char netdata_map_chart_names[256] = { + [0] = '\0', // + [1] = '_', // + [2] = '_', // + [3] = '_', // + [4] = '_', // + [5] = '_', // + [6] = '_', // + [7] = '_', // + [8] = '_', // + [9] = '_', // + [10] = '_', // + [11] = '_', // + [12] = '_', // + [13] = '_', // + [14] = '_', // + [15] = '_', // + [16] = '_', // + [17] = '_', // + [18] = '_', // + [19] = '_', // + [20] = '_', // + [21] = '_', // + [22] = '_', // + [23] = '_', // + [24] = '_', // + [25] = '_', // + [26] = 
'_', // + [27] = '_', // + [28] = '_', // + [29] = '_', // + [30] = '_', // + [31] = '_', // + [32] = '_', // + [33] = '_', // ! + [34] = '_', // " + [35] = '_', // # + [36] = '_', // $ + [37] = '_', // % + [38] = '_', // & + [39] = '_', // ' + [40] = '_', // ( + [41] = '_', // ) + [42] = '_', // * + [43] = '_', // + + [44] = '.', // , + [45] = '-', // - + [46] = '.', // . + [47] = '/', // / + [48] = '0', // 0 + [49] = '1', // 1 + [50] = '2', // 2 + [51] = '3', // 3 + [52] = '4', // 4 + [53] = '5', // 5 + [54] = '6', // 6 + [55] = '7', // 7 + [56] = '8', // 8 + [57] = '9', // 9 + [58] = '_', // : + [59] = '_', // ; + [60] = '_', // < + [61] = '_', // = + [62] = '_', // > + [63] = '_', // ? + [64] = '_', // @ + [65] = 'a', // A + [66] = 'b', // B + [67] = 'c', // C + [68] = 'd', // D + [69] = 'e', // E + [70] = 'f', // F + [71] = 'g', // G + [72] = 'h', // H + [73] = 'i', // I + [74] = 'j', // J + [75] = 'k', // K + [76] = 'l', // L + [77] = 'm', // M + [78] = 'n', // N + [79] = 'o', // O + [80] = 'p', // P + [81] = 'q', // Q + [82] = 'r', // R + [83] = 's', // S + [84] = 't', // T + [85] = 'u', // U + [86] = 'v', // V + [87] = 'w', // W + [88] = 'x', // X + [89] = 'y', // Y + [90] = 'z', // Z + [91] = '_', // [ + [92] = '/', // backslash + [93] = '_', // ] + [94] = '_', // ^ + [95] = '_', // _ + [96] = '_', // ` + [97] = 'a', // a + [98] = 'b', // b + [99] = 'c', // c + [100] = 'd', // d + [101] = 'e', // e + [102] = 'f', // f + [103] = 'g', // g + [104] = 'h', // h + [105] = 'i', // i + [106] = 'j', // j + [107] = 'k', // k + [108] = 'l', // l + [109] = 'm', // m + [110] = 'n', // n + [111] = 'o', // o + [112] = 'p', // p + [113] = 'q', // q + [114] = 'r', // r + [115] = 's', // s + [116] = 't', // t + [117] = 'u', // u + [118] = 'v', // v + [119] = 'w', // w + [120] = 'x', // x + [121] = 'y', // y + [122] = 'z', // z + [123] = '_', // { + [124] = '_', // | + [125] = '_', // } + [126] = '_', // ~ + [127] = '_', // + [128] = '_', // + [129] = '_', // + [130] = '_', // + [131] = '_', // + [132] = '_', // + [133] = '_', // + [134] = '_', // + [135] = '_', // + [136] = '_', // + [137] = '_', // + [138] = '_', // + [139] = '_', // + [140] = '_', // + [141] = '_', // + [142] = '_', // + [143] = '_', // + [144] = '_', // + [145] = '_', // + [146] = '_', // + [147] = '_', // + [148] = '_', // + [149] = '_', // + [150] = '_', // + [151] = '_', // + [152] = '_', // + [153] = '_', // + [154] = '_', // + [155] = '_', // + [156] = '_', // + [157] = '_', // + [158] = '_', // + [159] = '_', // + [160] = '_', // + [161] = '_', // + [162] = '_', // + [163] = '_', // + [164] = '_', // + [165] = '_', // + [166] = '_', // + [167] = '_', // + [168] = '_', // + [169] = '_', // + [170] = '_', // + [171] = '_', // + [172] = '_', // + [173] = '_', // + [174] = '_', // + [175] = '_', // + [176] = '_', // + [177] = '_', // + [178] = '_', // + [179] = '_', // + [180] = '_', // + [181] = '_', // + [182] = '_', // + [183] = '_', // + [184] = '_', // + [185] = '_', // + [186] = '_', // + [187] = '_', // + [188] = '_', // + [189] = '_', // + [190] = '_', // + [191] = '_', // + [192] = '_', // + [193] = '_', // + [194] = '_', // + [195] = '_', // + [196] = '_', // + [197] = '_', // + [198] = '_', // + [199] = '_', // + [200] = '_', // + [201] = '_', // + [202] = '_', // + [203] = '_', // + [204] = '_', // + [205] = '_', // + [206] = '_', // + [207] = '_', // + [208] = '_', // + [209] = '_', // + [210] = '_', // + [211] = '_', // + [212] = '_', // + [213] = '_', // + [214] = '_', // + [215] = '_', // + [216] = '_', // + 
[217] = '_', // + [218] = '_', // + [219] = '_', // + [220] = '_', // + [221] = '_', // + [222] = '_', // + [223] = '_', // + [224] = '_', // + [225] = '_', // + [226] = '_', // + [227] = '_', // + [228] = '_', // + [229] = '_', // + [230] = '_', // + [231] = '_', // + [232] = '_', // + [233] = '_', // + [234] = '_', // + [235] = '_', // + [236] = '_', // + [237] = '_', // + [238] = '_', // + [239] = '_', // + [240] = '_', // + [241] = '_', // + [242] = '_', // + [243] = '_', // + [244] = '_', // + [245] = '_', // + [246] = '_', // + [247] = '_', // + [248] = '_', // + [249] = '_', // + [250] = '_', // + [251] = '_', // + [252] = '_', // + [253] = '_', // + [254] = '_', // + [255] = '_' // +}; + +// make sure the supplied string +// is good for a netdata chart/dimension ID/NAME +void netdata_fix_chart_name(char *s) { + while ((*s = netdata_map_chart_names[(unsigned char) *s])) s++; +} + +unsigned char netdata_map_chart_ids[256] = { + [0] = '\0', // + [1] = '_', // + [2] = '_', // + [3] = '_', // + [4] = '_', // + [5] = '_', // + [6] = '_', // + [7] = '_', // + [8] = '_', // + [9] = '_', // + [10] = '_', // + [11] = '_', // + [12] = '_', // + [13] = '_', // + [14] = '_', // + [15] = '_', // + [16] = '_', // + [17] = '_', // + [18] = '_', // + [19] = '_', // + [20] = '_', // + [21] = '_', // + [22] = '_', // + [23] = '_', // + [24] = '_', // + [25] = '_', // + [26] = '_', // + [27] = '_', // + [28] = '_', // + [29] = '_', // + [30] = '_', // + [31] = '_', // + [32] = '_', // + [33] = '_', // ! + [34] = '_', // " + [35] = '_', // # + [36] = '_', // $ + [37] = '_', // % + [38] = '_', // & + [39] = '_', // ' + [40] = '_', // ( + [41] = '_', // ) + [42] = '_', // * + [43] = '_', // + + [44] = '.', // , + [45] = '-', // - + [46] = '.', // . + [47] = '_', // / + [48] = '0', // 0 + [49] = '1', // 1 + [50] = '2', // 2 + [51] = '3', // 3 + [52] = '4', // 4 + [53] = '5', // 5 + [54] = '6', // 6 + [55] = '7', // 7 + [56] = '8', // 8 + [57] = '9', // 9 + [58] = '_', // : + [59] = '_', // ; + [60] = '_', // < + [61] = '_', // = + [62] = '_', // > + [63] = '_', // ? 
+ [64] = '_', // @ + [65] = 'a', // A + [66] = 'b', // B + [67] = 'c', // C + [68] = 'd', // D + [69] = 'e', // E + [70] = 'f', // F + [71] = 'g', // G + [72] = 'h', // H + [73] = 'i', // I + [74] = 'j', // J + [75] = 'k', // K + [76] = 'l', // L + [77] = 'm', // M + [78] = 'n', // N + [79] = 'o', // O + [80] = 'p', // P + [81] = 'q', // Q + [82] = 'r', // R + [83] = 's', // S + [84] = 't', // T + [85] = 'u', // U + [86] = 'v', // V + [87] = 'w', // W + [88] = 'x', // X + [89] = 'y', // Y + [90] = 'z', // Z + [91] = '_', // [ + [92] = '_', // backslash + [93] = '_', // ] + [94] = '_', // ^ + [95] = '_', // _ + [96] = '_', // ` + [97] = 'a', // a + [98] = 'b', // b + [99] = 'c', // c + [100] = 'd', // d + [101] = 'e', // e + [102] = 'f', // f + [103] = 'g', // g + [104] = 'h', // h + [105] = 'i', // i + [106] = 'j', // j + [107] = 'k', // k + [108] = 'l', // l + [109] = 'm', // m + [110] = 'n', // n + [111] = 'o', // o + [112] = 'p', // p + [113] = 'q', // q + [114] = 'r', // r + [115] = 's', // s + [116] = 't', // t + [117] = 'u', // u + [118] = 'v', // v + [119] = 'w', // w + [120] = 'x', // x + [121] = 'y', // y + [122] = 'z', // z + [123] = '_', // { + [124] = '_', // | + [125] = '_', // } + [126] = '_', // ~ + [127] = '_', // + [128] = '_', // + [129] = '_', // + [130] = '_', // + [131] = '_', // + [132] = '_', // + [133] = '_', // + [134] = '_', // + [135] = '_', // + [136] = '_', // + [137] = '_', // + [138] = '_', // + [139] = '_', // + [140] = '_', // + [141] = '_', // + [142] = '_', // + [143] = '_', // + [144] = '_', // + [145] = '_', // + [146] = '_', // + [147] = '_', // + [148] = '_', // + [149] = '_', // + [150] = '_', // + [151] = '_', // + [152] = '_', // + [153] = '_', // + [154] = '_', // + [155] = '_', // + [156] = '_', // + [157] = '_', // + [158] = '_', // + [159] = '_', // + [160] = '_', // + [161] = '_', // + [162] = '_', // + [163] = '_', // + [164] = '_', // + [165] = '_', // + [166] = '_', // + [167] = '_', // + [168] = '_', // + [169] = '_', // + [170] = '_', // + [171] = '_', // + [172] = '_', // + [173] = '_', // + [174] = '_', // + [175] = '_', // + [176] = '_', // + [177] = '_', // + [178] = '_', // + [179] = '_', // + [180] = '_', // + [181] = '_', // + [182] = '_', // + [183] = '_', // + [184] = '_', // + [185] = '_', // + [186] = '_', // + [187] = '_', // + [188] = '_', // + [189] = '_', // + [190] = '_', // + [191] = '_', // + [192] = '_', // + [193] = '_', // + [194] = '_', // + [195] = '_', // + [196] = '_', // + [197] = '_', // + [198] = '_', // + [199] = '_', // + [200] = '_', // + [201] = '_', // + [202] = '_', // + [203] = '_', // + [204] = '_', // + [205] = '_', // + [206] = '_', // + [207] = '_', // + [208] = '_', // + [209] = '_', // + [210] = '_', // + [211] = '_', // + [212] = '_', // + [213] = '_', // + [214] = '_', // + [215] = '_', // + [216] = '_', // + [217] = '_', // + [218] = '_', // + [219] = '_', // + [220] = '_', // + [221] = '_', // + [222] = '_', // + [223] = '_', // + [224] = '_', // + [225] = '_', // + [226] = '_', // + [227] = '_', // + [228] = '_', // + [229] = '_', // + [230] = '_', // + [231] = '_', // + [232] = '_', // + [233] = '_', // + [234] = '_', // + [235] = '_', // + [236] = '_', // + [237] = '_', // + [238] = '_', // + [239] = '_', // + [240] = '_', // + [241] = '_', // + [242] = '_', // + [243] = '_', // + [244] = '_', // + [245] = '_', // + [246] = '_', // + [247] = '_', // + [248] = '_', // + [249] = '_', // + [250] = '_', // + [251] = '_', // + [252] = '_', // + [253] = '_', // + [254] = '_', // + [255] = '_' // 
+}; + +// make sure the supplied string +// is good for a netdata chart/dimension ID/NAME +void netdata_fix_chart_id(char *s) { + while ((*s = netdata_map_chart_ids[(unsigned char) *s])) s++; +} + +static int memory_file_open(const char *filename, size_t size) { + // netdata_log_info("memory_file_open('%s', %zu", filename, size); + + int fd = open(filename, O_RDWR | O_CREAT | O_NOATIME | O_CLOEXEC, 0664); + if (fd != -1) { + if (lseek(fd, size, SEEK_SET) == (off_t) size) { + if (write(fd, "", 1) == 1) { + if (ftruncate(fd, size)) + netdata_log_error("Cannot truncate file '%s' to size %zu. Will use the larger file.", filename, size); + } + else + netdata_log_error("Cannot write to file '%s' at position %zu.", filename, size); + } + else + netdata_log_error("Cannot seek file '%s' to size %zu.", filename, size); + } + else + netdata_log_error("Cannot create/open file '%s'.", filename); + + return fd; +} + +inline int madvise_sequential(void *mem, size_t len) { + static int logger = 1; + int ret = madvise(mem, len, MADV_SEQUENTIAL); + + if (ret != 0 && logger-- > 0) + netdata_log_error("madvise(MADV_SEQUENTIAL) failed."); + return ret; +} + +inline int madvise_random(void *mem, size_t len) { + static int logger = 1; + int ret = madvise(mem, len, MADV_RANDOM); + + if (ret != 0 && logger-- > 0) + netdata_log_error("madvise(MADV_RANDOM) failed."); + return ret; +} + +inline int madvise_dontfork(void *mem, size_t len) { + static int logger = 1; + int ret = madvise(mem, len, MADV_DONTFORK); + + if (ret != 0 && logger-- > 0) + netdata_log_error("madvise(MADV_DONTFORK) failed."); + return ret; +} + +inline int madvise_willneed(void *mem, size_t len) { + static int logger = 1; + int ret = madvise(mem, len, MADV_WILLNEED); + + if (ret != 0 && logger-- > 0) + netdata_log_error("madvise(MADV_WILLNEED) failed."); + return ret; +} + +inline int madvise_dontneed(void *mem, size_t len) { + static int logger = 1; + int ret = madvise(mem, len, MADV_DONTNEED); + + if (ret != 0 && logger-- > 0) + netdata_log_error("madvise(MADV_DONTNEED) failed."); + return ret; +} + +inline int madvise_dontdump(void *mem __maybe_unused, size_t len __maybe_unused) { +#if __linux__ + static int logger = 1; + int ret = madvise(mem, len, MADV_DONTDUMP); + + if (ret != 0 && logger-- > 0) + netdata_log_error("madvise(MADV_DONTDUMP) failed."); + return ret; +#else + return 0; +#endif +} + +inline int madvise_mergeable(void *mem __maybe_unused, size_t len __maybe_unused) { +#ifdef MADV_MERGEABLE + static int logger = 1; + int ret = madvise(mem, len, MADV_MERGEABLE); + + if (ret != 0 && logger-- > 0) + netdata_log_error("madvise(MADV_MERGEABLE) failed."); + return ret; +#else + return 0; +#endif +} + +void *netdata_mmap(const char *filename, size_t size, int flags, int ksm, bool read_only, int *open_fd) +{ + // netdata_log_info("netdata_mmap('%s', %zu", filename, size); + + // MAP_SHARED is used in memory mode map + // MAP_PRIVATE is used in memory mode ram and save + + if(unlikely(!(flags & MAP_SHARED) && !(flags & MAP_PRIVATE))) + fatal("Neither MAP_SHARED or MAP_PRIVATE were given to netdata_mmap()"); + + if(unlikely((flags & MAP_SHARED) && (flags & MAP_PRIVATE))) + fatal("Both MAP_SHARED and MAP_PRIVATE were given to netdata_mmap()"); + + if(unlikely((flags & MAP_SHARED) && (!filename || !*filename))) + fatal("MAP_SHARED requested, without a filename to netdata_mmap()"); + + // don't enable ksm is the global setting is disabled + if(unlikely(!enable_ksm)) ksm = 0; + + // KSM only merges anonymous (private) pages, never pagecache 
(file) pages + // but MAP_PRIVATE without MAP_ANONYMOUS it fails too, so we need it always + if((flags & MAP_PRIVATE)) flags |= MAP_ANONYMOUS; + + int fd = -1; + void *mem = MAP_FAILED; + + if(filename && *filename) { + // open/create the file to be used + fd = memory_file_open(filename, size); + if(fd == -1) goto cleanup; + } + + int fd_for_mmap = fd; + if(fd != -1 && (flags & MAP_PRIVATE)) { + // this is MAP_PRIVATE allocation + // no need for mmap() to use our fd + // we will copy the file into the memory allocated + fd_for_mmap = -1; + } + + mem = mmap(NULL, size, read_only ? PROT_READ : PROT_READ | PROT_WRITE, flags, fd_for_mmap, 0); + if (mem != MAP_FAILED) { + +#ifdef NETDATA_TRACE_ALLOCATIONS + malloc_trace_mmap(size); +#endif + + // if we have a file open, but we didn't give it to mmap(), + // we have to read the file into the memory block we allocated + if(fd != -1 && fd_for_mmap == -1) { + if (lseek(fd, 0, SEEK_SET) == 0) { + if (read(fd, mem, size) != (ssize_t) size) + netdata_log_info("Cannot read from file '%s'", filename); + } + else netdata_log_info("Cannot seek to beginning of file '%s'.", filename); + } + + // madvise_sequential(mem, size); + madvise_dontfork(mem, size); + madvise_dontdump(mem, size); + // if(flags & MAP_SHARED) madvise_willneed(mem, size); + if(ksm) madvise_mergeable(mem, size); + } + +cleanup: + if(fd != -1) { + if (open_fd) + *open_fd = fd; + else + close(fd); + } + if(mem == MAP_FAILED) return NULL; + errno = 0; + return mem; +} + +int netdata_munmap(void *ptr, size_t size) { +#ifdef NETDATA_TRACE_ALLOCATIONS + malloc_trace_munmap(size); +#endif + return munmap(ptr, size); +} + +char *fgets_trim_len(char *buf, size_t buf_size, FILE *fp, size_t *len) { + char *s = fgets(buf, (int)buf_size, fp); + if (!s) return NULL; + + char *t = s; + if (*t != '\0') { + // find the string end + while (*++t != '\0'); + + // trim trailing spaces/newlines/tabs + while (--t > s && *t == '\n') + *t = '\0'; + } + + if (len) + *len = t - s + 1; + + return s; +} + +// vsnprintfz() returns the number of bytes actually written - after possible truncation +int vsnprintfz(char *dst, size_t n, const char *fmt, va_list args) { + if(unlikely(!n)) return 0; + + int size = vsnprintf(dst, n, fmt, args); + dst[n - 1] = '\0'; + + if (unlikely((size_t) size >= n)) size = (int)(n - 1); + + return size; +} + +// snprintfz() returns the number of bytes actually written - after possible truncation +int snprintfz(char *dst, size_t n, const char *fmt, ...) 
{ + va_list args; + + va_start(args, fmt); + int ret = vsnprintfz(dst, n, fmt, args); + va_end(args); + + return ret; +} + +static int is_procfs(const char *path, char **reason) { +#if defined(__APPLE__) || defined(__FreeBSD__) + (void)path; + (void)reason; +#else + struct statfs stat; + + if (statfs(path, &stat) == -1) { + if (reason) + *reason = "failed to statfs()"; + return -1; + } + +#if defined PROC_SUPER_MAGIC + if (stat.f_type != PROC_SUPER_MAGIC) { + if (reason) + *reason = "type is not procfs"; + return -1; + } +#endif + +#endif + + return 0; +} + +static int is_sysfs(const char *path, char **reason) { +#if defined(__APPLE__) || defined(__FreeBSD__) + (void)path; + (void)reason; +#else + struct statfs stat; + + if (statfs(path, &stat) == -1) { + if (reason) + *reason = "failed to statfs()"; + return -1; + } + +#if defined SYSFS_MAGIC + if (stat.f_type != SYSFS_MAGIC) { + if (reason) + *reason = "type is not sysfs"; + return -1; + } +#endif + +#endif + + return 0; +} + +int verify_netdata_host_prefix(bool log_msg) { + if(!netdata_configured_host_prefix) + netdata_configured_host_prefix = ""; + + if(!*netdata_configured_host_prefix) + return 0; + + char buffer[FILENAME_MAX + 1]; + char *path = netdata_configured_host_prefix; + char *reason = "unknown reason"; + errno = 0; + + struct stat sb; + if (stat(path, &sb) == -1) { + reason = "failed to stat()"; + goto failed; + } + + if((sb.st_mode & S_IFMT) != S_IFDIR) { + errno = EINVAL; + reason = "is not a directory"; + goto failed; + } + + path = buffer; + snprintfz(path, FILENAME_MAX, "%s/proc", netdata_configured_host_prefix); + if(is_procfs(path, &reason) == -1) + goto failed; + + snprintfz(path, FILENAME_MAX, "%s/sys", netdata_configured_host_prefix); + if(is_sysfs(path, &reason) == -1) + goto failed; + + if (netdata_configured_host_prefix && *netdata_configured_host_prefix) { + if (log_msg) + netdata_log_info("Using host prefix directory '%s'", netdata_configured_host_prefix); + } + + return 0; + +failed: + if (log_msg) + netdata_log_error("Ignoring host prefix '%s': path '%s' %s", netdata_configured_host_prefix, path, reason); + netdata_configured_host_prefix = ""; + return -1; +} + +char *strdupz_path_subpath(const char *path, const char *subpath) { + if(unlikely(!path || !*path)) path = "."; + if(unlikely(!subpath)) subpath = ""; + + // skip trailing slashes in path + size_t len = strlen(path); + while(len > 0 && path[len - 1] == '/') len--; + + // skip leading slashes in subpath + while(subpath[0] == '/') subpath++; + + // if the last character in path is / and (there is a subpath or path is now empty) + // keep the trailing slash in path and remove the additional slash + char *slash = "/"; + if(path[len] == '/' && (*subpath || len == 0)) { + slash = ""; + len++; + } + else if(!*subpath) { + // there is no subpath + // no need for trailing slash + slash = ""; + } + + char buffer[FILENAME_MAX + 1]; + snprintfz(buffer, FILENAME_MAX, "%.*s%s%s", (int)len, path, slash, subpath); + return strdupz(buffer); +} + +int path_is_dir(const char *path, const char *subpath) { + char *s = strdupz_path_subpath(path, subpath); + + size_t max_links = 100; + + int is_dir = 0; + struct stat statbuf; + while(max_links-- && stat(s, &statbuf) == 0) { + if((statbuf.st_mode & S_IFMT) == S_IFDIR) { + is_dir = 1; + break; + } + else if((statbuf.st_mode & S_IFMT) == S_IFLNK) { + char buffer[FILENAME_MAX + 1]; + ssize_t l = readlink(s, buffer, FILENAME_MAX); + if(l > 0) { + buffer[l] = '\0'; + freez(s); + s = strdupz(buffer); + continue; + } + else { + 
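+                // readlink() failed - we cannot resolve the symlink, so do not treat it as a directory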
is_dir = 0; + break; + } + } + else { + is_dir = 0; + break; + } + } + + freez(s); + return is_dir; +} + +int path_is_file(const char *path, const char *subpath) { + char *s = strdupz_path_subpath(path, subpath); + + size_t max_links = 100; + + int is_file = 0; + struct stat statbuf; + while(max_links-- && stat(s, &statbuf) == 0) { + if((statbuf.st_mode & S_IFMT) == S_IFREG) { + is_file = 1; + break; + } + else if((statbuf.st_mode & S_IFMT) == S_IFLNK) { + char buffer[FILENAME_MAX + 1]; + ssize_t l = readlink(s, buffer, FILENAME_MAX); + if(l > 0) { + buffer[l] = '\0'; + freez(s); + s = strdupz(buffer); + continue; + } + else { + is_file = 0; + break; + } + } + else { + is_file = 0; + break; + } + } + + freez(s); + return is_file; +} + +void recursive_config_double_dir_load(const char *user_path, const char *stock_path, const char *subpath, int (*callback)(const char *filename, void *data, bool stock_config), void *data, size_t depth) { + if(depth > 3) { + netdata_log_error("CONFIG: Max directory depth reached while reading user path '%s', stock path '%s', subpath '%s'", user_path, stock_path, subpath); + return; + } + + if(!stock_path) + stock_path = user_path; + + char *udir = strdupz_path_subpath(user_path, subpath); + char *sdir = strdupz_path_subpath(stock_path, subpath); + + netdata_log_debug(D_HEALTH, "CONFIG traversing user-config directory '%s', stock config directory '%s'", udir, sdir); + + DIR *dir = opendir(udir); + if (!dir) { + netdata_log_error("CONFIG cannot open user-config directory '%s'.", udir); + } + else { + struct dirent *de = NULL; + while((de = readdir(dir))) { + if(de->d_type == DT_DIR || de->d_type == DT_LNK) { + if( !de->d_name[0] || + (de->d_name[0] == '.' && de->d_name[1] == '\0') || + (de->d_name[0] == '.' && de->d_name[1] == '.' && de->d_name[2] == '\0') + ) { + netdata_log_debug(D_HEALTH, "CONFIG ignoring user-config directory '%s/%s'", udir, de->d_name); + continue; + } + + if(path_is_dir(udir, de->d_name)) { + recursive_config_double_dir_load(udir, sdir, de->d_name, callback, data, depth + 1); + continue; + } + } + + if(de->d_type == DT_UNKNOWN || de->d_type == DT_REG || de->d_type == DT_LNK) { + size_t len = strlen(de->d_name); + if(path_is_file(udir, de->d_name) && + len > 5 && !strcmp(&de->d_name[len - 5], ".conf")) { + char *filename = strdupz_path_subpath(udir, de->d_name); + netdata_log_debug(D_HEALTH, "CONFIG calling callback for user file '%s'", filename); + callback(filename, data, false); + freez(filename); + continue; + } + } + + netdata_log_debug(D_HEALTH, "CONFIG ignoring user-config file '%s/%s' of type %d", udir, de->d_name, (int)de->d_type); + } + + closedir(dir); + } + + netdata_log_debug(D_HEALTH, "CONFIG traversing stock config directory '%s', user config directory '%s'", sdir, udir); + + dir = opendir(sdir); + if (!dir) { + netdata_log_error("CONFIG cannot open stock config directory '%s'.", sdir); + } + else { + if (strcmp(udir, sdir)) { + struct dirent *de = NULL; + while((de = readdir(dir))) { + if(de->d_type == DT_DIR || de->d_type == DT_LNK) { + if( !de->d_name[0] || + (de->d_name[0] == '.' && de->d_name[1] == '\0') || + (de->d_name[0] == '.' && de->d_name[1] == '.' 
&& de->d_name[2] == '\0') + ) { + netdata_log_debug(D_HEALTH, "CONFIG ignoring stock config directory '%s/%s'", sdir, de->d_name); + continue; + } + + if(path_is_dir(sdir, de->d_name)) { + // we recurse in stock subdirectory, only when there is no corresponding + // user subdirectory - to avoid reading the files twice + + if(!path_is_dir(udir, de->d_name)) + recursive_config_double_dir_load(udir, sdir, de->d_name, callback, data, depth + 1); + + continue; + } + } + + if(de->d_type == DT_UNKNOWN || de->d_type == DT_REG || de->d_type == DT_LNK) { + size_t len = strlen(de->d_name); + if(path_is_file(sdir, de->d_name) && !path_is_file(udir, de->d_name) && + len > 5 && !strcmp(&de->d_name[len - 5], ".conf")) { + char *filename = strdupz_path_subpath(sdir, de->d_name); + netdata_log_debug(D_HEALTH, "CONFIG calling callback for stock file '%s'", filename); + callback(filename, data, true); + freez(filename); + continue; + } + + } + + netdata_log_debug(D_HEALTH, "CONFIG ignoring stock-config file '%s/%s' of type %d", udir, de->d_name, (int)de->d_type); + } + } + closedir(dir); + } + + netdata_log_debug(D_HEALTH, "CONFIG done traversing user-config directory '%s', stock config directory '%s'", udir, sdir); + + freez(udir); + freez(sdir); +} + +// Returns the number of bytes read from the file if file_size is not NULL. +// The actual buffer has an extra byte set to zero (not included in the count). +char *read_by_filename(const char *filename, long *file_size) +{ + FILE *f = fopen(filename, "r"); + if (!f) + return NULL; + if (fseek(f, 0, SEEK_END) < 0) { + fclose(f); + return NULL; + } + long size = ftell(f); + if (size <= 0 || fseek(f, 0, SEEK_END) < 0) { + fclose(f); + return NULL; + } + char *contents = callocz(size + 1, 1); + if (!contents) { + fclose(f); + return NULL; + } + if (fseek(f, 0, SEEK_SET) < 0) { + fclose(f); + freez(contents); + return NULL; + } + size_t res = fread(contents, 1, size, f); + if ( res != (size_t)size) { + freez(contents); + fclose(f); + return NULL; + } + fclose(f); + if (file_size) + *file_size = size; + return contents; +} + +char *find_and_replace(const char *src, const char *find, const char *replace, const char *where) +{ + size_t size = strlen(src) + 1; + size_t find_len = strlen(find); + size_t repl_len = strlen(replace); + char *value, *dst; + + if (likely(where)) + size += (repl_len - find_len); + + value = mallocz(size); + dst = value; + + if (likely(where)) { + size_t count = where - src; + + memmove(dst, src, count); + src += count; + dst += count; + + memmove(dst, replace, repl_len); + src += find_len; + dst += repl_len; + } + + strcpy(dst, src); + + return value; +} + + +BUFFER *run_command_and_get_output_to_buffer(const char *command, int max_line_length) { + BUFFER *wb = buffer_create(0, NULL); + + pid_t pid; + FILE *fp = netdata_popen(command, &pid, NULL); + + if(fp) { + char buffer[max_line_length + 1]; + while (fgets(buffer, max_line_length, fp)) { + buffer[max_line_length] = '\0'; + buffer_strcat(wb, buffer); + } + } + else { + buffer_free(wb); + netdata_log_error("Failed to execute command '%s'.", command); + return NULL; + } + + netdata_pclose(NULL, fp, pid); + return wb; +} + +bool run_command_and_copy_output_to_stdout(const char *command, int max_line_length) { + pid_t pid; + FILE *fp = netdata_popen(command, &pid, NULL); + + if(fp) { + char buffer[max_line_length + 1]; + while (fgets(buffer, max_line_length, fp)) + fprintf(stdout, "%s", buffer); + } + else { + netdata_log_error("Failed to execute command '%s'.", command); + return false; + } 
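+    // reap the child process and close the pipe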
+ + netdata_pclose(NULL, fp, pid); + return true; +} + + +static int fd_is_valid(int fd) { + return fcntl(fd, F_GETFD) != -1 || errno != EBADF; +} + +void for_each_open_fd(OPEN_FD_ACTION action, OPEN_FD_EXCLUDE excluded_fds){ + int fd; + + switch(action){ + case OPEN_FD_ACTION_CLOSE: + if(!(excluded_fds & OPEN_FD_EXCLUDE_STDIN)) (void)close(STDIN_FILENO); + if(!(excluded_fds & OPEN_FD_EXCLUDE_STDOUT)) (void)close(STDOUT_FILENO); + if(!(excluded_fds & OPEN_FD_EXCLUDE_STDERR)) (void)close(STDERR_FILENO); +#if defined(HAVE_CLOSE_RANGE) + if(close_range(STDERR_FILENO + 1, ~0U, 0) == 0) return; + nd_log(NDLS_DAEMON, NDLP_DEBUG, "close_range() failed, will try to close fds one by one"); +#endif + break; + case OPEN_FD_ACTION_FD_CLOEXEC: + if(!(excluded_fds & OPEN_FD_EXCLUDE_STDIN)) (void)fcntl(STDIN_FILENO, F_SETFD, FD_CLOEXEC); + if(!(excluded_fds & OPEN_FD_EXCLUDE_STDOUT)) (void)fcntl(STDOUT_FILENO, F_SETFD, FD_CLOEXEC); + if(!(excluded_fds & OPEN_FD_EXCLUDE_STDERR)) (void)fcntl(STDERR_FILENO, F_SETFD, FD_CLOEXEC); +#if defined(HAVE_CLOSE_RANGE) && defined(CLOSE_RANGE_CLOEXEC) // Linux >= 5.11, FreeBSD >= 13.1 + if(close_range(STDERR_FILENO + 1, ~0U, CLOSE_RANGE_CLOEXEC) == 0) return; + nd_log(NDLS_DAEMON, NDLP_DEBUG, "close_range() failed, will try to mark fds for closing one by one"); +#endif + break; + default: + break; // do nothing + } + + DIR *dir = opendir("/proc/self/fd"); + if (dir == NULL) { + struct rlimit rl; + int open_max = -1; + + if(getrlimit(RLIMIT_NOFILE, &rl) == 0 && rl.rlim_max != RLIM_INFINITY) open_max = rl.rlim_max; +#ifdef _SC_OPEN_MAX + else open_max = sysconf(_SC_OPEN_MAX); +#endif + + if (open_max == -1) open_max = 65535; // 65535 arbitrary default if everything else fails + + for (fd = STDERR_FILENO + 1; fd < open_max; fd++) { + switch(action){ + case OPEN_FD_ACTION_CLOSE: + if(fd_is_valid(fd)) (void)close(fd); + break; + case OPEN_FD_ACTION_FD_CLOEXEC: + (void)fcntl(fd, F_SETFD, FD_CLOEXEC); + break; + default: + break; // do nothing + } + } + } else { + struct dirent *entry; + while ((entry = readdir(dir)) != NULL) { + fd = str2i(entry->d_name); + if(unlikely((fd == STDIN_FILENO ) || (fd == STDOUT_FILENO) || (fd == STDERR_FILENO) )) continue; + switch(action){ + case OPEN_FD_ACTION_CLOSE: + if(fd_is_valid(fd)) (void)close(fd); + break; + case OPEN_FD_ACTION_FD_CLOEXEC: + (void)fcntl(fd, F_SETFD, FD_CLOEXEC); + break; + default: + break; // do nothing + } + } + closedir(dir); + } +} + +struct timing_steps { + const char *name; + usec_t time; + size_t count; +} timing_steps[TIMING_STEP_MAX + 1] = { + [TIMING_STEP_INTERNAL] = { .name = "internal", .time = 0, }, + + [TIMING_STEP_BEGIN2_PREPARE] = { .name = "BEGIN2 prepare", .time = 0, }, + [TIMING_STEP_BEGIN2_FIND_CHART] = { .name = "BEGIN2 find chart", .time = 0, }, + [TIMING_STEP_BEGIN2_PARSE] = { .name = "BEGIN2 parse", .time = 0, }, + [TIMING_STEP_BEGIN2_ML] = { .name = "BEGIN2 ml", .time = 0, }, + [TIMING_STEP_BEGIN2_PROPAGATE] = { .name = "BEGIN2 propagate", .time = 0, }, + [TIMING_STEP_BEGIN2_STORE] = { .name = "BEGIN2 store", .time = 0, }, + + [TIMING_STEP_SET2_PREPARE] = { .name = "SET2 prepare", .time = 0, }, + [TIMING_STEP_SET2_LOOKUP_DIMENSION] = { .name = "SET2 find dimension", .time = 0, }, + [TIMING_STEP_SET2_PARSE] = { .name = "SET2 parse", .time = 0, }, + [TIMING_STEP_SET2_ML] = { .name = "SET2 ml", .time = 0, }, + [TIMING_STEP_SET2_PROPAGATE] = { .name = "SET2 propagate", .time = 0, }, + [TIMING_STEP_RRDSET_STORE_METRIC] = { .name = "SET2 rrdset store", .time = 0, }, + 
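+    // dbengine sub-steps, measured while SET2 stores points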
[TIMING_STEP_DBENGINE_FIRST_CHECK] = { .name = "db 1st check", .time = 0, }, + [TIMING_STEP_DBENGINE_CHECK_DATA] = { .name = "db check data", .time = 0, }, + [TIMING_STEP_DBENGINE_PACK] = { .name = "db pack", .time = 0, }, + [TIMING_STEP_DBENGINE_PAGE_FIN] = { .name = "db page fin", .time = 0, }, + [TIMING_STEP_DBENGINE_MRG_UPDATE] = { .name = "db mrg update", .time = 0, }, + [TIMING_STEP_DBENGINE_PAGE_ALLOC] = { .name = "db page alloc", .time = 0, }, + [TIMING_STEP_DBENGINE_CREATE_NEW_PAGE] = { .name = "db new page", .time = 0, }, + [TIMING_STEP_DBENGINE_FLUSH_PAGE] = { .name = "db page flush", .time = 0, }, + [TIMING_STEP_SET2_STORE] = { .name = "SET2 store", .time = 0, }, + + [TIMING_STEP_END2_PREPARE] = { .name = "END2 prepare", .time = 0, }, + [TIMING_STEP_END2_PUSH_V1] = { .name = "END2 push v1", .time = 0, }, + [TIMING_STEP_END2_ML] = { .name = "END2 ml", .time = 0, }, + [TIMING_STEP_END2_RRDSET] = { .name = "END2 rrdset", .time = 0, }, + [TIMING_STEP_END2_PROPAGATE] = { .name = "END2 propagate", .time = 0, }, + [TIMING_STEP_END2_STORE] = { .name = "END2 store", .time = 0, }, + + // terminator + [TIMING_STEP_MAX] = { .name = NULL, .time = 0, }, +}; + +void timing_action(TIMING_ACTION action, TIMING_STEP step) { + static __thread usec_t last_action_time = 0; + static struct timing_steps timings2[TIMING_STEP_MAX + 1] = {}; + + switch(action) { + case TIMING_ACTION_INIT: + last_action_time = now_monotonic_usec(); + break; + + case TIMING_ACTION_STEP: { + if(!last_action_time) + return; + + usec_t now = now_monotonic_usec(); + __atomic_add_fetch(&timing_steps[step].time, now - last_action_time, __ATOMIC_RELAXED); + __atomic_add_fetch(&timing_steps[step].count, 1, __ATOMIC_RELAXED); + last_action_time = now; + break; + } + + case TIMING_ACTION_FINISH: { + if(!last_action_time) + return; + + usec_t expected = __atomic_load_n(&timing_steps[TIMING_STEP_INTERNAL].time, __ATOMIC_RELAXED); + if(last_action_time - expected < 10 * USEC_PER_SEC) { + last_action_time = 0; + return; + } + + if(!__atomic_compare_exchange_n(&timing_steps[TIMING_STEP_INTERNAL].time, &expected, last_action_time, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) { + last_action_time = 0; + return; + } + + struct timing_steps timings3[TIMING_STEP_MAX + 1]; + memcpy(timings3, timing_steps, sizeof(timings3)); + + size_t total_reqs = 0; + usec_t total_usec = 0; + for(size_t t = 1; t < TIMING_STEP_MAX ; t++) { + total_usec += timings3[t].time - timings2[t].time; + total_reqs += timings3[t].count - timings2[t].count; + } + + BUFFER *wb = buffer_create(1024, NULL); + + for(size_t t = 1; t < TIMING_STEP_MAX ; t++) { + size_t requests = timings3[t].count - timings2[t].count; + if(!requests) continue; + + buffer_sprintf(wb, "TIMINGS REPORT: [%3zu. %-20s]: # %10zu, t %11.2f ms (%6.2f %%), avg %6.2f usec/run\n", + t, + timing_steps[t].name ? 
timing_steps[t].name : "x", + requests, + (double) (timings3[t].time - timings2[t].time) / (double)USEC_PER_MS, + (double) (timings3[t].time - timings2[t].time) * 100.0 / (double) total_usec, + (double) (timings3[t].time - timings2[t].time) / (double)requests + ); + } + + netdata_log_info("TIMINGS REPORT:\n%sTIMINGS REPORT: total # %10zu, t %11.2f ms", + buffer_tostring(wb), total_reqs, (double)total_usec / USEC_PER_MS); + + memcpy(timings2, timings3, sizeof(timings2)); + + last_action_time = 0; + buffer_free(wb); + } + } +} + +#ifdef ENABLE_HTTPS +int hash256_string(const unsigned char *string, size_t size, char *hash) { + EVP_MD_CTX *ctx; + ctx = EVP_MD_CTX_create(); + + if (!ctx) + return 0; + + if (!EVP_DigestInit(ctx, EVP_sha256())) { + EVP_MD_CTX_destroy(ctx); + return 0; + } + + if (!EVP_DigestUpdate(ctx, string, size)) { + EVP_MD_CTX_destroy(ctx); + return 0; + } + + if (!EVP_DigestFinal(ctx, (unsigned char *)hash, NULL)) { + EVP_MD_CTX_destroy(ctx); + return 0; + } + EVP_MD_CTX_destroy(ctx); + return 1; +} +#endif + + +bool rrdr_relative_window_to_absolute(time_t *after, time_t *before, time_t now) { + if(!now) now = now_realtime_sec(); + + int absolute_period_requested = -1; + time_t before_requested = *before; + time_t after_requested = *after; + + // allow relative for before (smaller than API_RELATIVE_TIME_MAX) + if(ABS(before_requested) <= API_RELATIVE_TIME_MAX) { + // if the user asked for a positive relative time, + // flip it to a negative + if(before_requested > 0) + before_requested = -before_requested; + + before_requested = now + before_requested; + absolute_period_requested = 0; + } + + // allow relative for after (smaller than API_RELATIVE_TIME_MAX) + if(ABS(after_requested) <= API_RELATIVE_TIME_MAX) { + if(after_requested > 0) + after_requested = -after_requested; + + // if the user didn't give an after, use the number of points + // to give a sane default + if(after_requested == 0) + after_requested = -600; + + // since the query engine now returns inclusive timestamps + // it is awkward to return 6 points when after=-5 is given + // so for relative queries we add 1 second, to give + // more predictable results to users. 
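+        // worked example (sketch): with now=1000000, *before=0 and *after=-600,
+        // before_requested becomes 1000000 and the line below yields
+        // after_requested = 1000000 - 600 + 1 = 999401, i.e. an inclusive
+        // 600-second window [999401, 1000000]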
+ after_requested = before_requested + after_requested + 1; + absolute_period_requested = 0; + } + + if(absolute_period_requested == -1) + absolute_period_requested = 1; + + // check if the parameters are flipped + if(after_requested > before_requested) { + long long t = before_requested; + before_requested = after_requested; + after_requested = t; + } + + // if the query requests future data + // shift the query back to be in the present time + // (this may also happen because of the rules above) + if(before_requested > now) { + time_t delta = before_requested - now; + before_requested -= delta; + after_requested -= delta; + } + + *before = before_requested; + *after = after_requested; + + return (absolute_period_requested != 1); +} + +// Returns 1 if an absolute period was requested or 0 if it was a relative period +bool rrdr_relative_window_to_absolute_query(time_t *after, time_t *before, time_t *now_ptr, bool unittest) { + time_t now = now_realtime_sec() - 1; + + if(now_ptr) + *now_ptr = now; + + time_t before_requested = *before; + time_t after_requested = *after; + + int absolute_period_requested = rrdr_relative_window_to_absolute(&after_requested, &before_requested, now); + + time_t absolute_minimum_time = now - (10 * 365 * 86400); + time_t absolute_maximum_time = now + (1 * 365 * 86400); + + if (after_requested < absolute_minimum_time && !unittest) + after_requested = absolute_minimum_time; + + if (after_requested > absolute_maximum_time && !unittest) + after_requested = absolute_maximum_time; + + if (before_requested < absolute_minimum_time && !unittest) + before_requested = absolute_minimum_time; + + if (before_requested > absolute_maximum_time && !unittest) + before_requested = absolute_maximum_time; + + *before = before_requested; + *after = after_requested; + + return (absolute_period_requested != 1); +} + +int netdata_base64_decode(const char *encoded, char *decoded, size_t decoded_size) { + static const unsigned char base64_table[256] = { + ['A'] = 0, ['B'] = 1, ['C'] = 2, ['D'] = 3, ['E'] = 4, ['F'] = 5, ['G'] = 6, ['H'] = 7, + ['I'] = 8, ['J'] = 9, ['K'] = 10, ['L'] = 11, ['M'] = 12, ['N'] = 13, ['O'] = 14, ['P'] = 15, + ['Q'] = 16, ['R'] = 17, ['S'] = 18, ['T'] = 19, ['U'] = 20, ['V'] = 21, ['W'] = 22, ['X'] = 23, + ['Y'] = 24, ['Z'] = 25, ['a'] = 26, ['b'] = 27, ['c'] = 28, ['d'] = 29, ['e'] = 30, ['f'] = 31, + ['g'] = 32, ['h'] = 33, ['i'] = 34, ['j'] = 35, ['k'] = 36, ['l'] = 37, ['m'] = 38, ['n'] = 39, + ['o'] = 40, ['p'] = 41, ['q'] = 42, ['r'] = 43, ['s'] = 44, ['t'] = 45, ['u'] = 46, ['v'] = 47, + ['w'] = 48, ['x'] = 49, ['y'] = 50, ['z'] = 51, ['0'] = 52, ['1'] = 53, ['2'] = 54, ['3'] = 55, + ['4'] = 56, ['5'] = 57, ['6'] = 58, ['7'] = 59, ['8'] = 60, ['9'] = 61, ['+'] = 62, ['/'] = 63, + [0 ... '+' - 1] = 255, + ['+' + 1 ... '/' - 1] = 255, + ['9' + 1 ... 'A' - 1] = 255, + ['Z' + 1 ... 'a' - 1] = 255, + ['z' + 1 ... 
255] = 255 + }; + + size_t count = 0; + unsigned int tmp = 0; + int i, bit; + + if (decoded_size < 1) + return 0; // Buffer size must be at least 1 for null termination + + for (i = 0, bit = 0; encoded[i]; i++) { + unsigned char value = base64_table[(unsigned char)encoded[i]]; + if (value > 63) + return -1; // Invalid character in input + + tmp = tmp << 6 | value; + if (++bit == 4) { + if (count + 3 >= decoded_size) break; // Stop decoding if buffer is full + decoded[count++] = (tmp >> 16) & 0xFF; + decoded[count++] = (tmp >> 8) & 0xFF; + decoded[count++] = tmp & 0xFF; + tmp = 0; + bit = 0; + } + } + + if (bit > 0 && count + 1 < decoded_size) { + tmp <<= 6 * (4 - bit); + if (bit > 2 && count + 1 < decoded_size) decoded[count++] = (tmp >> 16) & 0xFF; + if (bit > 3 && count + 1 < decoded_size) decoded[count++] = (tmp >> 8) & 0xFF; + } + + decoded[count] = '\0'; // Null terminate the output string + return count; +} diff --git a/src/libnetdata/libnetdata.h b/src/libnetdata/libnetdata.h new file mode 100644 index 00000000..859f54cc --- /dev/null +++ b/src/libnetdata/libnetdata.h @@ -0,0 +1,707 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_LIB_H +#define NETDATA_LIB_H 1 + +# ifdef __cplusplus +extern "C" { +# endif + +#include "config.h" + +#ifdef ENABLE_OPENSSL +#define ENABLE_HTTPS 1 +#endif + +#ifdef HAVE_LIBDATACHANNEL +#define ENABLE_WEBRTC 1 +#endif + +#define STRINGIFY(x) #x +#define TOSTRING(x) STRINGIFY(x) + +#define JUDYHS_INDEX_SIZE_ESTIMATE(key_bytes) (((key_bytes) + sizeof(Word_t) - 1) / sizeof(Word_t) * 4) + +#if defined(NETDATA_DEV_MODE) && !defined(NETDATA_INTERNAL_CHECKS) +#define NETDATA_INTERNAL_CHECKS 1 +#endif + +#ifndef SIZEOF_VOID_P +#error SIZEOF_VOID_P is not defined +#endif + +#if SIZEOF_VOID_P == 4 +#define ENV32BIT 1 +#else +#define ENV64BIT 1 +#endif + +// NETDATA_TRACE_ALLOCATIONS does not work under musl libc, so don't enable it +//#if defined(NETDATA_INTERNAL_CHECKS) && !defined(NETDATA_TRACE_ALLOCATIONS) +//#define NETDATA_TRACE_ALLOCATIONS 1 +//#endif + +#define MALLOC_ALIGNMENT (sizeof(uintptr_t) * 2) +#define size_t_atomic_count(op, var, size) __atomic_## op ##_fetch(&(var), size, __ATOMIC_RELAXED) +#define size_t_atomic_bytes(op, var, size) __atomic_## op ##_fetch(&(var), ((size) % MALLOC_ALIGNMENT)?((size) + MALLOC_ALIGNMENT - ((size) % MALLOC_ALIGNMENT)):(size), __ATOMIC_RELAXED) + +// ---------------------------------------------------------------------------- +// system include files for all netdata C programs + +/* select the memory allocator, based on autoconf findings */ +#if defined(ENABLE_JEMALLOC) + +#if defined(HAVE_JEMALLOC_JEMALLOC_H) +#include <jemalloc/jemalloc.h> +#else // !defined(HAVE_JEMALLOC_JEMALLOC_H) +#include <malloc.h> +#endif // !defined(HAVE_JEMALLOC_JEMALLOC_H) + +#elif defined(ENABLE_TCMALLOC) + +#include <google/tcmalloc.h> + +#else /* !defined(ENABLE_JEMALLOC) && !defined(ENABLE_TCMALLOC) */ + +#if !(defined(__FreeBSD__) || defined(__APPLE__)) +#include <malloc.h> +#endif /* __FreeBSD__ || __APPLE__ */ + +#endif /* !defined(ENABLE_JEMALLOC) && !defined(ENABLE_TCMALLOC) */ + +// ---------------------------------------------------------------------------- + +#if defined(__FreeBSD__) +#include <pthread_np.h> +#define NETDATA_OS_TYPE "freebsd" +#elif defined(__APPLE__) +#define NETDATA_OS_TYPE "macos" +#elif defined(OS_WINDOWS) +#define NETDATA_OS_TYPE "windows" +#else +#define NETDATA_OS_TYPE "linux" +#endif /* __FreeBSD__, __APPLE__*/ + +#include <pthread.h> +#include <errno.h> +#include <stdbool.h> 
+#include <stdio.h> +#include <stdlib.h> +#include <stdarg.h> +#include <stddef.h> +#include <ctype.h> +#include <string.h> +#include <strings.h> +#include <libgen.h> +#include <dirent.h> +#include <fcntl.h> +#include <getopt.h> +#include <limits.h> +#include <locale.h> +#include <signal.h> +#include <sys/time.h> +#include <sys/types.h> +#include <time.h> +#include <unistd.h> +#include <uv.h> +#include <assert.h> + +#ifdef HAVE_ARPA_INET_H +#include <arpa/inet.h> +#endif + +#ifdef HAVE_NETINET_TCP_H +#include <netinet/tcp.h> +#endif + +#ifdef HAVE_SYS_IOCTL_H +#include <sys/ioctl.h> +#endif + +#ifdef HAVE_GRP_H +#include <grp.h> +#else +typedef uint32_t gid_t; +#endif + +#ifdef HAVE_PWD_H +#include <pwd.h> +#else +typedef uint32_t uid_t; +#endif + +#ifdef HAVE_NET_IF_H +#include <net/if.h> +#endif + +#ifdef HAVE_POLL_H +#include <poll.h> +#endif + +#ifdef HAVE_SYSLOG_H +#include <syslog.h> +#else +/* priorities */ +#define LOG_EMERG 0 /* system is unusable */ +#define LOG_ALERT 1 /* action must be taken immediately */ +#define LOG_CRIT 2 /* critical conditions */ +#define LOG_ERR 3 /* error conditions */ +#define LOG_WARNING 4 /* warning conditions */ +#define LOG_NOTICE 5 /* normal but significant condition */ +#define LOG_INFO 6 /* informational */ +#define LOG_DEBUG 7 /* debug-level messages */ + +/* facility codes */ +#define LOG_KERN (0<<3) /* kernel messages */ +#define LOG_USER (1<<3) /* random user-level messages */ +#define LOG_MAIL (2<<3) /* mail system */ +#define LOG_DAEMON (3<<3) /* system daemons */ +#define LOG_AUTH (4<<3) /* security/authorization messages */ +#define LOG_SYSLOG (5<<3) /* messages generated internally by syslogd */ +#define LOG_LPR (6<<3) /* line printer subsystem */ +#define LOG_NEWS (7<<3) /* network news subsystem */ +#define LOG_UUCP (8<<3) /* UUCP subsystem */ +#define LOG_CRON (9<<3) /* clock daemon */ +#define LOG_AUTHPRIV (10<<3) /* security/authorization messages (private) */ +#define LOG_FTP (11<<3) /* ftp daemon */ + +/* other codes through 15 reserved for system use */ +#define LOG_LOCAL0 (16<<3) /* reserved for local use */ +#define LOG_LOCAL1 (17<<3) /* reserved for local use */ +#define LOG_LOCAL2 (18<<3) /* reserved for local use */ +#define LOG_LOCAL3 (19<<3) /* reserved for local use */ +#define LOG_LOCAL4 (20<<3) /* reserved for local use */ +#define LOG_LOCAL5 (21<<3) /* reserved for local use */ +#define LOG_LOCAL6 (22<<3) /* reserved for local use */ +#define LOG_LOCAL7 (23<<3) /* reserved for local use */ +#endif + +#ifdef HAVE_SYS_MMAN_H +#include <sys/mman.h> +#endif + +#ifdef HAVE_SYS_RESOURCE_H +#include <sys/resource.h> +#endif + +#ifdef HAVE_SYS_SOCKET_H +#include <sys/socket.h> +#endif + +#ifdef HAVE_SYS_WAIT_H +#include <sys/wait.h> +#endif + +#ifdef HAVE_SYS_UN_H +#include <sys/un.h> +#endif + +#ifdef HAVE_SPAWN_H +#include <spawn.h> +#endif + +#ifdef HAVE_NETINET_IN_H +#include <netinet/in.h> +#endif + +#ifdef HAVE_RESOLV_H +#include <resolv.h> +#endif + +#ifdef HAVE_NETDB_H +#include <netdb.h> +#endif + +#ifdef HAVE_SYS_PRCTL_H +#include <sys/prctl.h> +#endif + +#ifdef HAVE_SYS_STAT_H +#include <sys/stat.h> +#endif + +#ifdef HAVE_SYS_VFS_H +#include <sys/vfs.h> +#endif + +#ifdef HAVE_SYS_STATFS_H +#include <sys/statfs.h> +#endif + +#ifdef HAVE_LINUX_MAGIC_H +#include <linux/magic.h> +#endif + +#ifdef HAVE_SYS_MOUNT_H +#include <sys/mount.h> +#endif + +#ifdef HAVE_SYS_STATVFS_H +#include <sys/statvfs.h> +#endif + +// #1408 +#ifdef MAJOR_IN_MKDEV +#include <sys/mkdev.h> +#endif +#ifdef MAJOR_IN_SYSMACROS +#include 
<sys/sysmacros.h> +#endif + +#include <math.h> +#include <float.h> + +#if defined(HAVE_INTTYPES_H) +#include <inttypes.h> +#elif defined(HAVE_STDINT_H) +#include <stdint.h> +#endif + +#include <zlib.h> + +#ifdef HAVE_SYS_CAPABILITY_H +#include <sys/capability.h> +#endif + + +#ifndef O_CLOEXEC +#define O_CLOEXEC (0) +#endif + +// ---------------------------------------------------------------------------- +// netdata common definitions + +#define _cleanup_(x) __attribute__((__cleanup__(x))) + +#ifdef HAVE_FUNC_ATTRIBUTE_RETURNS_NONNULL +#define NEVERNULL __attribute__((returns_nonnull)) +#else +#define NEVERNULL +#endif + +#ifdef HAVE_FUNC_ATTRIBUTE_NOINLINE +#define NOINLINE __attribute__((noinline)) +#else +#define NOINLINE +#endif + +#ifdef HAVE_FUNC_ATTRIBUTE_MALLOC +#define MALLOCLIKE __attribute__((malloc)) +#else +#define MALLOCLIKE +#endif + +#if defined(HAVE_FUNC_ATTRIBUTE_FORMAT_GNU_PRINTF) +#define PRINTFLIKE(f, a) __attribute__ ((format(gnu_printf, f, a))) +#elif defined(HAVE_FUNC_ATTRIBUTE_FORMAT_PRINTF) +#define PRINTFLIKE(f, a) __attribute__ ((format(printf, f, a))) +#else +#define PRINTFLIKE(f, a) +#endif + +#ifdef HAVE_FUNC_ATTRIBUTE_NORETURN +#define NORETURN __attribute__ ((noreturn)) +#else +#define NORETURN +#endif + +#ifdef HAVE_FUNC_ATTRIBUTE_WARN_UNUSED_RESULT +#define WARNUNUSED __attribute__ ((warn_unused_result)) +#else +#define WARNUNUSED +#endif + +void aral_judy_init(void); +size_t judy_aral_overhead(void); +size_t judy_aral_structures(void); + +#define ABS(x) (((x) < 0)? (-(x)) : (x)) +#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MAX(a,b) (((a)>(b))?(a):(b)) +#define SWAP(a, b) do { \ + typeof(a) _tmp = b; \ + b = a; \ + a = _tmp; \ +} while(0) + +#define GUID_LEN 36 + +#include "linked-lists.h" +#include "storage-point.h" + +void netdata_fix_chart_id(char *s); +void netdata_fix_chart_name(char *s); + +int madvise_sequential(void *mem, size_t len); +int madvise_random(void *mem, size_t len); +int madvise_dontfork(void *mem, size_t len); +int madvise_willneed(void *mem, size_t len); +int madvise_dontneed(void *mem, size_t len); +int madvise_dontdump(void *mem, size_t len); +int madvise_mergeable(void *mem, size_t len); + +int vsnprintfz(char *dst, size_t n, const char *fmt, va_list args); +int snprintfz(char *dst, size_t n, const char *fmt, ...) 
PRINTFLIKE(3, 4); + +// memory allocation functions that handle failures +#ifdef NETDATA_TRACE_ALLOCATIONS +int malloc_trace_walkthrough(int (*callback)(void *item, void *data), void *data); + +#define strdupz(s) strdupz_int(s, __FILE__, __FUNCTION__, __LINE__) +#define strndupz(s, len) strndupz_int(s, len, __FILE__, __FUNCTION__, __LINE__) +#define callocz(nmemb, size) callocz_int(nmemb, size, __FILE__, __FUNCTION__, __LINE__) +#define mallocz(size) mallocz_int(size, __FILE__, __FUNCTION__, __LINE__) +#define reallocz(ptr, size) reallocz_int(ptr, size, __FILE__, __FUNCTION__, __LINE__) +#define freez(ptr) freez_int(ptr, __FILE__, __FUNCTION__, __LINE__) +#define mallocz_usable_size(ptr) mallocz_usable_size_int(ptr, __FILE__, __FUNCTION__, __LINE__) + +char *strdupz_int(const char *s, const char *file, const char *function, size_t line); +char *strndupz_int(const char *s, size_t len, const char *file, const char *function, size_t line); +void *callocz_int(size_t nmemb, size_t size, const char *file, const char *function, size_t line); +void *mallocz_int(size_t size, const char *file, const char *function, size_t line); +void *reallocz_int(void *ptr, size_t size, const char *file, const char *function, size_t line); +void freez_int(void *ptr, const char *file, const char *function, size_t line); +size_t mallocz_usable_size_int(void *ptr, const char *file, const char *function, size_t line); + +#else // NETDATA_TRACE_ALLOCATIONS +char *strdupz(const char *s) MALLOCLIKE NEVERNULL; +char *strndupz(const char *s, size_t len) MALLOCLIKE NEVERNULL; +void *callocz(size_t nmemb, size_t size) MALLOCLIKE NEVERNULL; +void *mallocz(size_t size) MALLOCLIKE NEVERNULL; +void *reallocz(void *ptr, size_t size) MALLOCLIKE NEVERNULL; +void freez(void *ptr); +#endif // NETDATA_TRACE_ALLOCATIONS + +void posix_memfree(void *ptr); + +void json_escape_string(char *dst, const char *src, size_t size); +void json_fix_string(char *s); + +void *netdata_mmap(const char *filename, size_t size, int flags, int ksm, bool read_only, int *open_fd); +int netdata_munmap(void *ptr, size_t size); +int memory_file_save(const char *filename, void *mem, size_t size); + +extern struct rlimit rlimit_nofile; + +extern int enable_ksm; + +char *fgets_trim_len(char *buf, size_t buf_size, FILE *fp, size_t *len); + +int verify_netdata_host_prefix(bool log_msg); + +extern volatile sig_atomic_t netdata_exit; + +char *strdupz_path_subpath(const char *path, const char *subpath); +int path_is_dir(const char *path, const char *subpath); +int path_is_file(const char *path, const char *subpath); +void recursive_config_double_dir_load( + const char *user_path + , const char *stock_path + , const char *subpath + , int (*callback)(const char *filename, void *data, bool stock_config) + , void *data + , size_t depth +); +char *read_by_filename(const char *filename, long *file_size); +char *find_and_replace(const char *src, const char *find, const char *replace, const char *where); + +/* fix for alpine linux */ +#ifndef RUSAGE_THREAD +#ifdef RUSAGE_CHILDREN +#define RUSAGE_THREAD RUSAGE_CHILDREN +#endif +#endif + +#define BITS_IN_A_KILOBIT 1000 +#define KILOBITS_IN_A_MEGABIT 1000 + +/* misc. */ + +#define UNUSED(x) (void)(x) + +#ifdef __GNUC__ +#define UNUSED_FUNCTION(x) __attribute__((unused)) UNUSED_##x +#else +#define UNUSED_FUNCTION(x) UNUSED_##x +#endif + +#define error_report(x, args...) 
do { errno = 0; netdata_log_error(x, ##args); } while(0) + +// Taken from linux kernel +#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) + +#include "bitmap64.h" + +#define COMPRESSION_MAX_CHUNK 0x4000 +#define COMPRESSION_MAX_OVERHEAD 128 +#define COMPRESSION_MAX_MSG_SIZE (COMPRESSION_MAX_CHUNK - COMPRESSION_MAX_OVERHEAD - 1) +#define PLUGINSD_LINE_MAX (COMPRESSION_MAX_MSG_SIZE - 768) + +bool run_command_and_copy_output_to_stdout(const char *command, int max_line_length); +struct web_buffer *run_command_and_get_output_to_buffer(const char *command, int max_line_length); + +typedef enum { + OPEN_FD_ACTION_CLOSE, + OPEN_FD_ACTION_FD_CLOEXEC +} OPEN_FD_ACTION; +typedef enum { + OPEN_FD_EXCLUDE_STDIN = 0x01, + OPEN_FD_EXCLUDE_STDOUT = 0x02, + OPEN_FD_EXCLUDE_STDERR = 0x04 +} OPEN_FD_EXCLUDE; +void for_each_open_fd(OPEN_FD_ACTION action, OPEN_FD_EXCLUDE excluded_fds); + +void netdata_cleanup_and_exit(int ret, const char *action, const char *action_result, const char *action_data) NORETURN; +extern char *netdata_configured_host_prefix; + +#include "os/os.h" + +#define XXH_INLINE_ALL +#include "xxhash.h" + +#include "uuid/uuid.h" +#include "template-enum.h" +#include "http/http_access.h" +#include "http/content_type.h" +#include "config/dyncfg.h" +#include "libjudy/src/Judy.h" +#include "july/july.h" +#include "threads/threads.h" +#include "buffer/buffer.h" +#include "locks/locks.h" +#include "circular_buffer/circular_buffer.h" +#include "avl/avl.h" +#include "inlined.h" +#include "line_splitter/line_splitter.h" +#include "clocks/clocks.h" +#include "datetime/iso8601.h" +#include "datetime/rfc3339.h" +#include "datetime/rfc7231.h" +#include "completion/completion.h" +#include "popen/popen.h" +#include "simple_pattern/simple_pattern.h" +#ifdef ENABLE_HTTPS +# include "socket/security.h" +#endif +#include "socket/socket.h" +#include "config/appconfig.h" +#include "log/journal.h" +#include "log/log.h" +#include "buffered_reader/buffered_reader.h" +#include "procfile/procfile.h" +#include "string/string.h" +#include "dictionary/dictionary.h" +#include "dictionary/thread-cache.h" +#if defined(HAVE_LIBBPF) && !defined(__cplusplus) +#include "ebpf/ebpf.h" +#endif +#include "eval/eval.h" +#include "statistical/statistical.h" +#include "adaptive_resortable_list/adaptive_resortable_list.h" +#include "url/url.h" +#include "json/json.h" +#include "json/json-c-parser-inline.h" +#include "string/utf8.h" +#include "libnetdata/aral/aral.h" +#include "onewayalloc/onewayalloc.h" +#include "worker_utilization/worker_utilization.h" +#include "yaml.h" +#include "http/http_defs.h" +#include "gorilla/gorilla.h" +#include "facets/facets.h" +#include "functions_evloop/functions_evloop.h" +#include "query_progress/progress.h" + +// BEWARE: this exists in alarm-notify.sh +#define DEFAULT_CLOUD_BASE_URL "https://app.netdata.cloud" + +#define RRD_STORAGE_TIERS 5 + +static inline size_t struct_natural_alignment(size_t size) __attribute__((const)); + +#define STRUCT_NATURAL_ALIGNMENT (sizeof(uintptr_t) * 2) +static inline size_t struct_natural_alignment(size_t size) { + if(unlikely(size % STRUCT_NATURAL_ALIGNMENT)) + size = size + STRUCT_NATURAL_ALIGNMENT - (size % STRUCT_NATURAL_ALIGNMENT); + + return size; +} + +#ifdef NETDATA_TRACE_ALLOCATIONS +struct malloc_trace { + avl_t avl; + + const char *function; + const char *file; + size_t line; + + size_t malloc_calls; + size_t calloc_calls; + size_t realloc_calls; + size_t strdup_calls; + size_t free_calls; + + size_t mmap_calls; + size_t munmap_calls; 
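+    // aggregate counters for this call-site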
+ + size_t allocations; + size_t bytes; + + struct rrddim *rd_bytes; + struct rrddim *rd_allocations; + struct rrddim *rd_avg_alloc; + struct rrddim *rd_ops; +}; +#endif // NETDATA_TRACE_ALLOCATIONS + +static inline PPvoid_t JudyLFirstThenNext(Pcvoid_t PArray, Word_t * PIndex, bool *first) { + if(unlikely(*first)) { + *first = false; + return JudyLFirst(PArray, PIndex, PJE0); + } + + return JudyLNext(PArray, PIndex, PJE0); +} + +static inline PPvoid_t JudyLLastThenPrev(Pcvoid_t PArray, Word_t * PIndex, bool *first) { + if(unlikely(*first)) { + *first = false; + return JudyLLast(PArray, PIndex, PJE0); + } + + return JudyLPrev(PArray, PIndex, PJE0); +} + +typedef enum { + TIMING_STEP_INTERNAL = 0, + + TIMING_STEP_BEGIN2_PREPARE, + TIMING_STEP_BEGIN2_FIND_CHART, + TIMING_STEP_BEGIN2_PARSE, + TIMING_STEP_BEGIN2_ML, + TIMING_STEP_BEGIN2_PROPAGATE, + TIMING_STEP_BEGIN2_STORE, + + TIMING_STEP_SET2_PREPARE, + TIMING_STEP_SET2_LOOKUP_DIMENSION, + TIMING_STEP_SET2_PARSE, + TIMING_STEP_SET2_ML, + TIMING_STEP_SET2_PROPAGATE, + TIMING_STEP_RRDSET_STORE_METRIC, + TIMING_STEP_DBENGINE_FIRST_CHECK, + TIMING_STEP_DBENGINE_CHECK_DATA, + TIMING_STEP_DBENGINE_PACK, + TIMING_STEP_DBENGINE_PAGE_FIN, + TIMING_STEP_DBENGINE_MRG_UPDATE, + TIMING_STEP_DBENGINE_PAGE_ALLOC, + TIMING_STEP_DBENGINE_CREATE_NEW_PAGE, + TIMING_STEP_DBENGINE_FLUSH_PAGE, + TIMING_STEP_SET2_STORE, + + TIMING_STEP_END2_PREPARE, + TIMING_STEP_END2_PUSH_V1, + TIMING_STEP_END2_ML, + TIMING_STEP_END2_RRDSET, + TIMING_STEP_END2_PROPAGATE, + TIMING_STEP_END2_STORE, + + TIMING_STEP_FREEIPMI_CTX_CREATE, + TIMING_STEP_FREEIPMI_DSR_CACHE_DIR, + TIMING_STEP_FREEIPMI_SENSOR_CONFIG_FILE, + TIMING_STEP_FREEIPMI_SENSOR_READINGS_BY_X, + TIMING_STEP_FREEIPMI_READ_record_id, + TIMING_STEP_FREEIPMI_READ_sensor_number, + TIMING_STEP_FREEIPMI_READ_sensor_type, + TIMING_STEP_FREEIPMI_READ_sensor_name, + TIMING_STEP_FREEIPMI_READ_sensor_state, + TIMING_STEP_FREEIPMI_READ_sensor_units, + TIMING_STEP_FREEIPMI_READ_sensor_bitmask_type, + TIMING_STEP_FREEIPMI_READ_sensor_bitmask, + TIMING_STEP_FREEIPMI_READ_sensor_bitmask_strings, + TIMING_STEP_FREEIPMI_READ_sensor_reading_type, + TIMING_STEP_FREEIPMI_READ_sensor_reading, + TIMING_STEP_FREEIPMI_READ_event_reading_type_code, + TIMING_STEP_FREEIPMI_READ_record_type, + TIMING_STEP_FREEIPMI_READ_record_type_class, + TIMING_STEP_FREEIPMI_READ_sel_state, + TIMING_STEP_FREEIPMI_READ_event_direction, + TIMING_STEP_FREEIPMI_READ_event_type_code, + TIMING_STEP_FREEIPMI_READ_event_offset_type, + TIMING_STEP_FREEIPMI_READ_event_offset, + TIMING_STEP_FREEIPMI_READ_event_offset_string, + TIMING_STEP_FREEIPMI_READ_manufacturer_id, + + // terminator + TIMING_STEP_MAX, +} TIMING_STEP; + +typedef enum { + TIMING_ACTION_INIT, + TIMING_ACTION_STEP, + TIMING_ACTION_FINISH, +} TIMING_ACTION; + +#ifdef NETDATA_TIMING_REPORT +#define timing_init() timing_action(TIMING_ACTION_INIT, TIMING_STEP_INTERNAL) +#define timing_step(step) timing_action(TIMING_ACTION_STEP, step) +#define timing_report() timing_action(TIMING_ACTION_FINISH, TIMING_STEP_INTERNAL) +#else +#define timing_init() debug_dummy() +#define timing_step(step) debug_dummy() +#define timing_report() debug_dummy() +#endif +void timing_action(TIMING_ACTION action, TIMING_STEP step); + +int hash256_string(const unsigned char *string, size_t size, char *hash); + +extern bool unittest_running; +#define API_RELATIVE_TIME_MAX (3 * 365 * 86400) + +bool rrdr_relative_window_to_absolute(time_t *after, time_t *before, time_t now); +bool rrdr_relative_window_to_absolute_query(time_t *after, 
time_t *before, time_t *now_ptr, bool unittest);
+
+int netdata_base64_decode(const char *encoded, char *decoded, size_t decoded_size);
+
+static inline void freez_charp(char **p) {
+    freez(*p);
+}
+
+static inline void freez_const_charp(const char **p) {
+    freez((void *)*p);
+}
+
+#define CLEAN_CONST_CHAR_P _cleanup_(freez_const_charp) const char
+#define CLEAN_CHAR_P _cleanup_(freez_charp) char
+
+// --------------------------------------------------------------------------------------------------------------------
+// automatic cleanup function, instead of pthread pop/push
+
+// volatile: Tells the compiler that the variable defined might be accessed in unexpected ways
+// (e.g., by the cleanup function). This prevents it from being optimized out.
+#define CLEANUP_FUNCTION_REGISTER(func) volatile void * __attribute__((cleanup(func)))
+
+static inline void *CLEANUP_FUNCTION_GET_PTR(void *pptr) {
+    void *ret;
+    void **p = (void **)pptr;
+    if(p) {
+        ret = *p;
+        *p = NULL; // use it only once - this will prevent using it again
+
+        if(!ret)
+            nd_log(NDLS_DAEMON, NDLP_ERR, "cleanup function called multiple times!");
+    }
+    else {
+        nd_log(NDLS_DAEMON, NDLP_ERR, "cleanup function called with NULL pptr!");
+        ret = NULL;
+    }
+
+    return ret;
+}
+
+// --------------------------------------------------------------------------------------------------------------------
+
+# ifdef __cplusplus
+}
+# endif
+
+#endif // NETDATA_LIB_H
diff --git a/src/libnetdata/line_splitter/README.md b/src/libnetdata/line_splitter/README.md
new file mode 100644
index 00000000..b391a492
--- /dev/null
+++ b/src/libnetdata/line_splitter/README.md
@@ -0,0 +1,14 @@
+<!--
+title: "Line Splitter"
+custom_edit_url: https://github.com/netdata/netdata/edit/master/src/libnetdata/line_splitter/README.md
+sidebar_label: "Line Splitter"
+learn_status: "Published"
+learn_topic_type: "Tasks"
+learn_rel_path: "Developers/libnetdata"
+-->
+
+# Line Splitter
+
+The line splitter tokenizes a line of text into words, in place, honoring single and double quotes and backslash escapes.
+Separate word-separator maps are provided for `pluginsd` lines, configuration lines, `group by label` expressions and dynamic configuration IDs.
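+
+A minimal usage sketch (hypothetical input; `quoted_strings_splitter_pluginsd()` and `get_word()` are defined in `line_splitter.h` below):
+
+```c
+char line[] = "BEGIN chart1 'hello world' 1000";
+char *words[PLUGINSD_MAX_WORDS];
+size_t n = quoted_strings_splitter_pluginsd(line, words, PLUGINSD_MAX_WORDS);
+// n == 4; get_word(words, n, 2) returns "hello world" (the quotes are consumed)
+```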
+ diff --git a/src/libnetdata/line_splitter/line_splitter.c b/src/libnetdata/line_splitter/line_splitter.c new file mode 100644 index 00000000..6726d909 --- /dev/null +++ b/src/libnetdata/line_splitter/line_splitter.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + + +bool line_splitter_reconstruct_line(BUFFER *wb, void *ptr) { + struct line_splitter *spl = ptr; + if(!spl) return false; + + size_t added = 0; + for(size_t i = 0; i < spl->num_words ;i++) { + if(i) buffer_fast_strcat(wb, " ", 1); + + buffer_fast_strcat(wb, "'", 1); + const char *s = get_word(spl->words, spl->num_words, i); + buffer_strcat(wb, s?s:""); + buffer_fast_strcat(wb, "'", 1); + added++; + } + + return added > 0; +} + +inline int pluginsd_isspace(char c) { + switch(c) { + case ' ': + case '\t': + case '\r': + case '\n': + case '=': + return 1; + + default: + return 0; + } +} + +inline int config_isspace(char c) { + switch (c) { + case ' ': + case '\t': + case '\r': + case '\n': + case ',': + return 1; + + default: + return 0; + } +} + +inline int group_by_label_isspace(char c) { + if(c == ',' || c == '|') + return 1; + + return 0; +} + +inline int dyncfg_id_isspace(char c) { + if(c == ':') + return 1; + + return 0; +} + +bool isspace_map_pluginsd[256] = {}; +bool isspace_map_config[256] = {}; +bool isspace_map_group_by_label[256] = {}; +bool isspace_dyncfg_id_map[256] = {}; + +__attribute__((constructor)) void initialize_is_space_arrays(void) { + for(int c = 0; c < 256 ; c++) { + isspace_map_pluginsd[c] = pluginsd_isspace((char) c); + isspace_map_config[c] = config_isspace((char) c); + isspace_map_group_by_label[c] = group_by_label_isspace((char) c); + isspace_dyncfg_id_map[c] = dyncfg_id_isspace((char)c); + } +} diff --git a/src/libnetdata/line_splitter/line_splitter.h b/src/libnetdata/line_splitter/line_splitter.h new file mode 100644 index 00000000..96893041 --- /dev/null +++ b/src/libnetdata/line_splitter/line_splitter.h @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +#ifndef NETDATA_LINE_SPLITTER_H +#define NETDATA_LINE_SPLITTER_H + +#define PLUGINSD_MAX_WORDS 30 + +struct line_splitter { + size_t count; // counts number of lines + char *words[PLUGINSD_MAX_WORDS]; // an array of pointers for the words in this line + size_t num_words; // the number of pointers used in this line +}; + +bool line_splitter_reconstruct_line(BUFFER *wb, void *ptr); + +static inline void line_splitter_reset(struct line_splitter *line) { + line->num_words = 0; +} + +int pluginsd_isspace(char c); +int config_isspace(char c); +int group_by_label_isspace(char c); +int dyncfg_id_isspace(char c); + +extern bool isspace_map_pluginsd[256]; +extern bool isspace_map_config[256]; +extern bool isspace_map_group_by_label[256]; +extern bool isspace_dyncfg_id_map[256]; + +static inline size_t quoted_strings_splitter(char *str, char **words, size_t max_words, bool *isspace_map) { + char *s = str, quote = 0; + size_t i = 0; + + // skip all white space + while (unlikely(isspace_map[(uint8_t)*s])) + s++; + + if(unlikely(!*s)) { + words[i] = NULL; + return 0; + } + + // check for quote + if (unlikely(*s == '\'' || *s == '"')) { + quote = *s; // remember the quote + s++; // skip the quote + } + + // store the first word + words[i++] = s; + + // while we have something + while (likely(*s)) { + // if it is an escape + if (unlikely(*s == '\\' && s[1])) { + s += 2; + continue; + } + + // if it is a quote + else if (unlikely(*s == quote)) { + quote = 0; + *s = ' '; + 
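+            // the closing quote was just replaced with a space; the next
+            // iteration decides (per the isspace map) whether it splits the word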
continue; + } + + // if it is a space + else if (unlikely(quote == 0 && isspace_map[(uint8_t)*s])) { + // terminate the word + *s++ = '\0'; + + // skip all white space + while (likely(isspace_map[(uint8_t)*s])) + s++; + + // check for a quote + if (unlikely(*s == '\'' || *s == '"')) { + quote = *s; // remember the quote + s++; // skip the quote + } + + // if we reached the end, stop + if (unlikely(!*s)) + break; + + // store the next word + if (likely(i < max_words)) + words[i++] = s; + else + break; + } + + // anything else + else + s++; + } + + if (likely(i < max_words)) + words[i] = NULL; + + return i; +} + +#define quoted_strings_splitter_query_group_by_label(str, words, max_words) \ + quoted_strings_splitter(str, words, max_words, isspace_map_group_by_label) + +#define quoted_strings_splitter_config(str, words, max_words) \ + quoted_strings_splitter(str, words, max_words, isspace_map_config) + +#define quoted_strings_splitter_pluginsd(str, words, max_words) \ + quoted_strings_splitter(str, words, max_words, isspace_map_pluginsd) + +#define quoted_strings_splitter_dyncfg_id(str, words, max_words) \ + quoted_strings_splitter(str, words, max_words, isspace_dyncfg_id_map) + +static inline char *get_word(char **words, size_t num_words, size_t index) { + if (unlikely(index >= num_words)) + return NULL; + + return words[index]; +} + +#endif //NETDATA_LINE_SPLITTER_H diff --git a/src/libnetdata/linked-lists.h b/src/libnetdata/linked-lists.h new file mode 100644 index 00000000..033d1122 --- /dev/null +++ b/src/libnetdata/linked-lists.h @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_LINKED_LISTS_H +#define NETDATA_LINKED_LISTS_H + +// --------------------------------------------------------------------------------------------- +// double linked list management +// inspired by https://github.com/troydhanson/uthash/blob/master/src/utlist.h + +#define DOUBLE_LINKED_LIST_PREPEND_ITEM_UNSAFE(head, item, prev, next) \ + do { \ + (item)->next = (head); \ + \ + if(likely(head)) { \ + (item)->prev = (head)->prev; \ + (head)->prev = (item); \ + } \ + else \ + (item)->prev = (item); \ + \ + (head) = (item); \ + } while (0) + +#define DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(head, item, prev, next) \ + do { \ + \ + (item)->next = NULL; \ + \ + if(likely(head)) { \ + (item)->prev = (head)->prev; \ + (head)->prev->next = (item); \ + (head)->prev = (item); \ + } \ + else { \ + (item)->prev = (item); \ + (head) = (item); \ + } \ + \ + } while (0) + +#define DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(head, item, prev, next) \ + do { \ + fatal_assert((head) != NULL); \ + fatal_assert((item)->prev != NULL); \ + \ + if((item)->prev == (item)) \ + /* it is the only item in the list */ \ + (head) = NULL; \ + \ + else if((item) == (head)) { \ + /* it is the first item */ \ + fatal_assert((item)->next != NULL); \ + (item)->next->prev = (item)->prev; \ + (head) = (item)->next; \ + } \ + else { \ + /* it is any other item */ \ + (item)->prev->next = (item)->next; \ + \ + if ((item)->next) \ + (item)->next->prev = (item)->prev; \ + else \ + (head)->prev = (item)->prev; \ + } \ + \ + (item)->next = NULL; \ + (item)->prev = NULL; \ + } while (0) + +#define DOUBLE_LINKED_LIST_INSERT_ITEM_BEFORE_UNSAFE(head, existing, item, prev, next) \ + do { \ + if (existing) { \ + fatal_assert((head) != NULL); \ + fatal_assert((item) != NULL); \ + \ + (item)->next = (existing); \ + (item)->prev = (existing)->prev; \ + (existing)->prev = (item); \ + \ + if ((head) == (existing)) \ + (head) = (item); \ + else \ 
+ (item)->prev->next = (item); \ + \ + } \ + else \ + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(head, item, prev, next); \ + \ + } while (0) + +#define DOUBLE_LINKED_LIST_INSERT_ITEM_AFTER_UNSAFE(head, existing, item, prev, next) \ + do { \ + if (existing) { \ + fatal_assert((head) != NULL); \ + fatal_assert((item) != NULL); \ + \ + (item)->next = (existing)->next; \ + (item)->prev = (existing); \ + (existing)->next = (item); \ + \ + if ((item)->next) \ + (item)->next->prev = (item); \ + else \ + (head)->prev = (item); \ + } \ + else \ + DOUBLE_LINKED_LIST_PREPEND_ITEM_UNSAFE(head, item, prev, next); \ + \ + } while (0) + +#define DOUBLE_LINKED_LIST_APPEND_LIST_UNSAFE(head, head2, prev, next) \ + do { \ + if (head2) { \ + if (head) { \ + __typeof(head2) _head2_last_item = (head2)->prev; \ + \ + (head2)->prev = (head)->prev; \ + (head)->prev->next = (head2); \ + \ + (head)->prev = _head2_last_item; \ + } \ + else \ + (head) = (head2); \ + } \ + } while (0) + +#define DOUBLE_LINKED_LIST_FOREACH_FORWARD(head, var, prev, next) \ + for ((var) = (head); (var) ; (var) = (var)->next) + +#define DOUBLE_LINKED_LIST_FOREACH_BACKWARD(head, var, prev, next) \ + for ((var) = (head) ? (head)->prev : NULL ; (var) ; (var) = ((var) == (head)) ? NULL : (var)->prev) + +#endif //NETDATA_LINKED_LISTS_H diff --git a/src/libnetdata/locks/README.md b/src/libnetdata/locks/README.md new file mode 100644 index 00000000..35d602f2 --- /dev/null +++ b/src/libnetdata/locks/README.md @@ -0,0 +1,107 @@ +<!-- +title: "Locks" +custom_edit_url: https://github.com/netdata/netdata/edit/master/src/libnetdata/locks/README.md +sidebar_label: "Locks" +learn_status: "Published" +learn_topic_type: "Tasks" +learn_rel_path: "Developers/libnetdata" +--> + +# Locks + +## How to trace netdata locks + +To enable tracing rwlocks in netdata, compile netdata by setting `CFLAGS="-DNETDATA_TRACE_RWLOCKS=1"`, like this: + +``` +CFLAGS="-O1 -ggdb -DNETDATA_TRACE_RWLOCKS=1" ./netdata-installer.sh +``` + +During compilation, the compiler will log: + +``` +libnetdata/locks/locks.c:105:2: warning: #warning NETDATA_TRACE_RWLOCKS ENABLED - EXPECT A LOT OF OUTPUT [-Wcpp] + 105 | #warning NETDATA_TRACE_RWLOCKS ENABLED - EXPECT A LOT OF OUTPUT + | ^~~~~~~ +``` + +Once compiled, netdata will do the following: + +Every call to `netdata_rwlock_*()` is now measured in time. + +### logging of slow locks/unlocks + +If any call takes more than 10 usec, it will be logged like this: + +``` +RW_LOCK ON LOCK 0x0x7fbe1f2e5190: 4157038, 'ACLK_Query_2' (function build_context_param_list() 99@web/api/formatters/rrd2json.c) WAITED to UNLOCK for 29 usec. +``` + +The time can be changed by setting this `-DNETDATA_TRACE_RWLOCKS_WAIT_TIME_TO_IGNORE_USEC=20` (or whatever number) to the CFLAGS. + +### logging of long hold times + +If any lock is holded for more than 10000 usec, it will be logged like this: + +``` +RW_LOCK ON LOCK 0x0x55a20afc1b20: 4187198, 'ANALYTICS' (function analytics_gather_mutable_meta_data() 532@daemon/analytics.c) holded a 'R' for 13232 usec. +``` + +The time can be changed by setting this `-DNETDATA_TRACE_RWLOCKS_HOLD_TIME_TO_IGNORE_USEC=20000` (or whatever number) to the CFLAGS. + +### logging for probable pauses (predictive) + +The library maintains a linked-list of all the lock holders (one entry per thread). For this linked-list a mutex is used. So every call to the r/w locks now also has a mutex lock. 
+
+If any call is expected to pause the caller (i.e. the caller is attempting a read lock while there is a write lock in place, and vice versa), the library will log something like this:
+
+```
+RW_LOCK ON LOCK 0x0x5651c9fcce20: 4190039 'HEALTH' (function health_execute_pending_updates() 661@health/health.c) WANTS a 'W' lock (while holding 1 rwlocks and 1 mutexes).
+There are 7 readers and 0 writers are holding the lock:
+ => 1: RW_LOCK: process 4190091 'WEB_SERVER[static14]' (function web_client_api_request_v1_data() 526@web/api/web_api_v1.c) is having 1 'R' lock for 709847 usec.
+ => 2: RW_LOCK: process 4190079 'WEB_SERVER[static6]' (function web_client_api_request_v1_data() 526@web/api/web_api_v1.c) is having 1 'R' lock for 709869 usec.
+ => 3: RW_LOCK: process 4190084 'WEB_SERVER[static10]' (function web_client_api_request_v1_data() 526@web/api/web_api_v1.c) is having 1 'R' lock for 709948 usec.
+ => 4: RW_LOCK: process 4190076 'WEB_SERVER[static3]' (function web_client_api_request_v1_data() 526@web/api/web_api_v1.c) is having 1 'R' lock for 710190 usec.
+ => 5: RW_LOCK: process 4190092 'WEB_SERVER[static15]' (function web_client_api_request_v1_data() 526@web/api/web_api_v1.c) is having 1 'R' lock for 710195 usec.
+ => 6: RW_LOCK: process 4190077 'WEB_SERVER[static4]' (function web_client_api_request_v1_data() 526@web/api/web_api_v1.c) is having 1 'R' lock for 710208 usec.
+ => 7: RW_LOCK: process 4190044 'WEB_SERVER[static1]' (function web_client_api_request_v1_data() 526@web/api/web_api_v1.c) is having 1 'R' lock for 710221 usec.
+```
+
+And each of the above is paired with a `GOT` log, like this:
+
+```
+RW_LOCK ON LOCK 0x0x5651c9fcce20: 4190039 'HEALTH' (function health_execute_pending_updates() 661@health/health.c) GOT a 'W' lock (while holding 2 rwlocks and 1 mutexes).
+There are 0 readers and 1 writers are holding the lock:
+ => 1: RW_LOCK: process 4190039 'HEALTH' (function health_execute_pending_updates() 661@health/health.c) is having 1 'W' lock for 36 usec.
+```
+
+Keep in mind that the lock and the log are not atomic. The list of callers is indicative (and sometimes just empty, because the original holders of the lock released it before we had the chance to print their names).
+
+### POSIX compliance check
+
+The library may also log messages about cases not supported by POSIX, like this:
+
+```
+RW_LOCK FATAL ON LOCK 0x0x622000109290: 3609368 'PLUGIN[proc]' (function __rrdset_check_rdlock() 10@database/rrdset.c) attempts to acquire a 'W' lock.
+But it is not supported by POSIX because: ALREADY HAS THIS LOCK
+At this attempt, the task is holding 1 rwlocks and 1 mutexes.
+There are 1 readers and 0 writers are holding the lock requested now:
+ => 1: RW_LOCK: process 3609368 'PLUGIN[proc]' (function rrdset_done() 1398@database/rrdset.c) is having 1 'R' lock for 0 usec.
+```
+
+### nested read locks
+
+When compiled with `-DNETDATA_TRACE_RWLOCKS_LOG_NESTED=1` the library will also detect nested read locks and print them like this:
+
+```
+RW_LOCK ON LOCK 0x0x7ff6ea46d190: 4140225 'WEB_SERVER[static14]' (function rrdr_json_wrapper_begin() 34@web/api/formatters/json_wrapper.c) NESTED READ LOCK REQUEST a 'R' lock (while holding 1 rwlocks and 1 mutexes).
+There are 5 readers and 0 writers are holding the lock:
+ => 1: RW_LOCK: process 4140225 'WEB_SERVER[static14]' (function rrdr_lock_rrdset() 70@web/api/queries/rrdr.c) is having 1 'R' lock for 216667 usec.
+ => 2: RW_LOCK: process 4140211 'WEB_SERVER[static6]' (function rrdr_lock_rrdset() 70@web/api/queries/rrdr.c) is having 1 'R' lock for 220001 usec. + => 3: RW_LOCK: process 4140218 'WEB_SERVER[static8]' (function rrdr_lock_rrdset() 70@web/api/queries/rrdr.c) is having 1 'R' lock for 220001 usec. + => 4: RW_LOCK: process 4140224 'WEB_SERVER[static13]' (function rrdr_lock_rrdset() 70@web/api/queries/rrdr.c) is having 1 'R' lock for 220001 usec. + => 5: RW_LOCK: process 4140227 'WEB_SERVER[static16]' (function rrdr_lock_rrdset() 70@web/api/queries/rrdr.c) is having 1 'R' lock for 220001 usec. +``` + + + diff --git a/src/libnetdata/locks/locks.c b/src/libnetdata/locks/locks.c new file mode 100644 index 00000000..d01ee29f --- /dev/null +++ b/src/libnetdata/locks/locks.c @@ -0,0 +1,569 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +#ifdef NETDATA_TRACE_RWLOCKS + +#ifndef NETDATA_TRACE_RWLOCKS_WAIT_TIME_TO_IGNORE_USEC +#define NETDATA_TRACE_RWLOCKS_WAIT_TIME_TO_IGNORE_USEC 10 +#endif + +#ifndef NETDATA_TRACE_RWLOCKS_HOLD_TIME_TO_IGNORE_USEC +#define NETDATA_TRACE_RWLOCKS_HOLD_TIME_TO_IGNORE_USEC 10000 +#endif + +#ifndef NETDATA_THREAD_LOCKS_ARRAY_SIZE +#define NETDATA_THREAD_LOCKS_ARRAY_SIZE 10 +#endif + +#endif // NETDATA_TRACE_RWLOCKS + +// ---------------------------------------------------------------------------- +// mutex + +int __netdata_mutex_init(netdata_mutex_t *mutex) { + int ret = pthread_mutex_init(mutex, NULL); + if(unlikely(ret != 0)) + netdata_log_error("MUTEX_LOCK: failed to initialize (code %d).", ret); + return ret; +} + +int __netdata_mutex_destroy(netdata_mutex_t *mutex) { + int ret = pthread_mutex_destroy(mutex); + if(unlikely(ret != 0)) + netdata_log_error("MUTEX_LOCK: failed to destroy (code %d).", ret); + return ret; +} + +int __netdata_mutex_lock(netdata_mutex_t *mutex) { + int ret = pthread_mutex_lock(mutex); + if(unlikely(ret != 0)) { + netdata_log_error("MUTEX_LOCK: failed to get lock (code %d)", ret); + } + else + nd_thread_mutex_locked(); + + return ret; +} + +int __netdata_mutex_trylock(netdata_mutex_t *mutex) { + int ret = pthread_mutex_trylock(mutex); + if(ret != 0) + ; + else + nd_thread_mutex_locked(); + + return ret; +} + +int __netdata_mutex_unlock(netdata_mutex_t *mutex) { + int ret = pthread_mutex_unlock(mutex); + if(unlikely(ret != 0)) + netdata_log_error("MUTEX_LOCK: failed to unlock (code %d).", ret); + else + nd_thread_mutex_unlocked(); + + return ret; +} + +#ifdef NETDATA_TRACE_RWLOCKS + +int netdata_mutex_init_debug(const char *file __maybe_unused, const char *function __maybe_unused, + const unsigned long line __maybe_unused, netdata_mutex_t *mutex) { + netdata_log_debug(D_LOCKS, "MUTEX_LOCK: netdata_mutex_init(%p) from %lu@%s, %s()", mutex, line, file, function); + + int ret = __netdata_mutex_init(mutex); + + netdata_log_debug(D_LOCKS, "MUTEX_LOCK: netdata_mutex_init(%p) = %d, from %lu@%s, %s()", mutex, ret, line, file, function); + + return ret; +} + +int netdata_mutex_destroy_debug(const char *file __maybe_unused, const char *function __maybe_unused, + const unsigned long line __maybe_unused, netdata_mutex_t *mutex) { + netdata_log_debug(D_LOCKS, "MUTEX_LOCK: netdata_mutex_destroy(%p) from %lu@%s, %s()", mutex, line, file, function); + + int ret = __netdata_mutex_destroy(mutex); + + netdata_log_debug(D_LOCKS, "MUTEX_LOCK: netdata_mutex_destroy(%p) = %d, from %lu@%s, %s()", mutex, ret, line, file, function); + + return ret; +} + +int netdata_mutex_lock_debug(const char *file __maybe_unused, const char *function 
__maybe_unused, + const unsigned long line __maybe_unused, netdata_mutex_t *mutex) { + netdata_log_debug(D_LOCKS, "MUTEX_LOCK: netdata_mutex_lock(%p) from %lu@%s, %s()", mutex, line, file, function); + + usec_t start_s = now_monotonic_high_precision_usec(); + int ret = __netdata_mutex_lock(mutex); + usec_t end_s = now_monotonic_high_precision_usec(); + + // remove compiler unused variables warning + (void)start_s; + (void)end_s; + + netdata_log_debug(D_LOCKS, "MUTEX_LOCK: netdata_mutex_lock(%p) = %d in %llu usec, from %lu@%s, %s()", mutex, ret, end_s - start_s, line, file, function); + + return ret; +} + +int netdata_mutex_trylock_debug(const char *file __maybe_unused, const char *function __maybe_unused, + const unsigned long line __maybe_unused, netdata_mutex_t *mutex) { + netdata_log_debug(D_LOCKS, "MUTEX_LOCK: netdata_mutex_trylock(%p) from %lu@%s, %s()", mutex, line, file, function); + + usec_t start_s = now_monotonic_high_precision_usec(); + int ret = __netdata_mutex_trylock(mutex); + usec_t end_s = now_monotonic_high_precision_usec(); + + // remove compiler unused variables warning + (void)start_s; + (void)end_s; + + netdata_log_debug(D_LOCKS, "MUTEX_LOCK: netdata_mutex_trylock(%p) = %d in %llu usec, from %lu@%s, %s()", mutex, ret, end_s - start_s, line, file, function); + + return ret; +} + +int netdata_mutex_unlock_debug(const char *file __maybe_unused, const char *function __maybe_unused, + const unsigned long line __maybe_unused, netdata_mutex_t *mutex) { + netdata_log_debug(D_LOCKS, "MUTEX_LOCK: netdata_mutex_unlock(%p) from %lu@%s, %s()", mutex, line, file, function); + + usec_t start_s = now_monotonic_high_precision_usec(); + int ret = __netdata_mutex_unlock(mutex); + usec_t end_s = now_monotonic_high_precision_usec(); + + // remove compiler unused variables warning + (void)start_s; + (void)end_s; + + netdata_log_debug(D_LOCKS, "MUTEX_LOCK: netdata_mutex_unlock(%p) = %d in %llu usec, from %lu@%s, %s()", mutex, ret, end_s - start_s, line, file, function); + + return ret; +} + +#endif // NETDATA_TRACE_RWLOCKS + +// ---------------------------------------------------------------------------- +// rwlock + +int __netdata_rwlock_destroy(netdata_rwlock_t *rwlock) { + int ret = pthread_rwlock_destroy(&rwlock->rwlock_t); + if(unlikely(ret != 0)) + netdata_log_error("RW_LOCK: failed to destroy lock (code %d)", ret); + return ret; +} + +int __netdata_rwlock_init(netdata_rwlock_t *rwlock) { + int ret = pthread_rwlock_init(&rwlock->rwlock_t, NULL); + if(unlikely(ret != 0)) + netdata_log_error("RW_LOCK: failed to initialize lock (code %d)", ret); + return ret; +} + +int __netdata_rwlock_rdlock(netdata_rwlock_t *rwlock) { + int ret = pthread_rwlock_rdlock(&rwlock->rwlock_t); + if(unlikely(ret != 0)) + netdata_log_error("RW_LOCK: failed to obtain read lock (code %d)", ret); + else + nd_thread_rwlock_read_locked(); + + return ret; +} + +int __netdata_rwlock_wrlock(netdata_rwlock_t *rwlock) { + int ret = pthread_rwlock_wrlock(&rwlock->rwlock_t); + if(unlikely(ret != 0)) + netdata_log_error("RW_LOCK: failed to obtain write lock (code %d)", ret); + else + nd_thread_rwlock_write_locked(); + + return ret; +} + +int __netdata_rwlock_rdunlock(netdata_rwlock_t *rwlock) { + int ret = pthread_rwlock_unlock(&rwlock->rwlock_t); + if(unlikely(ret != 0)) + netdata_log_error("RW_LOCK: failed to release lock (code %d)", ret); + else + nd_thread_rwlock_read_unlocked(); + + return ret; +} + +int __netdata_rwlock_wrunlock(netdata_rwlock_t *rwlock) { + int ret = pthread_rwlock_unlock(&rwlock->rwlock_t); + 
if(unlikely(ret != 0)) + netdata_log_error("RW_LOCK: failed to release lock (code %d)", ret); + else + nd_thread_rwlock_write_unlocked(); + + return ret; +} + +int __netdata_rwlock_tryrdlock(netdata_rwlock_t *rwlock) { + int ret = pthread_rwlock_tryrdlock(&rwlock->rwlock_t); + if(ret != 0) + ; + else + nd_thread_rwlock_read_locked(); + + return ret; +} + +int __netdata_rwlock_trywrlock(netdata_rwlock_t *rwlock) { + int ret = pthread_rwlock_trywrlock(&rwlock->rwlock_t); + if(ret != 0) + ; + else + nd_thread_rwlock_write_locked(); + + return ret; +} + +// ---------------------------------------------------------------------------- +// spinlock implementation +// https://www.youtube.com/watch?v=rmGJc9PXpuE&t=41s + +void spinlock_init(SPINLOCK *spinlock) { + memset(spinlock, 0, sizeof(SPINLOCK)); +} + +static inline void spinlock_lock_internal(SPINLOCK *spinlock) { +#ifdef NETDATA_INTERNAL_CHECKS + size_t spins = 0; +#endif + + for(int i = 1; + __atomic_load_n(&spinlock->locked, __ATOMIC_RELAXED) || + __atomic_test_and_set(&spinlock->locked, __ATOMIC_ACQUIRE) + ; i++ + ) { + +#ifdef NETDATA_INTERNAL_CHECKS + spins++; +#endif + if(unlikely(i == 8)) { + i = 0; + tinysleep(); + } + } + + // we have the lock + +#ifdef NETDATA_INTERNAL_CHECKS + spinlock->spins += spins; + spinlock->locker_pid = gettid_cached(); +#endif + + nd_thread_spinlock_locked(); +} + +static inline void spinlock_unlock_internal(SPINLOCK *spinlock) { +#ifdef NETDATA_INTERNAL_CHECKS + spinlock->locker_pid = 0; +#endif + __atomic_clear(&spinlock->locked, __ATOMIC_RELEASE); + + nd_thread_spinlock_unlocked(); +} + +static inline bool spinlock_trylock_internal(SPINLOCK *spinlock) { + if(!__atomic_load_n(&spinlock->locked, __ATOMIC_RELAXED) && + !__atomic_test_and_set(&spinlock->locked, __ATOMIC_ACQUIRE)) { + // we got the lock + nd_thread_spinlock_locked(); + return true; + } + + return false; +} + +void spinlock_lock(SPINLOCK *spinlock) +{ + spinlock_lock_internal(spinlock); +} + +void spinlock_unlock(SPINLOCK *spinlock) +{ + spinlock_unlock_internal(spinlock); +} + +bool spinlock_trylock(SPINLOCK *spinlock) +{ + return spinlock_trylock_internal(spinlock); +} + +void spinlock_lock_cancelable(SPINLOCK *spinlock) +{ + spinlock_lock_internal(spinlock); +} + +void spinlock_unlock_cancelable(SPINLOCK *spinlock) +{ + spinlock_unlock_internal(spinlock); +} + +bool spinlock_trylock_cancelable(SPINLOCK *spinlock) +{ + return spinlock_trylock_internal(spinlock); +} + +// ---------------------------------------------------------------------------- +// rw_spinlock implementation + +void rw_spinlock_init(RW_SPINLOCK *rw_spinlock) { + rw_spinlock->readers = 0; + spinlock_init(&rw_spinlock->spinlock); +} + +void rw_spinlock_read_lock(RW_SPINLOCK *rw_spinlock) { + spinlock_lock(&rw_spinlock->spinlock); + __atomic_add_fetch(&rw_spinlock->readers, 1, __ATOMIC_RELAXED); + spinlock_unlock(&rw_spinlock->spinlock); + + nd_thread_rwspinlock_read_locked(); +} + +void rw_spinlock_read_unlock(RW_SPINLOCK *rw_spinlock) { +#ifndef NETDATA_INTERNAL_CHECKS + __atomic_sub_fetch(&rw_spinlock->readers, 1, __ATOMIC_RELAXED); +#else + int32_t x = __atomic_sub_fetch(&rw_spinlock->readers, 1, __ATOMIC_RELAXED); + if(x < 0) + fatal("RW_SPINLOCK: readers is negative %d", x); +#endif + + nd_thread_rwspinlock_read_unlocked(); +} + +void rw_spinlock_write_lock(RW_SPINLOCK *rw_spinlock) { + size_t spins = 0; + while(1) { + spins++; + spinlock_lock(&rw_spinlock->spinlock); + + if(__atomic_load_n(&rw_spinlock->readers, __ATOMIC_RELAXED) == 0) + break; + + // Busy wait until 
all readers have released their locks. + spinlock_unlock(&rw_spinlock->spinlock); + tinysleep(); + } + + (void)spins; + + nd_thread_rwspinlock_write_locked(); +} + +void rw_spinlock_write_unlock(RW_SPINLOCK *rw_spinlock) { + spinlock_unlock(&rw_spinlock->spinlock); + nd_thread_rwspinlock_write_unlocked(); +} + +bool rw_spinlock_tryread_lock(RW_SPINLOCK *rw_spinlock) { + if(spinlock_trylock(&rw_spinlock->spinlock)) { + __atomic_add_fetch(&rw_spinlock->readers, 1, __ATOMIC_RELAXED); + spinlock_unlock(&rw_spinlock->spinlock); + nd_thread_rwspinlock_read_locked(); + return true; + } + + return false; +} + +bool rw_spinlock_trywrite_lock(RW_SPINLOCK *rw_spinlock) { + if(spinlock_trylock(&rw_spinlock->spinlock)) { + if (__atomic_load_n(&rw_spinlock->readers, __ATOMIC_RELAXED) == 0) { + // No readers, we've successfully acquired the write lock + nd_thread_rwspinlock_write_locked(); + return true; + } + else { + // There are readers, unlock the spinlock and return false + spinlock_unlock(&rw_spinlock->spinlock); + } + } + + return false; +} + + +#ifdef NETDATA_TRACE_RWLOCKS + +// ---------------------------------------------------------------------------- +// lockers list + +static netdata_rwlock_locker *find_rwlock_locker(const char *file __maybe_unused, const char *function __maybe_unused, const unsigned long line __maybe_unused, netdata_rwlock_t *rwlock) { + pid_t pid = gettid(); + netdata_rwlock_locker *locker = NULL; + + __netdata_mutex_lock(&rwlock->lockers_mutex); + Pvoid_t *PValue = JudyLGet(rwlock->lockers_pid_JudyL, pid, PJE0); + if(PValue && *PValue) + locker = *PValue; + __netdata_mutex_unlock(&rwlock->lockers_mutex); + + return locker; +} + +static netdata_rwlock_locker *add_rwlock_locker(const char *file, const char *function, const unsigned long line, netdata_rwlock_t *rwlock, LOCKER_REQUEST lock_type) { + netdata_rwlock_locker *locker; + + locker = find_rwlock_locker(file, function, line, rwlock); + if(locker) { + locker->lock |= lock_type; + locker->refcount++; + } + else { + locker = mallocz(sizeof(netdata_rwlock_locker)); + locker->pid = gettid(); + locker->tag = netdata_thread_tag(); + locker->refcount = 1; + locker->lock = lock_type; + locker->got_it = false; + locker->file = file; + locker->function = function; + locker->line = line; + + __netdata_mutex_lock(&rwlock->lockers_mutex); + DOUBLE_LINKED_LIST_APPEND_UNSAFE(rwlock->lockers, locker, prev, next); + Pvoid_t *PValue = JudyLIns(&rwlock->lockers_pid_JudyL, locker->pid, PJE0); + *PValue = locker; + if (lock_type == RWLOCK_REQUEST_READ || lock_type == RWLOCK_REQUEST_TRYREAD) rwlock->readers++; + if (lock_type == RWLOCK_REQUEST_WRITE || lock_type == RWLOCK_REQUEST_TRYWRITE) rwlock->writers++; + __netdata_mutex_unlock(&rwlock->lockers_mutex); + } + + return locker; +} + +static void remove_rwlock_locker(const char *file __maybe_unused, const char *function __maybe_unused, const unsigned long line __maybe_unused, netdata_rwlock_t *rwlock, netdata_rwlock_locker *locker) { + __netdata_mutex_lock(&rwlock->lockers_mutex); + locker->refcount--; + if(!locker->refcount) { + DOUBLE_LINKED_LIST_REMOVE_UNSAFE(rwlock->lockers, locker, prev, next); + JudyLDel(&rwlock->lockers_pid_JudyL, locker->pid, PJE0); + if (locker->lock == RWLOCK_REQUEST_READ || locker->lock == RWLOCK_REQUEST_TRYREAD) rwlock->readers--; + else if (locker->lock == RWLOCK_REQUEST_WRITE || locker->lock == RWLOCK_REQUEST_TRYWRITE) rwlock->writers--; + freez(locker); + } + __netdata_mutex_unlock(&rwlock->lockers_mutex); +} + +// 
---------------------------------------------------------------------------- +// debug versions of rwlock + +int netdata_rwlock_destroy_debug(const char *file __maybe_unused, const char *function __maybe_unused, + const unsigned long line __maybe_unused, netdata_rwlock_t *rwlock) { + + int ret = __netdata_rwlock_destroy(rwlock); + if(!ret) { + while (rwlock->lockers) + remove_rwlock_locker(file, function, line, rwlock, rwlock->lockers); + } + + return ret; +} + +int netdata_rwlock_init_debug(const char *file __maybe_unused, const char *function __maybe_unused, + const unsigned long line __maybe_unused, netdata_rwlock_t *rwlock) { + + int ret = __netdata_rwlock_init(rwlock); + if(!ret) { + __netdata_mutex_init(&rwlock->lockers_mutex); + rwlock->lockers_pid_JudyL = NULL; + rwlock->lockers = NULL; + rwlock->readers = 0; + rwlock->writers = 0; + } + + return ret; +} + +int netdata_rwlock_rdlock_debug(const char *file __maybe_unused, const char *function __maybe_unused, + const unsigned long line __maybe_unused, netdata_rwlock_t *rwlock) { + + netdata_rwlock_locker *locker = add_rwlock_locker(file, function, line, rwlock, RWLOCK_REQUEST_READ); + + int ret = __netdata_rwlock_rdlock(rwlock); + if(!ret) + locker->got_it = true; + else + remove_rwlock_locker(file, function, line, rwlock, locker); + + return ret; +} + +int netdata_rwlock_wrlock_debug(const char *file __maybe_unused, const char *function __maybe_unused, + const unsigned long line __maybe_unused, netdata_rwlock_t *rwlock) { + + netdata_rwlock_locker *locker = add_rwlock_locker(file, function, line, rwlock, RWLOCK_REQUEST_WRITE); + + int ret = __netdata_rwlock_wrlock(rwlock); + if(!ret) + locker->got_it = true; + else + remove_rwlock_locker(file, function, line, rwlock, locker); + + return ret; +} + +int netdata_rwlock_rdunlock_debug(const char *file __maybe_unused, const char *function __maybe_unused, + const unsigned long line __maybe_unused, netdata_rwlock_t *rwlock) { + + netdata_rwlock_locker *locker = find_rwlock_locker(file, function, line, rwlock); + + if(unlikely(!locker)) + fatal("UNLOCK WITHOUT LOCK"); + + int ret = __netdata_rwlock_rdunlock(rwlock); + if(likely(!ret)) + remove_rwlock_locker(file, function, line, rwlock, locker); + + return ret; +} + +int netdata_rwlock_wrunlock_debug(const char *file __maybe_unused, const char *function __maybe_unused, + const unsigned long line __maybe_unused, netdata_rwlock_t *rwlock) { + + netdata_rwlock_locker *locker = find_rwlock_locker(file, function, line, rwlock); + + if(unlikely(!locker)) + fatal("UNLOCK WITHOUT LOCK"); + + int ret = __netdata_rwlock_wrunlock(rwlock); + if(likely(!ret)) + remove_rwlock_locker(file, function, line, rwlock, locker); + + return ret; +} + +int netdata_rwlock_tryrdlock_debug(const char *file __maybe_unused, const char *function __maybe_unused, + const unsigned long line __maybe_unused, netdata_rwlock_t *rwlock) { + + netdata_rwlock_locker *locker = add_rwlock_locker(file, function, line, rwlock, RWLOCK_REQUEST_TRYREAD); + + int ret = __netdata_rwlock_tryrdlock(rwlock); + if(!ret) + locker->got_it = true; + else + remove_rwlock_locker(file, function, line, rwlock, locker); + + return ret; +} + +int netdata_rwlock_trywrlock_debug(const char *file __maybe_unused, const char *function __maybe_unused, + const unsigned long line __maybe_unused, netdata_rwlock_t *rwlock) { + + netdata_rwlock_locker *locker = add_rwlock_locker(file, function, line, rwlock, RWLOCK_REQUEST_TRYWRITE); + + int ret = __netdata_rwlock_trywrlock(rwlock); + if(!ret) + locker->got_it = 
true;
+    else
+        remove_rwlock_locker(file, function, line, rwlock, locker);
+
+    return ret;
+}
+
+#endif // NETDATA_TRACE_RWLOCKS
diff --git a/src/libnetdata/locks/locks.h b/src/libnetdata/locks/locks.h
new file mode 100644
index 00000000..d3873c29
--- /dev/null
+++ b/src/libnetdata/locks/locks.h
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef NETDATA_LOCKS_H
+#define NETDATA_LOCKS_H 1
+
+#include "../libnetdata.h"
+#include "../clocks/clocks.h"
+
+typedef pthread_mutex_t netdata_mutex_t;
+#define NETDATA_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
+
+typedef struct netdata_spinlock {
+    bool locked;
+#ifdef NETDATA_INTERNAL_CHECKS
+    pid_t locker_pid;
+    size_t spins;
+#endif
+} SPINLOCK;
+
+#define NETDATA_SPINLOCK_INITIALIZER \
+    { .locked = false }
+
+void spinlock_init(SPINLOCK *spinlock);
+void spinlock_lock(SPINLOCK *spinlock);
+void spinlock_unlock(SPINLOCK *spinlock);
+bool spinlock_trylock(SPINLOCK *spinlock);
+
+void spinlock_lock_cancelable(SPINLOCK *spinlock);
+void spinlock_unlock_cancelable(SPINLOCK *spinlock);
+bool spinlock_trylock_cancelable(SPINLOCK *spinlock);
+
+typedef struct netdata_rw_spinlock {
+    int32_t readers;
+    SPINLOCK spinlock;
+} RW_SPINLOCK;
+
+#define NETDATA_RW_SPINLOCK_INITIALIZER \
+    { .readers = 0, .spinlock = NETDATA_SPINLOCK_INITIALIZER }
+
+void rw_spinlock_init(RW_SPINLOCK *rw_spinlock);
+void rw_spinlock_read_lock(RW_SPINLOCK *rw_spinlock);
+void rw_spinlock_read_unlock(RW_SPINLOCK *rw_spinlock);
+void rw_spinlock_write_lock(RW_SPINLOCK *rw_spinlock);
+void rw_spinlock_write_unlock(RW_SPINLOCK *rw_spinlock);
+bool rw_spinlock_tryread_lock(RW_SPINLOCK *rw_spinlock);
+bool rw_spinlock_trywrite_lock(RW_SPINLOCK *rw_spinlock);
+
+#ifdef NETDATA_TRACE_RWLOCKS
+
+typedef enum {
+    RWLOCK_REQUEST_READ = (1 << 0),
+    RWLOCK_REQUEST_WRITE = (1 << 1),
+    RWLOCK_REQUEST_TRYREAD = (1 << 2),
+    RWLOCK_REQUEST_TRYWRITE = (1 << 3),
+} LOCKER_REQUEST;
+
+typedef struct netdata_rwlock_locker {
+    LOCKER_REQUEST lock;
+    bool got_it;
+    pid_t pid;
+    size_t refcount;
+    const char *tag;
+    const char *file;
+    const char *function;
+    unsigned long line;
+    struct netdata_rwlock_locker *next, *prev;
+} netdata_rwlock_locker;
+
+typedef struct netdata_rwlock_t {
+    pthread_rwlock_t rwlock_t;       // the lock
+    size_t readers;                  // the number of readers on the lock
+    size_t writers;                  // the number of writers on the lock
+    netdata_mutex_t lockers_mutex;   // a mutex to protect the linked list of the lock holding threads
+    netdata_rwlock_locker *lockers;  // the linked list of the lock holding threads
+    Pvoid_t lockers_pid_JudyL;
+} netdata_rwlock_t;
+
+#define NETDATA_RWLOCK_INITIALIZER { \
+        .rwlock_t = PTHREAD_RWLOCK_INITIALIZER, \
+        .readers = 0, \
+        .writers = 0, \
+        .lockers_mutex = NETDATA_MUTEX_INITIALIZER, \
+        .lockers = NULL, \
+        .lockers_pid_JudyL = NULL, \
+    }
+
+#else // NETDATA_TRACE_RWLOCKS
+
+typedef struct netdata_rwlock_t {
+    pthread_rwlock_t rwlock_t;
+} netdata_rwlock_t;
+
+#define NETDATA_RWLOCK_INITIALIZER { \
+        .rwlock_t = PTHREAD_RWLOCK_INITIALIZER \
+    }
+
+#endif // NETDATA_TRACE_RWLOCKS
+
+int __netdata_mutex_init(netdata_mutex_t *mutex);
+int __netdata_mutex_destroy(netdata_mutex_t *mutex);
+int __netdata_mutex_lock(netdata_mutex_t *mutex);
+int __netdata_mutex_trylock(netdata_mutex_t *mutex);
+int __netdata_mutex_unlock(netdata_mutex_t *mutex);
+
+int __netdata_rwlock_destroy(netdata_rwlock_t *rwlock);
+int __netdata_rwlock_init(netdata_rwlock_t *rwlock);
+int __netdata_rwlock_rdlock(netdata_rwlock_t *rwlock);
+int __netdata_rwlock_wrlock(netdata_rwlock_t *rwlock);
+int __netdata_rwlock_rdunlock(netdata_rwlock_t *rwlock);
+int __netdata_rwlock_wrunlock(netdata_rwlock_t *rwlock);
+int __netdata_rwlock_tryrdlock(netdata_rwlock_t *rwlock);
+int __netdata_rwlock_trywrlock(netdata_rwlock_t *rwlock);
+
+#ifdef NETDATA_TRACE_RWLOCKS
+
+int netdata_mutex_init_debug( const char *file, const char *function, const unsigned long line, netdata_mutex_t *mutex);
+int netdata_mutex_destroy_debug( const char *file, const char *function, const unsigned long line, netdata_mutex_t *mutex);
+int netdata_mutex_lock_debug( const char *file, const char *function, const unsigned long line, netdata_mutex_t *mutex);
+int netdata_mutex_trylock_debug( const char *file, const char *function, const unsigned long line, netdata_mutex_t *mutex);
+int netdata_mutex_unlock_debug( const char *file, const char *function, const unsigned long line, netdata_mutex_t *mutex);
+
+int netdata_rwlock_destroy_debug( const char *file, const char *function, const unsigned long line, netdata_rwlock_t *rwlock);
+int netdata_rwlock_init_debug( const char *file, const char *function, const unsigned long line, netdata_rwlock_t *rwlock);
+int netdata_rwlock_rdlock_debug( const char *file, const char *function, const unsigned long line, netdata_rwlock_t *rwlock);
+int netdata_rwlock_wrlock_debug( const char *file, const char *function, const unsigned long line, netdata_rwlock_t *rwlock);
+int netdata_rwlock_rdunlock_debug( const char *file, const char *function, const unsigned long line, netdata_rwlock_t *rwlock);
+int netdata_rwlock_wrunlock_debug( const char *file, const char *function, const unsigned long line, netdata_rwlock_t *rwlock);
+int netdata_rwlock_tryrdlock_debug( const char *file, const char *function, const unsigned long line, netdata_rwlock_t *rwlock);
+int netdata_rwlock_trywrlock_debug( const char *file, const char *function, const unsigned long line, netdata_rwlock_t *rwlock);
+
+#define netdata_mutex_init(mutex) netdata_mutex_init_debug(__FILE__, __FUNCTION__, __LINE__, mutex)
+#define netdata_mutex_destroy(mutex) netdata_mutex_destroy_debug(__FILE__, __FUNCTION__, __LINE__, mutex)
+#define netdata_mutex_lock(mutex) netdata_mutex_lock_debug(__FILE__, __FUNCTION__, __LINE__, mutex)
+#define netdata_mutex_trylock(mutex) netdata_mutex_trylock_debug(__FILE__, __FUNCTION__, __LINE__, mutex)
+#define netdata_mutex_unlock(mutex) netdata_mutex_unlock_debug(__FILE__, __FUNCTION__, __LINE__, mutex)
+
+#define netdata_rwlock_destroy(rwlock) netdata_rwlock_destroy_debug(__FILE__, __FUNCTION__, __LINE__, rwlock)
+#define netdata_rwlock_init(rwlock) netdata_rwlock_init_debug(__FILE__, __FUNCTION__, __LINE__, rwlock)
+#define netdata_rwlock_rdlock(rwlock) netdata_rwlock_rdlock_debug(__FILE__, __FUNCTION__, __LINE__, rwlock)
+#define netdata_rwlock_wrlock(rwlock) netdata_rwlock_wrlock_debug(__FILE__, __FUNCTION__, __LINE__, rwlock)
+#define netdata_rwlock_rdunlock(rwlock) netdata_rwlock_rdunlock_debug(__FILE__, __FUNCTION__, __LINE__, rwlock)
+#define netdata_rwlock_wrunlock(rwlock) netdata_rwlock_wrunlock_debug(__FILE__, __FUNCTION__, __LINE__, rwlock)
+#define netdata_rwlock_tryrdlock(rwlock) netdata_rwlock_tryrdlock_debug(__FILE__, __FUNCTION__, __LINE__, rwlock)
+#define netdata_rwlock_trywrlock(rwlock) netdata_rwlock_trywrlock_debug(__FILE__, __FUNCTION__, __LINE__, rwlock)
+
+#else // !NETDATA_TRACE_RWLOCKS
+
+#define netdata_mutex_init(mutex) __netdata_mutex_init(mutex)
+#define netdata_mutex_destroy(mutex) __netdata_mutex_destroy(mutex)
+#define netdata_mutex_lock(mutex) __netdata_mutex_lock(mutex)
+#define netdata_mutex_trylock(mutex) __netdata_mutex_trylock(mutex)
+#define netdata_mutex_unlock(mutex) __netdata_mutex_unlock(mutex)
+
+#define netdata_rwlock_destroy(rwlock) __netdata_rwlock_destroy(rwlock)
+#define netdata_rwlock_init(rwlock) __netdata_rwlock_init(rwlock)
+#define netdata_rwlock_rdlock(rwlock) __netdata_rwlock_rdlock(rwlock)
+#define netdata_rwlock_wrlock(rwlock) __netdata_rwlock_wrlock(rwlock)
+#define netdata_rwlock_rdunlock(rwlock) __netdata_rwlock_rdunlock(rwlock)
+#define netdata_rwlock_wrunlock(rwlock) __netdata_rwlock_wrunlock(rwlock)
+#define netdata_rwlock_tryrdlock(rwlock) __netdata_rwlock_tryrdlock(rwlock)
+#define netdata_rwlock_trywrlock(rwlock) __netdata_rwlock_trywrlock(rwlock)
+
+#endif // NETDATA_TRACE_RWLOCKS
+
+#endif //NETDATA_LOCKS_H
diff --git a/src/libnetdata/log/README.md b/src/libnetdata/log/README.md
new file mode 100644
index 00000000..ef9ca1ef
--- /dev/null
+++ b/src/libnetdata/log/README.md
@@ -0,0 +1,223 @@
+<!--
+title: "Log"
+custom_edit_url: https://github.com/netdata/netdata/edit/master/src/libnetdata/log/README.md
+sidebar_label: "Log"
+learn_status: "Published"
+learn_topic_type: "Tasks"
+learn_rel_path: "Developers/libnetdata"
+-->
+
+# Netdata Logging
+
+This document describes how Netdata generates its own logs, not how Netdata manages and queries log databases.
+
+## Log sources
+
+Netdata supports the following log sources:
+
+1. **daemon**, logs generated by the Netdata daemon.
+2. **collector**, logs generated by Netdata collectors, including internal and external ones.
+3. **access**, API requests received by Netdata.
+4. **health**, all alert transitions and notifications.
+
+## Log outputs
+
+For each log source, Netdata supports the following output methods:
+
+- **off**, to disable this log source.
+- **journal**, to send the logs to systemd-journal.
+- **syslog**, to send the logs to syslog.
+- **system**, to send the output to `stderr` or `stdout`, depending on the log source.
+- **stdout**, to write the logs to Netdata's `stdout`.
+- **stderr**, to write the logs to Netdata's `stderr`.
+- **filename**, to send the logs to a file.
+
+For `daemon` and `collector` the default is `journal` when systemd-journal is available.
+To decide if systemd-journal is available, Netdata checks:
+
+1. `stderr` is connected to systemd-journald
+2. `/run/systemd/journal/socket` exists
+3. `/host/run/systemd/journal/socket` exists (`/host` is configurable in containers)
+
+If any of the above is detected, Netdata will select `journal` for the `daemon` and `collector` sources.
+
+All other sources default to a file.
+
+## Log formats
+
+| Format  | Description                                                                                            |
+|---------|--------------------------------------------------------------------------------------------------------|
+| journal | journald-specific log format. Automatically selected when logging to systemd-journal.                   |
+| logfmt  | logs data as a series of key/value pairs. The default when logging to any output other than `journal`.  |
+| json    | logs data in JSON format.                                                                                |
+
+## Log levels
+
+Each time Netdata logs, it assigns a priority to the log. It can be one of these (in order of importance):
+
+| Level     | Description                                                                              |
+|-----------|------------------------------------------------------------------------------------------|
+| emergency | a fatal condition, Netdata will most likely exit immediately after.                       |
+| alert     | a very important issue that may affect how Netdata operates.                              |
+| critical  | a very important issue that the user should know about, which Netdata thinks it can survive. |
+| error     | an error condition, indicating that Netdata tried to do something, but it failed.         |
+| warning   | something unexpected has happened that may or may not affect the operation of Netdata.    |
+| notice    | something that does not affect the operation of Netdata, but the user should notice.      |
+| info      | the default log level, for information the user should know.                              |
+| debug     | these are more verbose logs that can be ignored.                                          |
+
+## Logs Configuration
+
+In `netdata.conf`, there are the following settings:
+
+```
+[logs]
+    # logs to trigger flood protection = 1000
+    # logs flood protection period = 60
+    # facility = daemon
+    # level = info
+    # daemon = journal
+    # collector = journal
+    # access = /var/log/netdata/access.log
+    # health = /var/log/netdata/health.log
+```
+
+- `logs to trigger flood protection` and `logs flood protection period` enable logs flood protection for the `daemon` and `collector` sources. They can also be configured per log source.
+- `facility` is used only when Netdata logs to syslog.
+- `level` defines the minimum [log level](#log-levels) of logs that will be logged. This setting is applied only to the `daemon` and `collector` sources. It can also be configured per source.
+
+### Configuring log sources
+
+Each of the sources (`daemon`, `collector`, `access`, `health`) accepts the following:
+
+```
+source = {FORMAT},level={LEVEL},protection={LOGS}/{PERIOD}@{OUTPUT}
+```
+
+Where:
+
+- `{FORMAT}` is one of the [log formats](#log-formats),
+- `{LEVEL}` is the minimum [log level](#log-levels) to be logged,
+- `{LOGS}` is the number of `logs to trigger flood protection` configured per output,
+- `{PERIOD}` is the equivalent of `logs flood protection period` configured per output,
+- `{OUTPUT}` is one of the [log outputs](#log-outputs).
+
+All parameters can be omitted, except `{OUTPUT}`. If `{OUTPUT}` is the only given parameter, `@` can be omitted. A complete example is given below, after the notes on log rotation.
+
+### Logs rotation
+
+Netdata comes with a `logrotate` configuration to rotate its log files periodically.
+
+The default is usually found in `/etc/logrotate.d/netdata`.
+
+Sending a `SIGHUP` to Netdata will instruct it to re-open all its log files.
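+
+As promised above, here is a complete, hypothetical example of the source syntax (the file path and the numbers are illustrative, not defaults): it keeps `daemon` logs in systemd-journal, and writes `collector` logs in JSON format to a file, at `warning` level, allowing at most 500 log entries every 60 seconds:
+
+```
+[logs]
+    daemon = journal
+    collector = json,level=warning,protection=500/60@/var/log/netdata/collector.log
+```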
+
+## Log Fields
+
+<details>
+<summary>All fields exposed by Netdata</summary>
+
+| journal | logfmt | json | Description |
+|:-------:|:------:|:----:|:------------|
+| `_SOURCE_REALTIME_TIMESTAMP` | `time` | `time` | the timestamp of the event |
+| `SYSLOG_IDENTIFIER` | `comm` | `comm` | the program logging the event |
+| `ND_LOG_SOURCE` | `source` | `source` | one of the [log sources](#log-sources) |
+| `PRIORITY`<br/>numeric | `level`<br/>text | `level`<br/>numeric | one of the [log levels](#log-levels) |
+| `ERRNO` | `errno` | `errno` | the numeric value of `errno` |
+| `INVOCATION_ID` | - | - | a unique UUID of the Netdata session, reset on every Netdata restart, inherited by systemd when available |
+| `CODE_LINE` | - | - | the line number of the source code logging this event |
+| `CODE_FILE` | - | - | the filename of the source code logging this event |
+| `CODE_FUNCTION` | - | - | the function name of the source code logging this event |
+| `TID` | `tid` | `tid` | the thread id of the thread logging this event |
+| `THREAD_TAG` | `thread` | `thread` | the name of the thread logging this event |
+| `MESSAGE_ID` | `msg_id` | `msg_id` | see [message IDs](#message-ids) |
+| `ND_MODULE` | `module` | `module` | the Netdata module logging this event |
+| `ND_NIDL_NODE` | `node` | `node` | the hostname of the node the event is related to |
+| `ND_NIDL_INSTANCE` | `instance` | `instance` | the instance of the node the event is related to |
+| `ND_NIDL_CONTEXT` | `context` | `context` | the context the event is related to (this is usually the chart name, as shown on netdata dashboards) |
+| `ND_NIDL_DIMENSION` | `dimension` | `dimension` | the dimension the event is related to |
+| `ND_SRC_TRANSPORT` | `src_transport` | `src_transport` | when the event happened during a request, this is the request transport |
+| `ND_SRC_IP` | `src_ip` | `src_ip` | when the event happened during an inbound request, this is the IP the request came from |
+| `ND_SRC_PORT` | `src_port` | `src_port` | when the event happened during an inbound request, this is the port the request came from |
+| `ND_SRC_FORWARDED_HOST` | `src_forwarded_host` | `src_forwarded_host` | the contents of the HTTP header `X-Forwarded-Host` |
+| `ND_SRC_FORWARDED_FOR` | `src_forwarded_for` | `src_forwarded_for` | the contents of the HTTP header `X-Forwarded-For` |
+| `ND_SRC_CAPABILITIES` | `src_capabilities` | `src_capabilities` | when the request came from a child, this is the communication capabilities of the child |
+| `ND_DST_TRANSPORT` | `dst_transport` | `dst_transport` | when the event happened during an outbound request, this is the outbound request transport |
+| `ND_DST_IP` | `dst_ip` | `dst_ip` | when the event happened during an outbound request, this is the IP of the request destination |
+| `ND_DST_PORT` | `dst_port` | `dst_port` | when the event happened during an outbound request, this is the port of the request destination |
+| `ND_DST_CAPABILITIES` | `dst_capabilities` | `dst_capabilities` | when the request goes to a parent, this is the communication capabilities of the parent |
+| `ND_REQUEST_METHOD` | `req_method` | `req_method` | when the event happened during an inbound request, this is the method by which the request was received |
+| `ND_RESPONSE_CODE` | `code` | `code` | when responding to a request, this is the response code |
+| `ND_CONNECTION_ID` | `conn` | `conn` | when there is a connection id for an inbound connection, this is the connection id |
+| `ND_TRANSACTION_ID` | `transaction` | `transaction` | the transaction id (UUID) of all API requests |
+| `ND_RESPONSE_SENT_BYTES` | `sent_bytes` | `sent_bytes` | the bytes we sent in API responses |
+| `ND_RESPONSE_SIZE_BYTES` | `size_bytes` | `size_bytes` | the uncompressed bytes of the API responses |
+| `ND_RESPONSE_PREP_TIME_USEC` | `prep_ut` | `prep_ut` | the time needed to prepare a response |
+| `ND_RESPONSE_SENT_TIME_USEC` | `sent_ut` | `sent_ut` | the time needed to send a response |
+| `ND_RESPONSE_TOTAL_TIME_USEC` | `total_ut` | `total_ut` | the total time needed to complete a response |
+| `ND_ALERT_ID` | `alert_id` | `alert_id` | the alert id this event is related to |
+| `ND_ALERT_EVENT_ID` | `alert_event_id` | `alert_event_id` | a sequential number of the alert transition (per host) |
+| `ND_ALERT_UNIQUE_ID` | `alert_unique_id` | `alert_unique_id` | a sequential number of the alert transition (per alert) |
+| `ND_ALERT_TRANSITION_ID` | `alert_transition_id` | `alert_transition_id` | the unique UUID of this alert transition |
+| `ND_ALERT_CONFIG` | `alert_config` | `alert_config` | the alert configuration hash (UUID) |
+| `ND_ALERT_NAME` | `alert` | `alert` | the alert name |
+| `ND_ALERT_CLASS` | `alert_class` | `alert_class` | the alert classification |
+| `ND_ALERT_COMPONENT` | `alert_component` | `alert_component` | the alert component |
+| `ND_ALERT_TYPE` | `alert_type` | `alert_type` | the alert type |
+| `ND_ALERT_EXEC` | `alert_exec` | `alert_exec` | the alert notification program |
+| `ND_ALERT_RECIPIENT` | `alert_recipient` | `alert_recipient` | the alert recipient(s) |
+| `ND_ALERT_VALUE` | `alert_value` | `alert_value` | the current alert value |
+| `ND_ALERT_VALUE_OLD` | `alert_value_old` | `alert_value_old` | the previous alert value |
+| `ND_ALERT_STATUS` | `alert_status` | `alert_status` | the current alert status |
+| `ND_ALERT_STATUS_OLD` | `alert_status_old` | `alert_status_old` | the previous alert status |
+| `ND_ALERT_UNITS` | `alert_units` | `alert_units` | the units of the alert |
+| `ND_ALERT_SUMMARY` | `alert_summary` | `alert_summary` | the summary text of the alert |
+| `ND_ALERT_INFO` | `alert_info` | `alert_info` | the info text of the alert |
+| `ND_ALERT_DURATION` | `alert_duration` | `alert_duration` | the duration the alert was in its previous state |
+| `ND_ALERT_NOTIFICATION_TIMESTAMP_USEC` | `alert_notification_timestamp` | `alert_notification_timestamp` | the timestamp at which the notification delivery is scheduled |
+| `ND_REQUEST` | `request` | `request` | the full request during which the event happened |
+| `MESSAGE` | `msg` | `msg` | the event message |
+
+</details>
+
+### Message IDs
+
+Netdata assigns specific message IDs to certain events:
+
+- `ed4cdb8f1beb4ad3b57cb3cae2d162fa` when a Netdata child connects to this Netdata
+- `6e2e3839067648968b646045dbf28d66` when this Netdata connects to a Netdata parent
+- `9ce0cb58ab8b44df82c4bf1ad9ee22de` when alerts change state
+- `6db0018e83e34320ae2a659d78019fb7` when notifications are sent
+
+You can view these events using the Netdata systemd-journal.plugin with the `MESSAGE_ID` filter,
+or using `journalctl` like this:
+
+```bash
+# query children connection
+journalctl MESSAGE_ID=ed4cdb8f1beb4ad3b57cb3cae2d162fa
+
+# query parent connection
+journalctl MESSAGE_ID=6e2e3839067648968b646045dbf28d66
+
+# query alert transitions
+journalctl
MESSAGE_ID=9ce0cb58ab8b44df82c4bf1ad9ee22de + +# query alert notifications +journalctl MESSAGE_ID=6db0018e83e34320ae2a659d78019fb7 +``` + +## Using journalctl to query Netdata logs + +The Netdata service's processes execute within the `netdata` journal namespace. To view the Netdata logs, you should +specify the `--namespace=netdata` option. + +```bash +# Netdata logs since the last time the service was started +journalctl _SYSTEMD_INVOCATION_ID="$(systemctl show --value --property=InvocationID netdata)" --namespace=netdata + +# All netdata logs, the oldest entries are displayed first +journalctl -u netdata --namespace=netdata + +# All netdata logs, the newest entries are displayed first +journalctl -u netdata --namespace=netdata -r +``` diff --git a/src/libnetdata/log/journal.c b/src/libnetdata/log/journal.c new file mode 100644 index 00000000..2182212f --- /dev/null +++ b/src/libnetdata/log/journal.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "journal.h" + +bool is_path_unix_socket(const char *path) { + // Check if the path is valid + if(!path || !*path) + return false; + + struct stat statbuf; + + // Use stat to check if the file exists and is a socket + if (stat(path, &statbuf) == -1) + // The file does not exist or cannot be accessed + return false; + + // Check if the file is a socket + if (S_ISSOCK(statbuf.st_mode)) + return true; + + return false; +} + +bool is_stderr_connected_to_journal(void) { + const char *journal_stream = getenv("JOURNAL_STREAM"); + if (!journal_stream) + return false; // JOURNAL_STREAM is not set + + struct stat stderr_stat; + if (fstat(STDERR_FILENO, &stderr_stat) < 0) + return false; // Error in getting stderr info + + // Parse device and inode from JOURNAL_STREAM + char *endptr; + long journal_dev = strtol(journal_stream, &endptr, 10); + if (*endptr != ':') + return false; // Format error in JOURNAL_STREAM + + long journal_ino = strtol(endptr + 1, NULL, 10); + + return (stderr_stat.st_dev == (dev_t)journal_dev) && (stderr_stat.st_ino == (ino_t)journal_ino); +} + +int journal_direct_fd(const char *path) { + if(!path || !*path) + path = JOURNAL_DIRECT_SOCKET; + + if(!is_path_unix_socket(path)) + return -1; + + int fd = socket(AF_UNIX, SOCK_DGRAM| DEFAULT_SOCKET_FLAGS, 0); + if (fd < 0) return -1; + + sock_setcloexec(fd); + + struct sockaddr_un addr; + memset(&addr, 0, sizeof(struct sockaddr_un)); + addr.sun_family = AF_UNIX; + strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1); + + // Connect the socket (optional, but can simplify send operations) + if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) { + close(fd); + return -1; + } + + return fd; +} + +static inline bool journal_send_with_memfd(int fd __maybe_unused, const char *msg __maybe_unused, size_t msg_len __maybe_unused) { +#if defined(__NR_memfd_create) && defined(MFD_ALLOW_SEALING) && defined(F_ADD_SEALS) && defined(F_SEAL_SHRINK) && defined(F_SEAL_GROW) && defined(F_SEAL_WRITE) + // Create a memory file descriptor + int memfd = (int)syscall(__NR_memfd_create, "journald", MFD_ALLOW_SEALING); + if (memfd < 0) return false; + + // Write data to the memfd + if (write(memfd, msg, msg_len) != (ssize_t)msg_len) { + close(memfd); + return false; + } + + // Seal the memfd to make it immutable + if (fcntl(memfd, F_ADD_SEALS, F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE) < 0) { + close(memfd); + return false; + } + + struct iovec iov = {0}; + struct msghdr msghdr = {0}; + struct cmsghdr *cmsghdr; + char cmsgbuf[CMSG_SPACE(sizeof(int))]; + + msghdr.msg_iov = &iov; 
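+    // note: the iovec stays empty on purpose - journald reads the payload from the sealed memfd passed below via SCM_RIGHTS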
+ msghdr.msg_iovlen = 1; + msghdr.msg_control = cmsgbuf; + msghdr.msg_controllen = sizeof(cmsgbuf); + + cmsghdr = CMSG_FIRSTHDR(&msghdr); + if(!cmsghdr) { + close(memfd); + return false; + } + + cmsghdr->cmsg_level = SOL_SOCKET; + cmsghdr->cmsg_type = SCM_RIGHTS; + cmsghdr->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsghdr), &memfd, sizeof(int)); + + ssize_t r = sendmsg(fd, &msghdr, 0); + + close(memfd); + return r >= 0; +#else + return false; +#endif +} + +bool journal_direct_send(int fd, const char *msg, size_t msg_len) { + // Send the datagram + if (send(fd, msg, msg_len, 0) < 0) { + if(errno != EMSGSIZE) + return false; + + // datagram is too large, fallback to memfd + if(!journal_send_with_memfd(fd, msg, msg_len)) + return false; + } + + return true; +} + +void journal_construct_path(char *dst, size_t dst_len, const char *host_prefix, const char *namespace_str) { + if(!host_prefix) + host_prefix = ""; + + if(namespace_str) + snprintfz(dst, dst_len, "%s/run/systemd/journal.%s/socket", + host_prefix, namespace_str); + else + snprintfz(dst, dst_len, "%s" JOURNAL_DIRECT_SOCKET, + host_prefix); +} diff --git a/src/libnetdata/log/journal.h b/src/libnetdata/log/journal.h new file mode 100644 index 00000000..df8ece18 --- /dev/null +++ b/src/libnetdata/log/journal.h @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +#ifndef NETDATA_LOG_JOURNAL_H +#define NETDATA_LOG_JOURNAL_H + +#define JOURNAL_DIRECT_SOCKET "/run/systemd/journal/socket" + +void journal_construct_path(char *dst, size_t dst_len, const char *host_prefix, const char *namespace_str); + +int journal_direct_fd(const char *path); +bool journal_direct_send(int fd, const char *msg, size_t msg_len); + +bool is_path_unix_socket(const char *path); +bool is_stderr_connected_to_journal(void); + +#endif //NETDATA_LOG_JOURNAL_H diff --git a/src/libnetdata/log/log.c b/src/libnetdata/log/log.c new file mode 100644 index 00000000..501b6632 --- /dev/null +++ b/src/libnetdata/log/log.c @@ -0,0 +1,2427 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +// do not REMOVE this, it is used by systemd-journal includes to prevent saving the file, function, line of the +// source code that makes the calls, allowing our loggers to log the lines of source code that actually log +#define SD_JOURNAL_SUPPRESS_LOCATION + +#include "../libnetdata.h" + +#ifdef __FreeBSD__ +#include <sys/endian.h> +#endif + +#ifdef __APPLE__ +#include <machine/endian.h> +#endif + +#if !defined(ENABLE_SENTRY) && defined(HAVE_BACKTRACE) +#include <execinfo.h> +#endif + +#ifdef HAVE_SYSTEMD +#include <systemd/sd-journal.h> +#endif + +const char *program_name = ""; + +uint64_t debug_flags = 0; + +#ifdef ENABLE_ACLK +int aclklog_enabled = 0; +#endif + +// ---------------------------------------------------------------------------- + +struct nd_log_source; +static bool nd_log_limit_reached(struct nd_log_source *source); + +// ---------------------------------------------------------------------------- +// logging method + +typedef enum __attribute__((__packed__)) { + NDLM_DISABLED = 0, + NDLM_DEVNULL, + NDLM_DEFAULT, + NDLM_JOURNAL, + NDLM_SYSLOG, + NDLM_STDOUT, + NDLM_STDERR, + NDLM_FILE, +} ND_LOG_METHOD; + +static struct { + ND_LOG_METHOD method; + const char *name; +} nd_log_methods[] = { + { .method = NDLM_DISABLED, .name = "none" }, + { .method = NDLM_DEVNULL, .name = "/dev/null" }, + { .method = NDLM_DEFAULT, .name = "default" }, + { .method = NDLM_JOURNAL, .name = "journal" }, + { .method = NDLM_SYSLOG, .name = "syslog" }, 
+ { .method = NDLM_STDOUT, .name = "stdout" }, + { .method = NDLM_STDERR, .name = "stderr" }, + { .method = NDLM_FILE, .name = "file" }, +}; + +static ND_LOG_METHOD nd_log_method2id(const char *method) { + if(!method || !*method) + return NDLM_DEFAULT; + + size_t entries = sizeof(nd_log_methods) / sizeof(nd_log_methods[0]); + for(size_t i = 0; i < entries ;i++) { + if(strcmp(nd_log_methods[i].name, method) == 0) + return nd_log_methods[i].method; + } + + return NDLM_FILE; +} + +static const char *nd_log_id2method(ND_LOG_METHOD method) { + size_t entries = sizeof(nd_log_methods) / sizeof(nd_log_methods[0]); + for(size_t i = 0; i < entries ;i++) { + if(method == nd_log_methods[i].method) + return nd_log_methods[i].name; + } + + return "unknown"; +} + +#define IS_VALID_LOG_METHOD_FOR_EXTERNAL_PLUGINS(ndlo) ((ndlo) == NDLM_JOURNAL || (ndlo) == NDLM_SYSLOG || (ndlo) == NDLM_STDERR) + +const char *nd_log_method_for_external_plugins(const char *s) { + if(s && *s) { + ND_LOG_METHOD method = nd_log_method2id(s); + if(IS_VALID_LOG_METHOD_FOR_EXTERNAL_PLUGINS(method)) + return nd_log_id2method(method); + } + + return nd_log_id2method(NDLM_STDERR); +} + +// ---------------------------------------------------------------------------- +// workaround strerror_r() + +#if defined(STRERROR_R_CHAR_P) +// GLIBC version of strerror_r +static const char *strerror_result(const char *a, const char *b) { (void)b; return a; } +#elif defined(HAVE_STRERROR_R) +// POSIX version of strerror_r +static const char *strerror_result(int a, const char *b) { (void)a; return b; } +#elif defined(HAVE_C__GENERIC) + +// what a trick! +// http://stackoverflow.com/questions/479207/function-overloading-in-c +static const char *strerror_result_int(int a, const char *b) { (void)a; return b; } +static const char *strerror_result_string(const char *a, const char *b) { (void)b; return a; } + +#define strerror_result(a, b) _Generic((a), \ + int: strerror_result_int, \ + char *: strerror_result_string \ + )(a, b) + +#else +#error "cannot detect the format of function strerror_r()" +#endif + +static const char *errno2str(int errnum, char *buf, size_t size) { + return strerror_result(strerror_r(errnum, buf, size), buf); +} + +// ---------------------------------------------------------------------------- +// facilities +// +// sys/syslog.h (Linux) +// sys/sys/syslog.h (FreeBSD) +// bsd/sys/syslog.h (darwin-xnu) + +static struct { + int facility; + const char *name; +} nd_log_facilities[] = { + { LOG_AUTH, "auth" }, + { LOG_AUTHPRIV, "authpriv" }, + { LOG_CRON, "cron" }, + { LOG_DAEMON, "daemon" }, + { LOG_FTP, "ftp" }, + { LOG_KERN, "kern" }, + { LOG_LPR, "lpr" }, + { LOG_MAIL, "mail" }, + { LOG_NEWS, "news" }, + { LOG_SYSLOG, "syslog" }, + { LOG_USER, "user" }, + { LOG_UUCP, "uucp" }, + { LOG_LOCAL0, "local0" }, + { LOG_LOCAL1, "local1" }, + { LOG_LOCAL2, "local2" }, + { LOG_LOCAL3, "local3" }, + { LOG_LOCAL4, "local4" }, + { LOG_LOCAL5, "local5" }, + { LOG_LOCAL6, "local6" }, + { LOG_LOCAL7, "local7" }, + +#ifdef __FreeBSD__ + { LOG_CONSOLE, "console" }, + { LOG_NTP, "ntp" }, + + // FreeBSD does not consider 'security' as deprecated. + { LOG_SECURITY, "security" }, +#else + // For all other O/S 'security' is mapped to 'auth'. 
+ { LOG_AUTH, "security" }, +#endif + +#ifdef __APPLE__ + { LOG_INSTALL, "install" }, + { LOG_NETINFO, "netinfo" }, + { LOG_RAS, "ras" }, + { LOG_REMOTEAUTH, "remoteauth" }, + { LOG_LAUNCHD, "launchd" }, + +#endif +}; + +static int nd_log_facility2id(const char *facility) { + size_t entries = sizeof(nd_log_facilities) / sizeof(nd_log_facilities[0]); + for(size_t i = 0; i < entries ;i++) { + if(strcmp(nd_log_facilities[i].name, facility) == 0) + return nd_log_facilities[i].facility; + } + + return LOG_DAEMON; +} + +static const char *nd_log_id2facility(int facility) { + size_t entries = sizeof(nd_log_facilities) / sizeof(nd_log_facilities[0]); + for(size_t i = 0; i < entries ;i++) { + if(nd_log_facilities[i].facility == facility) + return nd_log_facilities[i].name; + } + + return "daemon"; +} + +// ---------------------------------------------------------------------------- +// priorities + +static struct { + ND_LOG_FIELD_PRIORITY priority; + const char *name; +} nd_log_priorities[] = { + { .priority = NDLP_EMERG, .name = "emergency" }, + { .priority = NDLP_EMERG, .name = "emerg" }, + { .priority = NDLP_ALERT, .name = "alert" }, + { .priority = NDLP_CRIT, .name = "critical" }, + { .priority = NDLP_CRIT, .name = "crit" }, + { .priority = NDLP_ERR, .name = "error" }, + { .priority = NDLP_ERR, .name = "err" }, + { .priority = NDLP_WARNING, .name = "warning" }, + { .priority = NDLP_WARNING, .name = "warn" }, + { .priority = NDLP_NOTICE, .name = "notice" }, + { .priority = NDLP_INFO, .name = NDLP_INFO_STR }, + { .priority = NDLP_DEBUG, .name = "debug" }, +}; + +int nd_log_priority2id(const char *priority) { + size_t entries = sizeof(nd_log_priorities) / sizeof(nd_log_priorities[0]); + for(size_t i = 0; i < entries ;i++) { + if(strcmp(nd_log_priorities[i].name, priority) == 0) + return nd_log_priorities[i].priority; + } + + return NDLP_INFO; +} + +const char *nd_log_id2priority(ND_LOG_FIELD_PRIORITY priority) { + size_t entries = sizeof(nd_log_priorities) / sizeof(nd_log_priorities[0]); + for(size_t i = 0; i < entries ;i++) { + if(priority == nd_log_priorities[i].priority) + return nd_log_priorities[i].name; + } + + return NDLP_INFO_STR; +} + +// ---------------------------------------------------------------------------- +// log sources + +const char *nd_log_sources[] = { + [NDLS_UNSET] = "UNSET", + [NDLS_ACCESS] = "access", + [NDLS_ACLK] = "aclk", + [NDLS_COLLECTORS] = "collector", + [NDLS_DAEMON] = "daemon", + [NDLS_HEALTH] = "health", + [NDLS_DEBUG] = "debug", +}; + +size_t nd_log_source2id(const char *source, ND_LOG_SOURCES def) { + size_t entries = sizeof(nd_log_sources) / sizeof(nd_log_sources[0]); + for(size_t i = 0; i < entries ;i++) { + if(strcmp(nd_log_sources[i], source) == 0) + return i; + } + + return def; +} + + +static const char *nd_log_id2source(ND_LOG_SOURCES source) { + size_t entries = sizeof(nd_log_sources) / sizeof(nd_log_sources[0]); + if(source < entries) + return nd_log_sources[source]; + + return nd_log_sources[NDLS_COLLECTORS]; +} + +// ---------------------------------------------------------------------------- +// log output formats + +typedef enum __attribute__((__packed__)) { + NDLF_JOURNAL, + NDLF_LOGFMT, + NDLF_JSON, +} ND_LOG_FORMAT; + +static struct { + ND_LOG_FORMAT format; + const char *name; +} nd_log_formats[] = { + { .format = NDLF_JOURNAL, .name = "journal" }, + { .format = NDLF_LOGFMT, .name = "logfmt" }, + { .format = NDLF_JSON, .name = "json" }, +}; + +static ND_LOG_FORMAT nd_log_format2id(const char *format) { + if(!format || !*format) + return 
NDLF_LOGFMT; + + size_t entries = sizeof(nd_log_formats) / sizeof(nd_log_formats[0]); + for(size_t i = 0; i < entries ;i++) { + if(strcmp(nd_log_formats[i].name, format) == 0) + return nd_log_formats[i].format; + } + + return NDLF_LOGFMT; +} + +static const char *nd_log_id2format(ND_LOG_FORMAT format) { + size_t entries = sizeof(nd_log_formats) / sizeof(nd_log_formats[0]); + for(size_t i = 0; i < entries ;i++) { + if(format == nd_log_formats[i].format) + return nd_log_formats[i].name; + } + + return "logfmt"; +} + +// ---------------------------------------------------------------------------- +// format dates + +void log_date(char *buffer, size_t len, time_t now) { + if(unlikely(!buffer || !len)) + return; + + time_t t = now; + struct tm *tmp, tmbuf; + + tmp = localtime_r(&t, &tmbuf); + + if (unlikely(!tmp)) { + buffer[0] = '\0'; + return; + } + + if (unlikely(strftime(buffer, len, "%Y-%m-%d %H:%M:%S", tmp) == 0)) + buffer[0] = '\0'; + + buffer[len - 1] = '\0'; +} + +// ---------------------------------------------------------------------------- + +struct nd_log_limit { + usec_t started_monotonic_ut; + uint32_t counter; + uint32_t prevented; + + uint32_t throttle_period; + uint32_t logs_per_period; + uint32_t logs_per_period_backup; +}; + +#define ND_LOG_LIMITS_DEFAULT (struct nd_log_limit){ .logs_per_period = ND_LOG_DEFAULT_THROTTLE_LOGS, .logs_per_period_backup = ND_LOG_DEFAULT_THROTTLE_LOGS, .throttle_period = ND_LOG_DEFAULT_THROTTLE_PERIOD, } +#define ND_LOG_LIMITS_UNLIMITED (struct nd_log_limit){ .logs_per_period = 0, .logs_per_period_backup = 0, .throttle_period = 0, } + +struct nd_log_source { + SPINLOCK spinlock; + ND_LOG_METHOD method; + ND_LOG_FORMAT format; + const char *filename; + int fd; + FILE *fp; + + ND_LOG_FIELD_PRIORITY min_priority; + const char *pending_msg; + struct nd_log_limit limits; +}; + +static struct { + nd_uuid_t invocation_id; + + ND_LOG_SOURCES overwrite_process_source; + + struct nd_log_source sources[_NDLS_MAX]; + + struct { + bool initialized; + } journal; + + struct { + bool initialized; + int fd; + char filename[FILENAME_MAX + 1]; + } journal_direct; + + struct { + bool initialized; + int facility; + } syslog; + + struct { + SPINLOCK spinlock; + bool initialized; + } std_output; + + struct { + SPINLOCK spinlock; + bool initialized; + } std_error; + +} nd_log = { + .overwrite_process_source = 0, + .journal = { + .initialized = false, + }, + .journal_direct = { + .initialized = false, + .fd = -1, + }, + .syslog = { + .initialized = false, + .facility = LOG_DAEMON, + }, + .std_output = { + .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .initialized = false, + }, + .std_error = { + .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .initialized = false, + }, + .sources = { + [NDLS_UNSET] = { + .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .method = NDLM_DISABLED, + .format = NDLF_JOURNAL, + .filename = NULL, + .fd = -1, + .fp = NULL, + .min_priority = NDLP_EMERG, + .limits = ND_LOG_LIMITS_UNLIMITED, + }, + [NDLS_ACCESS] = { + .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .method = NDLM_DEFAULT, + .format = NDLF_LOGFMT, + .filename = LOG_DIR "/access.log", + .fd = -1, + .fp = NULL, + .min_priority = NDLP_DEBUG, + .limits = ND_LOG_LIMITS_UNLIMITED, + }, + [NDLS_ACLK] = { + .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .method = NDLM_FILE, + .format = NDLF_LOGFMT, + .filename = LOG_DIR "/aclk.log", + .fd = -1, + .fp = NULL, + .min_priority = NDLP_DEBUG, + .limits = ND_LOG_LIMITS_UNLIMITED, + }, + [NDLS_COLLECTORS] = { + .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .method = 
NDLM_DEFAULT, + .format = NDLF_LOGFMT, + .filename = LOG_DIR "/collectors.log", + .fd = STDERR_FILENO, + .fp = NULL, + .min_priority = NDLP_INFO, + .limits = ND_LOG_LIMITS_DEFAULT, + }, + [NDLS_DEBUG] = { + .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .method = NDLM_DISABLED, + .format = NDLF_LOGFMT, + .filename = LOG_DIR "/debug.log", + .fd = STDOUT_FILENO, + .fp = NULL, + .min_priority = NDLP_DEBUG, + .limits = ND_LOG_LIMITS_UNLIMITED, + }, + [NDLS_DAEMON] = { + .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .method = NDLM_DEFAULT, + .filename = LOG_DIR "/daemon.log", + .format = NDLF_LOGFMT, + .fd = -1, + .fp = NULL, + .min_priority = NDLP_INFO, + .limits = ND_LOG_LIMITS_DEFAULT, + }, + [NDLS_HEALTH] = { + .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .method = NDLM_DEFAULT, + .format = NDLF_LOGFMT, + .filename = LOG_DIR "/health.log", + .fd = -1, + .fp = NULL, + .min_priority = NDLP_DEBUG, + .limits = ND_LOG_LIMITS_UNLIMITED, + }, + }, +}; + +__attribute__((constructor)) void initialize_invocation_id(void) { + // check for a NETDATA_INVOCATION_ID + if(uuid_parse_flexi(getenv("NETDATA_INVOCATION_ID"), nd_log.invocation_id) != 0) { + // not found, check for systemd set INVOCATION_ID + if(uuid_parse_flexi(getenv("INVOCATION_ID"), nd_log.invocation_id) != 0) { + // not found, generate a new one + uuid_generate_random(nd_log.invocation_id); + } + } + + char uuid[UUID_COMPACT_STR_LEN]; + uuid_unparse_lower_compact(nd_log.invocation_id, uuid); + setenv("NETDATA_INVOCATION_ID", uuid, 1); +} + +int nd_log_health_fd(void) { + if(nd_log.sources[NDLS_HEALTH].method == NDLM_FILE && nd_log.sources[NDLS_HEALTH].fd != -1) + return nd_log.sources[NDLS_HEALTH].fd; + + return STDERR_FILENO; +} + +void nd_log_set_user_settings(ND_LOG_SOURCES source, const char *setting) { + char buf[FILENAME_MAX + 100]; + if(setting && *setting) + strncpyz(buf, setting, sizeof(buf) - 1); + else + buf[0] = '\0'; + + struct nd_log_source *ls = &nd_log.sources[source]; + char *output = strrchr(buf, '@'); + + if(!output) + // all of it is the output + output = buf; + else { + // we found an '@', the next char is the output + *output = '\0'; + output++; + + // parse the other params + char *remaining = buf; + while(remaining) { + char *value = strsep_skip_consecutive_separators(&remaining, ","); + if (!value || !*value) continue; + + char *name = strsep_skip_consecutive_separators(&value, "="); + if (!name || !*name) continue; + + if(strcmp(name, "logfmt") == 0) + ls->format = NDLF_LOGFMT; + else if(strcmp(name, "json") == 0) + ls->format = NDLF_JSON; + else if(strcmp(name, "journal") == 0) + ls->format = NDLF_JOURNAL; + else if(strcmp(name, "level") == 0 && value && *value) + ls->min_priority = nd_log_priority2id(value); + else if(strcmp(name, "protection") == 0 && value && *value) { + if(strcmp(value, "off") == 0 || strcmp(value, "none") == 0) { + ls->limits = ND_LOG_LIMITS_UNLIMITED; + ls->limits.counter = 0; + ls->limits.prevented = 0; + } + else { + ls->limits = ND_LOG_LIMITS_DEFAULT; + + char *slash = strchr(value, '/'); + if(slash) { + *slash = '\0'; + slash++; + ls->limits.logs_per_period = ls->limits.logs_per_period_backup = str2u(value); + ls->limits.throttle_period = str2u(slash); + } + else { + ls->limits.logs_per_period = ls->limits.logs_per_period_backup = str2u(value); + ls->limits.throttle_period = ND_LOG_DEFAULT_THROTTLE_PERIOD; + } + } + } + else + nd_log(NDLS_DAEMON, NDLP_ERR, "Error while parsing configuration of log source '%s'. 
" + "In config '%s', '%s' is not understood.", + nd_log_id2source(source), setting, name); + } + } + + if(!output || !*output || strcmp(output, "none") == 0 || strcmp(output, "off") == 0) { + ls->method = NDLM_DISABLED; + ls->filename = "/dev/null"; + } + else if(strcmp(output, "journal") == 0) { + ls->method = NDLM_JOURNAL; + ls->filename = NULL; + } + else if(strcmp(output, "syslog") == 0) { + ls->method = NDLM_SYSLOG; + ls->filename = NULL; + } + else if(strcmp(output, "/dev/null") == 0) { + ls->method = NDLM_DEVNULL; + ls->filename = "/dev/null"; + } + else if(strcmp(output, "system") == 0) { + if(ls->fd == STDERR_FILENO) { + ls->method = NDLM_STDERR; + ls->filename = NULL; + ls->fd = STDERR_FILENO; + } + else { + ls->method = NDLM_STDOUT; + ls->filename = NULL; + ls->fd = STDOUT_FILENO; + } + } + else if(strcmp(output, "stderr") == 0) { + ls->method = NDLM_STDERR; + ls->filename = NULL; + ls->fd = STDERR_FILENO; + } + else if(strcmp(output, "stdout") == 0) { + ls->method = NDLM_STDOUT; + ls->filename = NULL; + ls->fd = STDOUT_FILENO; + } + else { + ls->method = NDLM_FILE; + ls->filename = strdupz(output); + } + +#if defined(NETDATA_INTERNAL_CHECKS) || defined(NETDATA_DEV_MODE) + ls->min_priority = NDLP_DEBUG; +#endif + + if(source == NDLS_COLLECTORS) { + // set the method for the collector processes we will spawn + + ND_LOG_METHOD method; + ND_LOG_FORMAT format = ls->format; + ND_LOG_FIELD_PRIORITY priority = ls->min_priority; + + if(ls->method == NDLM_SYSLOG || ls->method == NDLM_JOURNAL) + method = ls->method; + else + method = NDLM_STDERR; + + setenv("NETDATA_LOG_METHOD", nd_log_id2method(method), 1); + setenv("NETDATA_LOG_FORMAT", nd_log_id2format(format), 1); + setenv("NETDATA_LOG_LEVEL", nd_log_id2priority(priority), 1); + } +} + +void nd_log_set_priority_level(const char *setting) { + if(!setting || !*setting) + setting = "info"; + + ND_LOG_FIELD_PRIORITY priority = nd_log_priority2id(setting); + +#if defined(NETDATA_INTERNAL_CHECKS) || defined(NETDATA_DEV_MODE) + priority = NDLP_DEBUG; +#endif + + for (size_t i = 0; i < _NDLS_MAX; i++) { + if (i != NDLS_DEBUG) + nd_log.sources[i].min_priority = priority; + } + + // the right one + setenv("NETDATA_LOG_LEVEL", nd_log_id2priority(priority), 1); +} + +void nd_log_set_facility(const char *facility) { + if(!facility || !*facility) + facility = "daemon"; + + nd_log.syslog.facility = nd_log_facility2id(facility); + setenv("NETDATA_SYSLOG_FACILITY", nd_log_id2facility(nd_log.syslog.facility), 1); +} + +void nd_log_set_flood_protection(size_t logs, time_t period) { + nd_log.sources[NDLS_DAEMON].limits.logs_per_period = + nd_log.sources[NDLS_DAEMON].limits.logs_per_period_backup; + nd_log.sources[NDLS_COLLECTORS].limits.logs_per_period = + nd_log.sources[NDLS_COLLECTORS].limits.logs_per_period_backup = logs; + + nd_log.sources[NDLS_DAEMON].limits.throttle_period = + nd_log.sources[NDLS_COLLECTORS].limits.throttle_period = period; + + char buf[100]; + snprintfz(buf, sizeof(buf), "%" PRIu64, (uint64_t )period); + setenv("NETDATA_ERRORS_THROTTLE_PERIOD", buf, 1); + snprintfz(buf, sizeof(buf), "%" PRIu64, (uint64_t )logs); + setenv("NETDATA_ERRORS_PER_PERIOD", buf, 1); +} + +static bool nd_log_journal_systemd_init(void) { +#ifdef HAVE_SYSTEMD + nd_log.journal.initialized = true; +#else + nd_log.journal.initialized = false; +#endif + + return nd_log.journal.initialized; +} + +static void nd_log_journal_direct_set_env(void) { + if(nd_log.sources[NDLS_COLLECTORS].method == NDLM_JOURNAL) + setenv("NETDATA_SYSTEMD_JOURNAL_PATH", 
+static bool nd_log_journal_direct_init(const char *path) {
+    if(nd_log.journal_direct.initialized) {
+        nd_log_journal_direct_set_env();
+        return true;
+    }
+
+    int fd;
+    char filename[FILENAME_MAX + 1];
+    if(!is_path_unix_socket(path)) {
+        // probe the candidate journal sockets, from the most specific to the most generic
+        journal_construct_path(filename, sizeof(filename), netdata_configured_host_prefix, "netdata");
+        if (!is_path_unix_socket(filename) || (fd = journal_direct_fd(filename)) == -1) {
+
+            journal_construct_path(filename, sizeof(filename), netdata_configured_host_prefix, NULL);
+            if (!is_path_unix_socket(filename) || (fd = journal_direct_fd(filename)) == -1) {
+
+                journal_construct_path(filename, sizeof(filename), NULL, "netdata");
+                if (!is_path_unix_socket(filename) || (fd = journal_direct_fd(filename)) == -1) {
+
+                    journal_construct_path(filename, sizeof(filename), NULL, NULL);
+                    if (!is_path_unix_socket(filename) || (fd = journal_direct_fd(filename)) == -1)
+                        return false;
+                }
+            }
+        }
+    }
+    else {
+        snprintfz(filename, sizeof(filename), "%s", path);
+        fd = journal_direct_fd(filename);
+    }
+
+    if(fd < 0)
+        return false;
+
+    nd_log.journal_direct.fd = fd;
+    nd_log.journal_direct.initialized = true;
+
+    strncpyz(nd_log.journal_direct.filename, filename, sizeof(nd_log.journal_direct.filename) - 1);
+    nd_log_journal_direct_set_env();
+
+    return true;
+}
+
+static void nd_log_syslog_init() {
+    if(nd_log.syslog.initialized)
+        return;
+
+    openlog(program_name, LOG_PID, nd_log.syslog.facility);
+    nd_log.syslog.initialized = true;
+}
+
+void nd_log_initialize_for_external_plugins(const char *name) {
+    // if we don't run under Netdata, log to stderr,
+    // otherwise, use the logging method Netdata wants us to use.
+    setenv("NETDATA_LOG_METHOD", "stderr", 0);
+    setenv("NETDATA_LOG_FORMAT", "logfmt", 0);
+
+    nd_log.overwrite_process_source = NDLS_COLLECTORS;
+    program_name = name;
+
+    for(size_t i = 0; i < _NDLS_MAX ;i++) {
+        nd_log.sources[i].method = NDLM_STDERR;
+        nd_log.sources[i].fd = -1;
+        nd_log.sources[i].fp = NULL;
+    }
+
+    nd_log_set_priority_level(getenv("NETDATA_LOG_LEVEL"));
+    nd_log_set_facility(getenv("NETDATA_SYSLOG_FACILITY"));
+
+    time_t period = 1200;
+    size_t logs = 200;
+    const char *s = getenv("NETDATA_ERRORS_THROTTLE_PERIOD");
+    if(s && *s >= '0' && *s <= '9') {
+        period = str2l(s);
+        if(period < 0) period = 0;
+    }
+
+    s = getenv("NETDATA_ERRORS_PER_PERIOD");
+    if(s && *s >= '0' && *s <= '9')
+        logs = str2u(s);
+
+    nd_log_set_flood_protection(logs, period);
+
+    if(!netdata_configured_host_prefix) {
+        s = getenv("NETDATA_HOST_PREFIX");
+        if(s && *s)
+            netdata_configured_host_prefix = (char *)s;
+    }
+
+    ND_LOG_METHOD method = nd_log_method2id(getenv("NETDATA_LOG_METHOD"));
+    ND_LOG_FORMAT format = nd_log_format2id(getenv("NETDATA_LOG_FORMAT"));
+
+    if(!IS_VALID_LOG_METHOD_FOR_EXTERNAL_PLUGINS(method)) {
+        if(is_stderr_connected_to_journal()) {
+            nd_log(NDLS_COLLECTORS, NDLP_WARNING, "NETDATA_LOG_METHOD is not set. Using journal.");
+            method = NDLM_JOURNAL;
+        }
+        else {
+            nd_log(NDLS_COLLECTORS, NDLP_WARNING, "NETDATA_LOG_METHOD is not set. Using stderr.");
+            method = NDLM_STDERR;
+        }
+    }
+
+    switch(method) {
+        case NDLM_JOURNAL:
+            if(!nd_log_journal_direct_init(getenv("NETDATA_SYSTEMD_JOURNAL_PATH")) ||
+               !nd_log_journal_direct_init(NULL) || !nd_log_journal_systemd_init()) {
+                nd_log(NDLS_COLLECTORS, NDLP_WARNING, "Failed to initialize journal. Using stderr.");
+                method = NDLM_STDERR;
+            }
+            break;
+
+        case NDLM_SYSLOG:
+            nd_log_syslog_init();
+            break;
+
+        default:
+            method = NDLM_STDERR;
+            break;
+    }
+
+    for(size_t i = 0; i < _NDLS_MAX ;i++) {
+        nd_log.sources[i].method = method;
+        nd_log.sources[i].format = format;
+        nd_log.sources[i].fd = -1;
+        nd_log.sources[i].fp = NULL;
+    }
+
+//    nd_log(NDLS_COLLECTORS, NDLP_NOTICE, "FINAL_LOG_METHOD: %s", nd_log_id2method(method));
+}
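+// Illustrative sketch, not from the original sources: an external plugin would
+// typically call the initializer above once at startup and then log through the
+// collectors source. The plugin name and message below are hypothetical:
+//
+//     int main(int argc, char **argv) {
+//         nd_log_initialize_for_external_plugins("example.plugin");
+//
+//         nd_log(NDLS_COLLECTORS, NDLP_INFO, "hello from example.plugin");
+//         ...
+//     }
+//
+// When spawned by the agent, NETDATA_LOG_METHOD, NETDATA_LOG_FORMAT and
+// NETDATA_LOG_LEVEL are already set in the environment, so the plugin logs the
+// way the parent daemon expects; otherwise it falls back to logfmt on stderr.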
Using stderr."); + method = NDLM_STDERR; + } + break; + + case NDLM_SYSLOG: + nd_log_syslog_init(); + break; + + default: + method = NDLM_STDERR; + break; + } + + for(size_t i = 0; i < _NDLS_MAX ;i++) { + nd_log.sources[i].method = method; + nd_log.sources[i].format = format; + nd_log.sources[i].fd = -1; + nd_log.sources[i].fp = NULL; + } + +// nd_log(NDLS_COLLECTORS, NDLP_NOTICE, "FINAL_LOG_METHOD: %s", nd_log_id2method(method)); +} + +static bool nd_log_replace_existing_fd(struct nd_log_source *e, int new_fd) { + if(new_fd == -1 || e->fd == -1 || + (e->fd == STDOUT_FILENO && nd_log.std_output.initialized) || + (e->fd == STDERR_FILENO && nd_log.std_error.initialized)) + return false; + + if(new_fd != e->fd) { + int t = dup2(new_fd, e->fd); + + bool ret = true; + if (t == -1) { + netdata_log_error("Cannot dup2() new fd %d to old fd %d for '%s'", new_fd, e->fd, e->filename); + ret = false; + } + else + close(new_fd); + + if(e->fd == STDOUT_FILENO) + nd_log.std_output.initialized = true; + else if(e->fd == STDERR_FILENO) + nd_log.std_error.initialized = true; + + return ret; + } + + return false; +} + +static void nd_log_open(struct nd_log_source *e, ND_LOG_SOURCES source) { + if(e->method == NDLM_DEFAULT) + nd_log_set_user_settings(source, e->filename); + + if((e->method == NDLM_FILE && !e->filename) || + (e->method == NDLM_DEVNULL && e->fd == -1)) + e->method = NDLM_DISABLED; + + if(e->fp) + fflush(e->fp); + + switch(e->method) { + case NDLM_SYSLOG: + nd_log_syslog_init(); + break; + + case NDLM_JOURNAL: + nd_log_journal_direct_init(NULL); + nd_log_journal_systemd_init(); + break; + + case NDLM_STDOUT: + e->fp = stdout; + e->fd = STDOUT_FILENO; + break; + + case NDLM_DISABLED: + break; + + case NDLM_DEFAULT: + case NDLM_STDERR: + e->method = NDLM_STDERR; + e->fp = stderr; + e->fd = STDERR_FILENO; + break; + + case NDLM_DEVNULL: + case NDLM_FILE: { + int fd = open(e->filename, O_WRONLY | O_APPEND | O_CREAT, 0664); + if(fd == -1) { + if(e->fd != STDOUT_FILENO && e->fd != STDERR_FILENO) { + e->fd = STDERR_FILENO; + e->method = NDLM_STDERR; + netdata_log_error("Cannot open log file '%s'. Falling back to stderr.", e->filename); + } + else + netdata_log_error("Cannot open log file '%s'. 
Leaving fd %d as-is.", e->filename, e->fd); + } + else { + if (!nd_log_replace_existing_fd(e, fd)) { + if(e->fd == STDOUT_FILENO || e->fd == STDERR_FILENO) { + if(e->fd == STDOUT_FILENO) + e->method = NDLM_STDOUT; + else if(e->fd == STDERR_FILENO) + e->method = NDLM_STDERR; + + // we have dup2() fd, so we can close the one we opened + if(fd != STDOUT_FILENO && fd != STDERR_FILENO) + close(fd); + } + else + e->fd = fd; + } + } + + // at this point we have e->fd set properly + + if(e->fd == STDOUT_FILENO) + e->fp = stdout; + else if(e->fd == STDERR_FILENO) + e->fp = stderr; + + if(!e->fp) { + e->fp = fdopen(e->fd, "a"); + if (!e->fp) { + netdata_log_error("Cannot fdopen() fd %d ('%s')", e->fd, e->filename); + + if(e->fd != STDOUT_FILENO && e->fd != STDERR_FILENO) + close(e->fd); + + e->fp = stderr; + e->fd = STDERR_FILENO; + } + } + else { + if (setvbuf(e->fp, NULL, _IOLBF, 0) != 0) + netdata_log_error("Cannot set line buffering on fd %d ('%s')", e->fd, e->filename); + } + } + break; + } +} + +static void nd_log_stdin_init(int fd, const char *filename) { + int f = open(filename, O_WRONLY | O_APPEND | O_CREAT, 0664); + if(f == -1) + return; + + if(f != fd) { + dup2(f, fd); + close(f); + } +} + +void nd_log_initialize(void) { + nd_log_stdin_init(STDIN_FILENO, "/dev/null"); + + for(size_t i = 0 ; i < _NDLS_MAX ; i++) + nd_log_open(&nd_log.sources[i], i); +} + +void nd_log_reopen_log_files(void) { + netdata_log_info("Reopening all log files."); + + nd_log.std_output.initialized = false; + nd_log.std_error.initialized = false; + nd_log_initialize(); + + netdata_log_info("Log files re-opened."); +} + +void chown_open_file(int fd, uid_t uid, gid_t gid) { + if(fd == -1) return; + + struct stat buf; + + if(fstat(fd, &buf) == -1) { + netdata_log_error("Cannot fstat() fd %d", fd); + return; + } + + if((buf.st_uid != uid || buf.st_gid != gid) && S_ISREG(buf.st_mode)) { + if(fchown(fd, uid, gid) == -1) + netdata_log_error("Cannot fchown() fd %d.", fd); + } +} + +void nd_log_chown_log_files(uid_t uid, gid_t gid) { + for(size_t i = 0 ; i < _NDLS_MAX ; i++) { + if(nd_log.sources[i].fd != -1 && nd_log.sources[i].fd != STDIN_FILENO) + chown_open_file(nd_log.sources[i].fd, uid, gid); + } +} + +// ---------------------------------------------------------------------------- +// annotators +struct log_field; +static void errno_annotator(BUFFER *wb, const char *key, struct log_field *lf); +static void priority_annotator(BUFFER *wb, const char *key, struct log_field *lf); +static void timestamp_usec_annotator(BUFFER *wb, const char *key, struct log_field *lf); + +// ---------------------------------------------------------------------------- + +typedef void (*annotator_t)(BUFFER *wb, const char *key, struct log_field *lf); + +struct log_field { + const char *journal; + const char *logfmt; + annotator_t logfmt_annotator; + struct log_stack_entry entry; +}; + +#define THREAD_LOG_STACK_MAX 50 + +static __thread struct log_stack_entry *thread_log_stack_base[THREAD_LOG_STACK_MAX]; +static __thread size_t thread_log_stack_next = 0; + +static __thread struct log_field thread_log_fields[_NDF_MAX] = { + // THE ORDER DEFINES THE ORDER FIELDS WILL APPEAR IN logfmt + + [NDF_STOP] = { // processing will not stop on this - so it is ok to be first + .journal = NULL, + .logfmt = NULL, + .logfmt_annotator = NULL, + }, + [NDF_TIMESTAMP_REALTIME_USEC] = { + .journal = NULL, + .logfmt = "time", + .logfmt_annotator = timestamp_usec_annotator, + }, + [NDF_SYSLOG_IDENTIFIER] = { + .journal = "SYSLOG_IDENTIFIER", // standard journald 
field + .logfmt = "comm", + }, + [NDF_LOG_SOURCE] = { + .journal = "ND_LOG_SOURCE", + .logfmt = "source", + }, + [NDF_PRIORITY] = { + .journal = "PRIORITY", // standard journald field + .logfmt = "level", + .logfmt_annotator = priority_annotator, + }, + [NDF_ERRNO] = { + .journal = "ERRNO", // standard journald field + .logfmt = "errno", + .logfmt_annotator = errno_annotator, + }, + [NDF_INVOCATION_ID] = { + .journal = "INVOCATION_ID", // standard journald field + .logfmt = NULL, + }, + [NDF_LINE] = { + .journal = "CODE_LINE", // standard journald field + .logfmt = NULL, + }, + [NDF_FILE] = { + .journal = "CODE_FILE", // standard journald field + .logfmt = NULL, + }, + [NDF_FUNC] = { + .journal = "CODE_FUNC", // standard journald field + .logfmt = NULL, + }, + [NDF_TID] = { + .journal = "TID", // standard journald field + .logfmt = "tid", + }, + [NDF_THREAD_TAG] = { + .journal = "THREAD_TAG", + .logfmt = "thread", + }, + [NDF_MESSAGE_ID] = { + .journal = "MESSAGE_ID", + .logfmt = "msg_id", + }, + [NDF_MODULE] = { + .journal = "ND_MODULE", + .logfmt = "module", + }, + [NDF_NIDL_NODE] = { + .journal = "ND_NIDL_NODE", + .logfmt = "node", + }, + [NDF_NIDL_INSTANCE] = { + .journal = "ND_NIDL_INSTANCE", + .logfmt = "instance", + }, + [NDF_NIDL_CONTEXT] = { + .journal = "ND_NIDL_CONTEXT", + .logfmt = "context", + }, + [NDF_NIDL_DIMENSION] = { + .journal = "ND_NIDL_DIMENSION", + .logfmt = "dimension", + }, + [NDF_SRC_TRANSPORT] = { + .journal = "ND_SRC_TRANSPORT", + .logfmt = "src_transport", + }, + [NDF_ACCOUNT_ID] = { + .journal = "ND_ACCOUNT_ID", + .logfmt = "account", + }, + [NDF_USER_NAME] = { + .journal = "ND_USER_NAME", + .logfmt = "user", + }, + [NDF_USER_ROLE] = { + .journal = "ND_USER_ROLE", + .logfmt = "role", + }, + [NDF_USER_ACCESS] = { + .journal = "ND_USER_PERMISSIONS", + .logfmt = "permissions", + }, + [NDF_SRC_IP] = { + .journal = "ND_SRC_IP", + .logfmt = "src_ip", + }, + [NDF_SRC_FORWARDED_HOST] = { + .journal = "ND_SRC_FORWARDED_HOST", + .logfmt = "src_forwarded_host", + }, + [NDF_SRC_FORWARDED_FOR] = { + .journal = "ND_SRC_FORWARDED_FOR", + .logfmt = "src_forwarded_for", + }, + [NDF_SRC_PORT] = { + .journal = "ND_SRC_PORT", + .logfmt = "src_port", + }, + [NDF_SRC_CAPABILITIES] = { + .journal = "ND_SRC_CAPABILITIES", + .logfmt = "src_capabilities", + }, + [NDF_DST_TRANSPORT] = { + .journal = "ND_DST_TRANSPORT", + .logfmt = "dst_transport", + }, + [NDF_DST_IP] = { + .journal = "ND_DST_IP", + .logfmt = "dst_ip", + }, + [NDF_DST_PORT] = { + .journal = "ND_DST_PORT", + .logfmt = "dst_port", + }, + [NDF_DST_CAPABILITIES] = { + .journal = "ND_DST_CAPABILITIES", + .logfmt = "dst_capabilities", + }, + [NDF_REQUEST_METHOD] = { + .journal = "ND_REQUEST_METHOD", + .logfmt = "req_method", + }, + [NDF_RESPONSE_CODE] = { + .journal = "ND_RESPONSE_CODE", + .logfmt = "code", + }, + [NDF_CONNECTION_ID] = { + .journal = "ND_CONNECTION_ID", + .logfmt = "conn", + }, + [NDF_TRANSACTION_ID] = { + .journal = "ND_TRANSACTION_ID", + .logfmt = "transaction", + }, + [NDF_RESPONSE_SENT_BYTES] = { + .journal = "ND_RESPONSE_SENT_BYTES", + .logfmt = "sent_bytes", + }, + [NDF_RESPONSE_SIZE_BYTES] = { + .journal = "ND_RESPONSE_SIZE_BYTES", + .logfmt = "size_bytes", + }, + [NDF_RESPONSE_PREPARATION_TIME_USEC] = { + .journal = "ND_RESPONSE_PREP_TIME_USEC", + .logfmt = "prep_ut", + }, + [NDF_RESPONSE_SENT_TIME_USEC] = { + .journal = "ND_RESPONSE_SENT_TIME_USEC", + .logfmt = "sent_ut", + }, + [NDF_RESPONSE_TOTAL_TIME_USEC] = { + .journal = "ND_RESPONSE_TOTAL_TIME_USEC", + .logfmt = "total_ut", + }, + [NDF_ALERT_ID] 
= { + .journal = "ND_ALERT_ID", + .logfmt = "alert_id", + }, + [NDF_ALERT_UNIQUE_ID] = { + .journal = "ND_ALERT_UNIQUE_ID", + .logfmt = "alert_unique_id", + }, + [NDF_ALERT_TRANSITION_ID] = { + .journal = "ND_ALERT_TRANSITION_ID", + .logfmt = "alert_transition_id", + }, + [NDF_ALERT_EVENT_ID] = { + .journal = "ND_ALERT_EVENT_ID", + .logfmt = "alert_event_id", + }, + [NDF_ALERT_CONFIG_HASH] = { + .journal = "ND_ALERT_CONFIG", + .logfmt = "alert_config", + }, + [NDF_ALERT_NAME] = { + .journal = "ND_ALERT_NAME", + .logfmt = "alert", + }, + [NDF_ALERT_CLASS] = { + .journal = "ND_ALERT_CLASS", + .logfmt = "alert_class", + }, + [NDF_ALERT_COMPONENT] = { + .journal = "ND_ALERT_COMPONENT", + .logfmt = "alert_component", + }, + [NDF_ALERT_TYPE] = { + .journal = "ND_ALERT_TYPE", + .logfmt = "alert_type", + }, + [NDF_ALERT_EXEC] = { + .journal = "ND_ALERT_EXEC", + .logfmt = "alert_exec", + }, + [NDF_ALERT_RECIPIENT] = { + .journal = "ND_ALERT_RECIPIENT", + .logfmt = "alert_recipient", + }, + [NDF_ALERT_VALUE] = { + .journal = "ND_ALERT_VALUE", + .logfmt = "alert_value", + }, + [NDF_ALERT_VALUE_OLD] = { + .journal = "ND_ALERT_VALUE_OLD", + .logfmt = "alert_value_old", + }, + [NDF_ALERT_STATUS] = { + .journal = "ND_ALERT_STATUS", + .logfmt = "alert_status", + }, + [NDF_ALERT_STATUS_OLD] = { + .journal = "ND_ALERT_STATUS_OLD", + .logfmt = "alert_value_old", + }, + [NDF_ALERT_UNITS] = { + .journal = "ND_ALERT_UNITS", + .logfmt = "alert_units", + }, + [NDF_ALERT_SUMMARY] = { + .journal = "ND_ALERT_SUMMARY", + .logfmt = "alert_summary", + }, + [NDF_ALERT_INFO] = { + .journal = "ND_ALERT_INFO", + .logfmt = "alert_info", + }, + [NDF_ALERT_DURATION] = { + .journal = "ND_ALERT_DURATION", + .logfmt = "alert_duration", + }, + [NDF_ALERT_NOTIFICATION_REALTIME_USEC] = { + .journal = "ND_ALERT_NOTIFICATION_TIMESTAMP_USEC", + .logfmt = "alert_notification_timestamp", + .logfmt_annotator = timestamp_usec_annotator, + }, + + // put new items here + // leave the request URL and the message last + + [NDF_REQUEST] = { + .journal = "ND_REQUEST", + .logfmt = "request", + }, + [NDF_MESSAGE] = { + .journal = "MESSAGE", + .logfmt = "msg", + }, +}; + +#define THREAD_FIELDS_MAX (sizeof(thread_log_fields) / sizeof(thread_log_fields[0])) + +ND_LOG_FIELD_ID nd_log_field_id_by_name(const char *field, size_t len) { + for(size_t i = 0; i < THREAD_FIELDS_MAX ;i++) { + if(thread_log_fields[i].journal && strlen(thread_log_fields[i].journal) == len && strncmp(field, thread_log_fields[i].journal, len) == 0) + return i; + } + + return NDF_STOP; +} + +void log_stack_pop(void *ptr) { + if(!ptr) return; + + struct log_stack_entry *lgs = *(struct log_stack_entry (*)[])ptr; + + if(unlikely(!thread_log_stack_next || lgs != thread_log_stack_base[thread_log_stack_next - 1])) { + fatal("You cannot pop in the middle of the stack, or an item not in the stack"); + return; + } + + thread_log_stack_next--; +} + +void log_stack_push(struct log_stack_entry *lgs) { + if(!lgs || thread_log_stack_next >= THREAD_LOG_STACK_MAX) return; + thread_log_stack_base[thread_log_stack_next++] = lgs; +} + +// ---------------------------------------------------------------------------- +// json formatter + +static void nd_logger_json(BUFFER *wb, struct log_field *fields, size_t fields_max) { + + // --- FIELD_PARSER_VERSIONS --- + // + // IMPORTANT: + // THERE ARE 6 VERSIONS OF THIS CODE + // + // 1. journal (direct socket API), + // 2. journal (libsystemd API), + // 3. logfmt, + // 4. json, + // 5. convert to uint64 + // 6. 
+
+// ----------------------------------------------------------------------------
+// json formatter
+
+static void nd_logger_json(BUFFER *wb, struct log_field *fields, size_t fields_max) {
+
+    // --- FIELD_PARSER_VERSIONS ---
+    //
+    // IMPORTANT:
+    // THERE ARE 6 VERSIONS OF THIS CODE
+    //
+    // 1. journal (direct socket API),
+    // 2. journal (libsystemd API),
+    // 3. logfmt,
+    // 4. json,
+    // 5. convert to uint64
+    // 6. convert to int64
+    //
+    // UPDATE ALL OF THEM FOR NEW FEATURES OR FIXES
+
+    buffer_json_initialize(wb, "\"", "\"", 0, true, BUFFER_JSON_OPTIONS_MINIFY);
+    CLEAN_BUFFER *tmp = NULL;
+
+    for (size_t i = 0; i < fields_max; i++) {
+        if (!fields[i].entry.set || !fields[i].logfmt)
+            continue;
+
+        const char *key = fields[i].logfmt;
+
+        const char *s = NULL;
+        switch(fields[i].entry.type) {
+            case NDFT_TXT:
+                s = fields[i].entry.txt;
+                break;
+            case NDFT_STR:
+                s = string2str(fields[i].entry.str);
+                break;
+            case NDFT_BFR:
+                s = buffer_tostring(fields[i].entry.bfr);
+                break;
+            case NDFT_U64:
+                buffer_json_member_add_uint64(wb, key, fields[i].entry.u64);
+                break;
+            case NDFT_I64:
+                buffer_json_member_add_int64(wb, key, fields[i].entry.i64);
+                break;
+            case NDFT_DBL:
+                buffer_json_member_add_double(wb, key, fields[i].entry.dbl);
+                break;
+            case NDFT_UUID:
+                if(!uuid_is_null(*fields[i].entry.uuid)) {
+                    char u[UUID_COMPACT_STR_LEN];
+                    uuid_unparse_lower_compact(*fields[i].entry.uuid, u);
+                    buffer_json_member_add_string(wb, key, u);
+                }
+                break;
+            case NDFT_CALLBACK: {
+                if(!tmp)
+                    tmp = buffer_create(1024, NULL);
+                else
+                    buffer_flush(tmp);
+                if(fields[i].entry.cb.formatter(tmp, fields[i].entry.cb.formatter_data))
+                    s = buffer_tostring(tmp);
+                else
+                    s = NULL;
+            }
+            break;
+            default:
+                s = "UNHANDLED";
+                break;
+        }
+
+        if(s && *s)
+            buffer_json_member_add_string(wb, key, s);
+    }
+
+    buffer_json_finalize(wb);
+}
+
+// ----------------------------------------------------------------------------
+// logfmt formatter
+
+
+static int64_t log_field_to_int64(struct log_field *lf) {
+
+    // --- FIELD_PARSER_VERSIONS ---
+    //
+    // IMPORTANT:
+    // THERE ARE 6 VERSIONS OF THIS CODE
+    //
+    // 1. journal (direct socket API),
+    // 2. journal (libsystemd API),
+    // 3. logfmt,
+    // 4. json,
+    // 5. convert to uint64
+    // 6. convert to int64
+    //
+    // UPDATE ALL OF THEM FOR NEW FEATURES OR FIXES
+
+    CLEAN_BUFFER *tmp = NULL;
+    const char *s = NULL;
+
+    switch(lf->entry.type) {
+        case NDFT_UUID:
+        case NDFT_UNSET:
+            return 0;
+
+        case NDFT_TXT:
+            s = lf->entry.txt;
+            break;
+
+        case NDFT_STR:
+            s = string2str(lf->entry.str);
+            break;
+
+        case NDFT_BFR:
+            s = buffer_tostring(lf->entry.bfr);
+            break;
+
+        case NDFT_CALLBACK:
+            tmp = buffer_create(0, NULL);
+
+            if(lf->entry.cb.formatter(tmp, lf->entry.cb.formatter_data))
+                s = buffer_tostring(tmp);
+            else
+                s = NULL;
+            break;
+
+        case NDFT_U64:
+            return (int64_t)lf->entry.u64;
+
+        case NDFT_I64:
+            return (int64_t)lf->entry.i64;
+
+        case NDFT_DBL:
+            return (int64_t)lf->entry.dbl;
+    }
+
+    if(s && *s)
+        return str2ll(s, NULL);
+
+    return 0;
+}
+
+static uint64_t log_field_to_uint64(struct log_field *lf) {
+
+    // --- FIELD_PARSER_VERSIONS ---
+    //
+    // IMPORTANT:
+    // THERE ARE 6 VERSIONS OF THIS CODE
+    //
+    // 1. journal (direct socket API),
+    // 2. journal (libsystemd API),
+    // 3. logfmt,
+    // 4. json,
+    // 5. convert to uint64
+    // 6. 
convert to int64 + // + // UPDATE ALL OF THEM FOR NEW FEATURES OR FIXES + + CLEAN_BUFFER *tmp = NULL; + const char *s = NULL; + + switch(lf->entry.type) { + case NDFT_UUID: + case NDFT_UNSET: + return 0; + + case NDFT_TXT: + s = lf->entry.txt; + break; + + case NDFT_STR: + s = string2str(lf->entry.str); + break; + + case NDFT_BFR: + s = buffer_tostring(lf->entry.bfr); + break; + + case NDFT_CALLBACK: + tmp = buffer_create(0, NULL); + + if(lf->entry.cb.formatter(tmp, lf->entry.cb.formatter_data)) + s = buffer_tostring(tmp); + else + s = NULL; + break; + + case NDFT_U64: + return lf->entry.u64; + + case NDFT_I64: + return lf->entry.i64; + + case NDFT_DBL: + return (uint64_t) lf->entry.dbl; + } + + if(s && *s) + return str2uint64_t(s, NULL); + + return 0; +} + +static void timestamp_usec_annotator(BUFFER *wb, const char *key, struct log_field *lf) { + usec_t ut = log_field_to_uint64(lf); + + if(!ut) + return; + + char datetime[RFC3339_MAX_LENGTH]; + rfc3339_datetime_ut(datetime, sizeof(datetime), ut, 3, false); + + if(buffer_strlen(wb)) + buffer_fast_strcat(wb, " ", 1); + + buffer_strcat(wb, key); + buffer_fast_strcat(wb, "=", 1); + buffer_json_strcat(wb, datetime); +} + +static void errno_annotator(BUFFER *wb, const char *key, struct log_field *lf) { + int64_t errnum = log_field_to_int64(lf); + + if(errnum == 0) + return; + + char buf[1024]; + const char *s = errno2str((int)errnum, buf, sizeof(buf)); + + if(buffer_strlen(wb)) + buffer_fast_strcat(wb, " ", 1); + + buffer_strcat(wb, key); + buffer_fast_strcat(wb, "=\"", 2); + buffer_print_int64(wb, errnum); + buffer_fast_strcat(wb, ", ", 2); + buffer_json_strcat(wb, s); + buffer_fast_strcat(wb, "\"", 1); +} + +static void priority_annotator(BUFFER *wb, const char *key, struct log_field *lf) { + uint64_t pri = log_field_to_uint64(lf); + + if(buffer_strlen(wb)) + buffer_fast_strcat(wb, " ", 1); + + buffer_strcat(wb, key); + buffer_fast_strcat(wb, "=", 1); + buffer_strcat(wb, nd_log_id2priority(pri)); +} + +static bool needs_quotes_for_logfmt(const char *s) +{ + static bool safe_for_logfmt[256] = { + [' '] = true, ['!'] = true, ['"'] = false, ['#'] = true, ['$'] = true, ['%'] = true, ['&'] = true, + ['\''] = true, ['('] = true, [')'] = true, ['*'] = true, ['+'] = true, [','] = true, ['-'] = true, + ['.'] = true, ['/'] = true, ['0'] = true, ['1'] = true, ['2'] = true, ['3'] = true, ['4'] = true, + ['5'] = true, ['6'] = true, ['7'] = true, ['8'] = true, ['9'] = true, [':'] = true, [';'] = true, + ['<'] = true, ['='] = true, ['>'] = true, ['?'] = true, ['@'] = true, ['A'] = true, ['B'] = true, + ['C'] = true, ['D'] = true, ['E'] = true, ['F'] = true, ['G'] = true, ['H'] = true, ['I'] = true, + ['J'] = true, ['K'] = true, ['L'] = true, ['M'] = true, ['N'] = true, ['O'] = true, ['P'] = true, + ['Q'] = true, ['R'] = true, ['S'] = true, ['T'] = true, ['U'] = true, ['V'] = true, ['W'] = true, + ['X'] = true, ['Y'] = true, ['Z'] = true, ['['] = true, ['\\'] = false, [']'] = true, ['^'] = true, + ['_'] = true, ['`'] = true, ['a'] = true, ['b'] = true, ['c'] = true, ['d'] = true, ['e'] = true, + ['f'] = true, ['g'] = true, ['h'] = true, ['i'] = true, ['j'] = true, ['k'] = true, ['l'] = true, + ['m'] = true, ['n'] = true, ['o'] = true, ['p'] = true, ['q'] = true, ['r'] = true, ['s'] = true, + ['t'] = true, ['u'] = true, ['v'] = true, ['w'] = true, ['x'] = true, ['y'] = true, ['z'] = true, + ['{'] = true, ['|'] = true, ['}'] = true, ['~'] = true, [0x7f] = true, + }; + + if(!*s) + return true; + + while(*s) { + if(*s == '=' || isspace((uint8_t)*s) || 
!safe_for_logfmt[(uint8_t)*s]) + return true; + + s++; + } + + return false; +} + +static void string_to_logfmt(BUFFER *wb, const char *s) +{ + bool spaces = needs_quotes_for_logfmt(s); + + if(spaces) + buffer_fast_strcat(wb, "\"", 1); + + buffer_json_strcat(wb, s); + + if(spaces) + buffer_fast_strcat(wb, "\"", 1); +} + +static void nd_logger_logfmt(BUFFER *wb, struct log_field *fields, size_t fields_max) +{ + + // --- FIELD_PARSER_VERSIONS --- + // + // IMPORTANT: + // THERE ARE 6 VERSIONS OF THIS CODE + // + // 1. journal (direct socket API), + // 2. journal (libsystemd API), + // 3. logfmt, + // 4. json, + // 5. convert to uint64 + // 6. convert to int64 + // + // UPDATE ALL OF THEM FOR NEW FEATURES OR FIXES + + CLEAN_BUFFER *tmp = NULL; + + for (size_t i = 0; i < fields_max; i++) { + if (!fields[i].entry.set || !fields[i].logfmt) + continue; + + const char *key = fields[i].logfmt; + + if(fields[i].logfmt_annotator) + fields[i].logfmt_annotator(wb, key, &fields[i]); + else { + if(buffer_strlen(wb)) + buffer_fast_strcat(wb, " ", 1); + + switch(fields[i].entry.type) { + case NDFT_TXT: + if(*fields[i].entry.txt) { + buffer_strcat(wb, key); + buffer_fast_strcat(wb, "=", 1); + string_to_logfmt(wb, fields[i].entry.txt); + } + break; + case NDFT_STR: + buffer_strcat(wb, key); + buffer_fast_strcat(wb, "=", 1); + string_to_logfmt(wb, string2str(fields[i].entry.str)); + break; + case NDFT_BFR: + if(buffer_strlen(fields[i].entry.bfr)) { + buffer_strcat(wb, key); + buffer_fast_strcat(wb, "=", 1); + string_to_logfmt(wb, buffer_tostring(fields[i].entry.bfr)); + } + break; + case NDFT_U64: + buffer_strcat(wb, key); + buffer_fast_strcat(wb, "=", 1); + buffer_print_uint64(wb, fields[i].entry.u64); + break; + case NDFT_I64: + buffer_strcat(wb, key); + buffer_fast_strcat(wb, "=", 1); + buffer_print_int64(wb, fields[i].entry.i64); + break; + case NDFT_DBL: + buffer_strcat(wb, key); + buffer_fast_strcat(wb, "=", 1); + buffer_print_netdata_double(wb, fields[i].entry.dbl); + break; + case NDFT_UUID: + if(!uuid_is_null(*fields[i].entry.uuid)) { + char u[UUID_COMPACT_STR_LEN]; + uuid_unparse_lower_compact(*fields[i].entry.uuid, u); + buffer_strcat(wb, key); + buffer_fast_strcat(wb, "=", 1); + buffer_fast_strcat(wb, u, sizeof(u) - 1); + } + break; + case NDFT_CALLBACK: { + if(!tmp) + tmp = buffer_create(1024, NULL); + else + buffer_flush(tmp); + if(fields[i].entry.cb.formatter(tmp, fields[i].entry.cb.formatter_data)) { + buffer_strcat(wb, key); + buffer_fast_strcat(wb, "=", 1); + string_to_logfmt(wb, buffer_tostring(tmp)); + } + } + break; + default: + buffer_strcat(wb, "UNHANDLED"); + break; + } + } + } +} + +// ---------------------------------------------------------------------------- +// journal logger + +bool nd_log_journal_socket_available(void) { + if(netdata_configured_host_prefix && *netdata_configured_host_prefix) { + char filename[FILENAME_MAX + 1]; + + snprintfz(filename, sizeof(filename), "%s%s", + netdata_configured_host_prefix, "/run/systemd/journal/socket"); + + if(is_path_unix_socket(filename)) + return true; + } + + return is_path_unix_socket("/run/systemd/journal/socket"); +} + +static bool nd_logger_journal_libsystemd(struct log_field *fields __maybe_unused, size_t fields_max __maybe_unused) { +#ifdef HAVE_SYSTEMD + + // --- FIELD_PARSER_VERSIONS --- + // + // IMPORTANT: + // THERE ARE 6 VERSIONS OF THIS CODE + // + // 1. journal (direct socket API), + // 2. journal (libsystemd API), + // 3. logfmt, + // 4. json, + // 5. convert to uint64 + // 6. 
convert to int64 + // + // UPDATE ALL OF THEM FOR NEW FEATURES OR FIXES + + struct iovec iov[fields_max]; + int iov_count = 0; + + memset(iov, 0, sizeof(iov)); + + CLEAN_BUFFER *tmp = NULL; + + for (size_t i = 0; i < fields_max; i++) { + if (!fields[i].entry.set || !fields[i].journal) + continue; + + const char *key = fields[i].journal; + char *value = NULL; + int rc = 0; + switch (fields[i].entry.type) { + case NDFT_TXT: + if(*fields[i].entry.txt) + rc = asprintf(&value, "%s=%s", key, fields[i].entry.txt); + break; + case NDFT_STR: + rc = asprintf(&value, "%s=%s", key, string2str(fields[i].entry.str)); + break; + case NDFT_BFR: + if(buffer_strlen(fields[i].entry.bfr)) + rc = asprintf(&value, "%s=%s", key, buffer_tostring(fields[i].entry.bfr)); + break; + case NDFT_U64: + rc = asprintf(&value, "%s=%" PRIu64, key, fields[i].entry.u64); + break; + case NDFT_I64: + rc = asprintf(&value, "%s=%" PRId64, key, fields[i].entry.i64); + break; + case NDFT_DBL: + rc = asprintf(&value, "%s=%f", key, fields[i].entry.dbl); + break; + case NDFT_UUID: + if(!uuid_is_null(*fields[i].entry.uuid)) { + char u[UUID_COMPACT_STR_LEN]; + uuid_unparse_lower_compact(*fields[i].entry.uuid, u); + rc = asprintf(&value, "%s=%s", key, u); + } + break; + case NDFT_CALLBACK: { + if(!tmp) + tmp = buffer_create(1024, NULL); + else + buffer_flush(tmp); + if(fields[i].entry.cb.formatter(tmp, fields[i].entry.cb.formatter_data)) + rc = asprintf(&value, "%s=%s", key, buffer_tostring(tmp)); + } + break; + default: + rc = asprintf(&value, "%s=%s", key, "UNHANDLED"); + break; + } + + if (rc != -1 && value) { + iov[iov_count].iov_base = value; + iov[iov_count].iov_len = strlen(value); + iov_count++; + } + } + + int r = sd_journal_sendv(iov, iov_count); + + // Clean up allocated memory + for (int i = 0; i < iov_count; i++) { + if (iov[i].iov_base != NULL) { + free(iov[i].iov_base); + } + } + + return r == 0; +#else + return false; +#endif +} + +static bool nd_logger_journal_direct(struct log_field *fields, size_t fields_max) { + if(!nd_log.journal_direct.initialized) + return false; + + // --- FIELD_PARSER_VERSIONS --- + // + // IMPORTANT: + // THERE ARE 6 VERSIONS OF THIS CODE + // + // 1. journal (direct socket API), + // 2. journal (libsystemd API), + // 3. logfmt, + // 4. json, + // 5. convert to uint64 + // 6. 
convert to int64 + // + // UPDATE ALL OF THEM FOR NEW FEATURES OR FIXES + + CLEAN_BUFFER *wb = buffer_create(4096, NULL); + CLEAN_BUFFER *tmp = NULL; + + for (size_t i = 0; i < fields_max; i++) { + if (!fields[i].entry.set || !fields[i].journal) + continue; + + const char *key = fields[i].journal; + + const char *s = NULL; + switch(fields[i].entry.type) { + case NDFT_TXT: + s = fields[i].entry.txt; + break; + case NDFT_STR: + s = string2str(fields[i].entry.str); + break; + case NDFT_BFR: + s = buffer_tostring(fields[i].entry.bfr); + break; + case NDFT_U64: + buffer_strcat(wb, key); + buffer_putc(wb, '='); + buffer_print_uint64(wb, fields[i].entry.u64); + buffer_putc(wb, '\n'); + break; + case NDFT_I64: + buffer_strcat(wb, key); + buffer_putc(wb, '='); + buffer_print_int64(wb, fields[i].entry.i64); + buffer_putc(wb, '\n'); + break; + case NDFT_DBL: + buffer_strcat(wb, key); + buffer_putc(wb, '='); + buffer_print_netdata_double(wb, fields[i].entry.dbl); + buffer_putc(wb, '\n'); + break; + case NDFT_UUID: + if(!uuid_is_null(*fields[i].entry.uuid)) { + char u[UUID_COMPACT_STR_LEN]; + uuid_unparse_lower_compact(*fields[i].entry.uuid, u); + buffer_strcat(wb, key); + buffer_putc(wb, '='); + buffer_fast_strcat(wb, u, sizeof(u) - 1); + buffer_putc(wb, '\n'); + } + break; + case NDFT_CALLBACK: { + if(!tmp) + tmp = buffer_create(1024, NULL); + else + buffer_flush(tmp); + if(fields[i].entry.cb.formatter(tmp, fields[i].entry.cb.formatter_data)) + s = buffer_tostring(tmp); + else + s = NULL; + } + break; + default: + s = "UNHANDLED"; + break; + } + + if(s && *s) { + buffer_strcat(wb, key); + if(!strchr(s, '\n')) { + buffer_putc(wb, '='); + buffer_strcat(wb, s); + buffer_putc(wb, '\n'); + } + else { + buffer_putc(wb, '\n'); + size_t size = strlen(s); + uint64_t le_size = htole64(size); + buffer_memcat(wb, &le_size, sizeof(le_size)); + buffer_memcat(wb, s, size); + buffer_putc(wb, '\n'); + } + } + } + + return journal_direct_send(nd_log.journal_direct.fd, buffer_tostring(wb), buffer_strlen(wb)); +} + +// ---------------------------------------------------------------------------- +// syslog logger - uses logfmt + +static bool nd_logger_syslog(int priority, ND_LOG_FORMAT format __maybe_unused, struct log_field *fields, size_t fields_max) { + CLEAN_BUFFER *wb = buffer_create(1024, NULL); + + nd_logger_logfmt(wb, fields, fields_max); + syslog(priority, "%s", buffer_tostring(wb)); + + return true; +} + +// ---------------------------------------------------------------------------- +// file logger - uses logfmt + +static bool nd_logger_file(FILE *fp, ND_LOG_FORMAT format, struct log_field *fields, size_t fields_max) { + BUFFER *wb = buffer_create(1024, NULL); + + if(format == NDLF_JSON) + nd_logger_json(wb, fields, fields_max); + else + nd_logger_logfmt(wb, fields, fields_max); + + int r = fprintf(fp, "%s\n", buffer_tostring(wb)); + fflush(fp); + + buffer_free(wb); + return r > 0; +} + +// ---------------------------------------------------------------------------- +// logger router + +static ND_LOG_METHOD nd_logger_select_output(ND_LOG_SOURCES source, FILE **fpp, SPINLOCK **spinlock) { + *spinlock = NULL; + ND_LOG_METHOD output = nd_log.sources[source].method; + + switch(output) { + case NDLM_JOURNAL: + if(unlikely(!nd_log.journal_direct.initialized && !nd_log.journal.initialized)) { + output = NDLM_FILE; + *fpp = stderr; + *spinlock = &nd_log.std_error.spinlock; + } + else { + *fpp = NULL; + *spinlock = NULL; + } + break; + + case NDLM_SYSLOG: + if(unlikely(!nd_log.syslog.initialized)) { + output = 
NDLM_FILE; + *spinlock = &nd_log.std_error.spinlock; + *fpp = stderr; + } + else { + *spinlock = NULL; + *fpp = NULL; + } + break; + + case NDLM_FILE: + if(!nd_log.sources[source].fp) { + *fpp = stderr; + *spinlock = &nd_log.std_error.spinlock; + } + else { + *fpp = nd_log.sources[source].fp; + *spinlock = &nd_log.sources[source].spinlock; + } + break; + + case NDLM_STDOUT: + output = NDLM_FILE; + *fpp = stdout; + *spinlock = &nd_log.std_output.spinlock; + break; + + default: + case NDLM_DEFAULT: + case NDLM_STDERR: + output = NDLM_FILE; + *fpp = stderr; + *spinlock = &nd_log.std_error.spinlock; + break; + + case NDLM_DISABLED: + case NDLM_DEVNULL: + output = NDLM_DISABLED; + *fpp = NULL; + *spinlock = NULL; + break; + } + + return output; +} + +// ---------------------------------------------------------------------------- +// high level logger + +static void nd_logger_log_fields(SPINLOCK *spinlock, FILE *fp, bool limit, ND_LOG_FIELD_PRIORITY priority, + ND_LOG_METHOD output, struct nd_log_source *source, + struct log_field *fields, size_t fields_max) { + if(spinlock) + spinlock_lock(spinlock); + + // check the limits + if(limit && nd_log_limit_reached(source)) + goto cleanup; + + if(output == NDLM_JOURNAL) { + if(!nd_logger_journal_direct(fields, fields_max) && !nd_logger_journal_libsystemd(fields, fields_max)) { + // we can't log to journal, let's log to stderr + if(spinlock) + spinlock_unlock(spinlock); + + output = NDLM_FILE; + spinlock = &nd_log.std_error.spinlock; + fp = stderr; + + if(spinlock) + spinlock_lock(spinlock); + } + } + + if(output == NDLM_SYSLOG) + nd_logger_syslog(priority, source->format, fields, fields_max); + + if(output == NDLM_FILE) + nd_logger_file(fp, source->format, fields, fields_max); + + +cleanup: + if(spinlock) + spinlock_unlock(spinlock); +} + +static void nd_logger_unset_all_thread_fields(void) { + size_t fields_max = THREAD_FIELDS_MAX; + for(size_t i = 0; i < fields_max ; i++) + thread_log_fields[i].entry.set = false; +} + +static void nd_logger_merge_log_stack_to_thread_fields(void) { + for(size_t c = 0; c < thread_log_stack_next ;c++) { + struct log_stack_entry *lgs = thread_log_stack_base[c]; + + for(size_t i = 0; lgs[i].id != NDF_STOP ; i++) { + if(lgs[i].id >= _NDF_MAX || !lgs[i].set) + continue; + + struct log_stack_entry *e = &lgs[i]; + ND_LOG_STACK_FIELD_TYPE type = lgs[i].type; + + // do not add empty / unset fields + if((type == NDFT_TXT && (!e->txt || !*e->txt)) || + (type == NDFT_BFR && (!e->bfr || !buffer_strlen(e->bfr))) || + (type == NDFT_STR && !e->str) || + (type == NDFT_UUID && (!e->uuid || uuid_is_null(*e->uuid))) || + (type == NDFT_CALLBACK && !e->cb.formatter) || + type == NDFT_UNSET) + continue; + + thread_log_fields[lgs[i].id].entry = *e; + } + } +} + +static void nd_logger(const char *file, const char *function, const unsigned long line, + ND_LOG_SOURCES source, ND_LOG_FIELD_PRIORITY priority, bool limit, int saved_errno, + const char *fmt, va_list ap) { + + SPINLOCK *spinlock; + FILE *fp; + ND_LOG_METHOD output = nd_logger_select_output(source, &fp, &spinlock); + if(output != NDLM_FILE && output != NDLM_JOURNAL && output != NDLM_SYSLOG) + return; + + // mark all fields as unset + nd_logger_unset_all_thread_fields(); + + // flatten the log stack into the fields + nd_logger_merge_log_stack_to_thread_fields(); + + // set the common fields that are automatically set by the logging subsystem + + if(likely(!thread_log_fields[NDF_INVOCATION_ID].entry.set)) + thread_log_fields[NDF_INVOCATION_ID].entry = 
ND_LOG_FIELD_UUID(NDF_INVOCATION_ID, &nd_log.invocation_id); + + if(likely(!thread_log_fields[NDF_LOG_SOURCE].entry.set)) + thread_log_fields[NDF_LOG_SOURCE].entry = ND_LOG_FIELD_TXT(NDF_LOG_SOURCE, nd_log_id2source(source)); + else { + ND_LOG_SOURCES src = source; + + if(thread_log_fields[NDF_LOG_SOURCE].entry.type == NDFT_TXT) + src = nd_log_source2id(thread_log_fields[NDF_LOG_SOURCE].entry.txt, source); + else if(thread_log_fields[NDF_LOG_SOURCE].entry.type == NDFT_U64) + src = thread_log_fields[NDF_LOG_SOURCE].entry.u64; + + if(src != source && src < _NDLS_MAX) { + source = src; + output = nd_logger_select_output(source, &fp, &spinlock); + if(output != NDLM_FILE && output != NDLM_JOURNAL && output != NDLM_SYSLOG) + return; + } + } + + if(likely(!thread_log_fields[NDF_SYSLOG_IDENTIFIER].entry.set)) + thread_log_fields[NDF_SYSLOG_IDENTIFIER].entry = ND_LOG_FIELD_TXT(NDF_SYSLOG_IDENTIFIER, program_name); + + if(likely(!thread_log_fields[NDF_LINE].entry.set)) { + thread_log_fields[NDF_LINE].entry = ND_LOG_FIELD_U64(NDF_LINE, line); + thread_log_fields[NDF_FILE].entry = ND_LOG_FIELD_TXT(NDF_FILE, file); + thread_log_fields[NDF_FUNC].entry = ND_LOG_FIELD_TXT(NDF_FUNC, function); + } + + if(likely(!thread_log_fields[NDF_PRIORITY].entry.set)) { + thread_log_fields[NDF_PRIORITY].entry = ND_LOG_FIELD_U64(NDF_PRIORITY, priority); + } + + if(likely(!thread_log_fields[NDF_TID].entry.set)) + thread_log_fields[NDF_TID].entry = ND_LOG_FIELD_U64(NDF_TID, gettid_cached()); + + if(likely(!thread_log_fields[NDF_THREAD_TAG].entry.set)) { + const char *thread_tag = nd_thread_tag(); + thread_log_fields[NDF_THREAD_TAG].entry = ND_LOG_FIELD_TXT(NDF_THREAD_TAG, thread_tag); + + // TODO: fix the ND_MODULE in logging by setting proper module name in threads +// if(!thread_log_fields[NDF_MODULE].entry.set) +// thread_log_fields[NDF_MODULE].entry = ND_LOG_FIELD_CB(NDF_MODULE, thread_tag_to_module, (void *)thread_tag); + } + + if(likely(!thread_log_fields[NDF_TIMESTAMP_REALTIME_USEC].entry.set)) + thread_log_fields[NDF_TIMESTAMP_REALTIME_USEC].entry = ND_LOG_FIELD_U64(NDF_TIMESTAMP_REALTIME_USEC, now_realtime_usec()); + + if(saved_errno != 0 && !thread_log_fields[NDF_ERRNO].entry.set) + thread_log_fields[NDF_ERRNO].entry = ND_LOG_FIELD_I64(NDF_ERRNO, saved_errno); + + CLEAN_BUFFER *wb = NULL; + if(fmt && !thread_log_fields[NDF_MESSAGE].entry.set) { + wb = buffer_create(1024, NULL); + buffer_vsprintf(wb, fmt, ap); + thread_log_fields[NDF_MESSAGE].entry = ND_LOG_FIELD_TXT(NDF_MESSAGE, buffer_tostring(wb)); + } + + nd_logger_log_fields(spinlock, fp, limit, priority, output, &nd_log.sources[source], + thread_log_fields, THREAD_FIELDS_MAX); + + if(nd_log.sources[source].pending_msg) { + // log a pending message + + nd_logger_unset_all_thread_fields(); + + thread_log_fields[NDF_TIMESTAMP_REALTIME_USEC].entry = (struct log_stack_entry){ + .set = true, + .type = NDFT_U64, + .u64 = now_realtime_usec(), + }; + + thread_log_fields[NDF_LOG_SOURCE].entry = (struct log_stack_entry){ + .set = true, + .type = NDFT_TXT, + .txt = nd_log_id2source(source), + }; + + thread_log_fields[NDF_SYSLOG_IDENTIFIER].entry = (struct log_stack_entry){ + .set = true, + .type = NDFT_TXT, + .txt = program_name, + }; + + thread_log_fields[NDF_MESSAGE].entry = (struct log_stack_entry){ + .set = true, + .type = NDFT_TXT, + .txt = nd_log.sources[source].pending_msg, + }; + + nd_logger_log_fields(spinlock, fp, false, priority, output, + &nd_log.sources[source], + thread_log_fields, THREAD_FIELDS_MAX); + + freez((void *)nd_log.sources[source].pending_msg); 
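+        // the flood-protection notice has been delivered and freed;
+        // clear the pointer so it cannot be logged a second time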
+        nd_log.sources[source].pending_msg = NULL;
+    }
+
+    errno = 0;
+}
+
+static ND_LOG_SOURCES nd_log_validate_source(ND_LOG_SOURCES source) {
+    if(source >= _NDLS_MAX)
+        source = NDLS_DAEMON;
+
+    if(nd_log.overwrite_process_source)
+        source = nd_log.overwrite_process_source;
+
+    return source;
+}
+
+// ----------------------------------------------------------------------------
+// public API for loggers
+
+void netdata_logger(ND_LOG_SOURCES source, ND_LOG_FIELD_PRIORITY priority, const char *file, const char *function, unsigned long line, const char *fmt, ... )
+{
+    int saved_errno = errno;
+    source = nd_log_validate_source(source);
+
+    if (source != NDLS_DEBUG && priority > nd_log.sources[source].min_priority)
+        return;
+
+    va_list args;
+    va_start(args, fmt);
+    nd_logger(file, function, line, source, priority,
+              source == NDLS_DAEMON || source == NDLS_COLLECTORS,
+              saved_errno, fmt, args);
+    va_end(args);
+}
+
+void netdata_logger_with_limit(ERROR_LIMIT *erl, ND_LOG_SOURCES source, ND_LOG_FIELD_PRIORITY priority, const char *file __maybe_unused, const char *function __maybe_unused, const unsigned long line __maybe_unused, const char *fmt, ... ) {
+    int saved_errno = errno;
+    source = nd_log_validate_source(source);
+
+    if (source != NDLS_DEBUG && priority > nd_log.sources[source].min_priority)
+        return;
+
+    if(erl->sleep_ut)
+        sleep_usec(erl->sleep_ut);
+
+    spinlock_lock(&erl->spinlock);
+
+    erl->count++;
+    time_t now = now_boottime_sec();
+    if(now - erl->last_logged < erl->log_every) {
+        spinlock_unlock(&erl->spinlock);
+        return;
+    }
+
+    // update the limit state while still holding the lock,
+    // so that concurrent callers do not race on these fields
+    erl->last_logged = now;
+    erl->count = 0;
+
+    spinlock_unlock(&erl->spinlock);
+
+    va_list args;
+    va_start(args, fmt);
+    nd_logger(file, function, line, source, priority,
+              source == NDLS_DAEMON || source == NDLS_COLLECTORS,
+              saved_errno, fmt, args);
+    va_end(args);
+}
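+// Illustrative sketch, not from the original sources: rate-limited logging from
+// a hot code path uses the ERROR_LIMIT helpers declared in log.h. The variable
+// name and message are hypothetical:
+//
+//     nd_log_limit_static_global_var(erl, 10, 0); // at most one entry per 10 seconds
+//     nd_log_limit(&erl, NDLS_DAEMON, NDLP_ERR, "database write failed");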
+void netdata_logger_fatal( const char *file, const char *function, const unsigned long line, const char *fmt, ... ) {
+    int saved_errno = errno;
+    ND_LOG_SOURCES source = NDLS_DAEMON;
+    source = nd_log_validate_source(source);
+
+    va_list args;
+    va_start(args, fmt);
+    nd_logger(file, function, line, source, NDLP_ALERT, true, saved_errno, fmt, args);
+    va_end(args);
+
+    char date[LOG_DATE_LENGTH];
+    log_date(date, LOG_DATE_LENGTH, now_realtime_sec());
+
+    char action_data[70+1];
+    snprintfz(action_data, 70, "%04lu@%-10.10s:%-15.15s/%d", line, file, function, saved_errno);
+
+    const char *thread_tag = nd_thread_tag();
+    const char *tag_to_send = thread_tag;
+
+    // anonymize thread names
+    if(strncmp(thread_tag, THREAD_TAG_STREAM_RECEIVER, strlen(THREAD_TAG_STREAM_RECEIVER)) == 0)
+        tag_to_send = THREAD_TAG_STREAM_RECEIVER;
+    if(strncmp(thread_tag, THREAD_TAG_STREAM_SENDER, strlen(THREAD_TAG_STREAM_SENDER)) == 0)
+        tag_to_send = THREAD_TAG_STREAM_SENDER;
+
+    char action_result[60+1];
+    snprintfz(action_result, 60, "%s:%s", program_name, tag_to_send);
+
+#if !defined(ENABLE_SENTRY) && defined(HAVE_BACKTRACE)
+    int fd = nd_log.sources[NDLS_DAEMON].fd;
+    if(fd == -1)
+        fd = STDERR_FILENO;
+
+    int nptrs;
+    void *buffer[10000];
+
+    // backtrace() takes the capacity in number of entries, not in bytes
+    nptrs = backtrace(buffer, sizeof(buffer) / sizeof(void *));
+    if(nptrs)
+        backtrace_symbols_fd(buffer, nptrs, fd);
+#endif
+
+#ifdef NETDATA_INTERNAL_CHECKS
+    abort();
+#endif
+
+    netdata_cleanup_and_exit(1, "FATAL", action_result, action_data);
+}
+
+// ----------------------------------------------------------------------------
+// log limits
+
+void nd_log_limits_reset(void) {
+    usec_t now_ut = now_monotonic_usec();
+
+    spinlock_lock(&nd_log.std_output.spinlock);
+    spinlock_lock(&nd_log.std_error.spinlock);
+
+    for(size_t i = 0; i < _NDLS_MAX ;i++) {
+        spinlock_lock(&nd_log.sources[i].spinlock);
+        nd_log.sources[i].limits.prevented = 0;
+        nd_log.sources[i].limits.counter = 0;
+        nd_log.sources[i].limits.started_monotonic_ut = now_ut;
+        nd_log.sources[i].limits.logs_per_period = nd_log.sources[i].limits.logs_per_period_backup;
+        spinlock_unlock(&nd_log.sources[i].spinlock);
+    }
+
+    spinlock_unlock(&nd_log.std_output.spinlock);
+    spinlock_unlock(&nd_log.std_error.spinlock);
+}
+
+void nd_log_limits_unlimited(void) {
+    nd_log_limits_reset();
+    for(size_t i = 0; i < _NDLS_MAX ;i++) {
+        nd_log.sources[i].limits.logs_per_period = 0;
+    }
+}
+
+static bool nd_log_limit_reached(struct nd_log_source *source) {
+    if(source->limits.throttle_period == 0 || source->limits.logs_per_period == 0)
+        return false;
+
+    usec_t now_ut = now_monotonic_usec();
+    if(!source->limits.started_monotonic_ut)
+        source->limits.started_monotonic_ut = now_ut;
+
+    source->limits.counter++;
+
+    // the throttle period is expressed in seconds, the monotonic clock in microseconds
+    if(now_ut - source->limits.started_monotonic_ut > (usec_t)source->limits.throttle_period * USEC_PER_SEC) {
+        if(source->limits.prevented) {
+            BUFFER *wb = buffer_create(1024, NULL);
+            buffer_sprintf(wb,
+                           "LOG FLOOD PROTECTION: resuming logging "
+                           "(prevented %"PRIu32" logs in the last %"PRIu32" seconds).",
+                           source->limits.prevented,
+                           source->limits.throttle_period);
+
+            if(source->pending_msg)
+                freez((void *)source->pending_msg);
+
+            source->pending_msg = strdupz(buffer_tostring(wb));
+
+            buffer_free(wb);
+        }
+
+        // restart the period accounting
+        source->limits.started_monotonic_ut = now_ut;
+        source->limits.counter = 1;
+        source->limits.prevented = 0;
+
+        // log this error
+        return false;
+    }
+
+    if(source->limits.counter > source->limits.logs_per_period) {
+        if(!source->limits.prevented) {
+            BUFFER *wb = buffer_create(1024, NULL);
+            buffer_sprintf(wb,
+                           "LOG FLOOD PROTECTION: too many logs (%"PRIu32" logs in %"PRId64" seconds, threshold is set to 
%"PRIu32" logs " + "in %"PRIu32" seconds). Preventing more logs from process '%s' for %"PRId64" seconds.", + source->limits.counter, + (int64_t)((now_ut - source->limits.started_monotonic_ut) / USEC_PER_SEC), + source->limits.logs_per_period, + source->limits.throttle_period, + program_name, + (int64_t)(((source->limits.started_monotonic_ut + (source->limits.throttle_period * USEC_PER_SEC) - now_ut)) / USEC_PER_SEC) + ); + + if(source->pending_msg) + freez((void *)source->pending_msg); + + source->pending_msg = strdupz(buffer_tostring(wb)); + + buffer_free(wb); + } + + source->limits.prevented++; + + // prevent logging this error +#ifdef NETDATA_INTERNAL_CHECKS + return false; +#else + return true; +#endif + } + + return false; +} diff --git a/src/libnetdata/log/log.h b/src/libnetdata/log/log.h new file mode 100644 index 00000000..338a5d53 --- /dev/null +++ b/src/libnetdata/log/log.h @@ -0,0 +1,307 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_LOG_H +#define NETDATA_LOG_H 1 + +# ifdef __cplusplus +extern "C" { +# endif + +#include "../libnetdata.h" + +#define ND_LOG_DEFAULT_THROTTLE_LOGS 1000 +#define ND_LOG_DEFAULT_THROTTLE_PERIOD 60 + +typedef enum __attribute__((__packed__)) { + NDLS_UNSET = 0, // internal use only + NDLS_ACCESS, // access.log + NDLS_ACLK, // aclk.log + NDLS_COLLECTORS, // collectors.log + NDLS_DAEMON, // error.log + NDLS_HEALTH, // health.log + NDLS_DEBUG, // debug.log + + // terminator + _NDLS_MAX, +} ND_LOG_SOURCES; + +typedef enum __attribute__((__packed__)) { + NDLP_EMERG = LOG_EMERG, + NDLP_ALERT = LOG_ALERT, + NDLP_CRIT = LOG_CRIT, + NDLP_ERR = LOG_ERR, + NDLP_WARNING = LOG_WARNING, + NDLP_NOTICE = LOG_NOTICE, + NDLP_INFO = LOG_INFO, + NDLP_DEBUG = LOG_DEBUG, +} ND_LOG_FIELD_PRIORITY; + +typedef enum __attribute__((__packed__)) { + // KEEP THESE IN THE SAME ORDER AS in thread_log_fields (log.c) + // so that it easy to audit for missing fields + + NDF_STOP = 0, + NDF_TIMESTAMP_REALTIME_USEC, // the timestamp of the log message - added automatically + NDF_SYSLOG_IDENTIFIER, // the syslog identifier of the application - added automatically + NDF_LOG_SOURCE, // DAEMON, COLLECTORS, HEALTH, ACCESS, ACLK - set at the log call + NDF_PRIORITY, // the syslog priority (severity) - set at the log call + NDF_ERRNO, // the ERRNO at the time of the log call - added automatically + NDF_INVOCATION_ID, // the INVOCATION_ID of Netdata - added automatically + NDF_LINE, // the source code file line number - added automatically + NDF_FILE, // the source code filename - added automatically + NDF_FUNC, // the source code function - added automatically + NDF_TID, // the thread ID of the thread logging - added automatically + NDF_THREAD_TAG, // the thread tag of the thread logging - added automatically + NDF_MESSAGE_ID, // for specific events + NDF_MODULE, // for internal plugin module, all other get the NDF_THREAD_TAG + + NDF_NIDL_NODE, // the node / rrdhost currently being worked + NDF_NIDL_INSTANCE, // the instance / rrdset currently being worked + NDF_NIDL_CONTEXT, // the context of the instance currently being worked + NDF_NIDL_DIMENSION, // the dimension / rrddim currently being worked + + // web server, aclk and stream receiver + NDF_SRC_TRANSPORT, // the transport we received the request, one of: http, https, pluginsd + + // Netdata Cloud Related + NDF_ACCOUNT_ID, + NDF_USER_NAME, + NDF_USER_ROLE, + NDF_USER_ACCESS, + + // web server and stream receiver + NDF_SRC_IP, // the streaming / web server source IP + NDF_SRC_PORT, // the streaming / web server source 
Port + NDF_SRC_FORWARDED_HOST, + NDF_SRC_FORWARDED_FOR, + NDF_SRC_CAPABILITIES, // the stream receiver capabilities + + // stream sender (established links) + NDF_DST_TRANSPORT, // the transport we send the request, one of: http, https + NDF_DST_IP, // the destination streaming IP + NDF_DST_PORT, // the destination streaming Port + NDF_DST_CAPABILITIES, // the destination streaming capabilities + + // web server, aclk and stream receiver + NDF_REQUEST_METHOD, // for http like requests, the http request method + NDF_RESPONSE_CODE, // for http like requests, the http response code, otherwise a status string + + // web server (all), aclk (queries) + NDF_CONNECTION_ID, // the web server connection ID + NDF_TRANSACTION_ID, // the web server and API transaction ID + NDF_RESPONSE_SENT_BYTES, // for http like requests, the response bytes + NDF_RESPONSE_SIZE_BYTES, // for http like requests, the uncompressed response size + NDF_RESPONSE_PREPARATION_TIME_USEC, // for http like requests, the preparation time + NDF_RESPONSE_SENT_TIME_USEC, // for http like requests, the time to send the response back + NDF_RESPONSE_TOTAL_TIME_USEC, // for http like requests, the total time to complete the response + + // health alerts + NDF_ALERT_ID, + NDF_ALERT_UNIQUE_ID, + NDF_ALERT_EVENT_ID, + NDF_ALERT_TRANSITION_ID, + NDF_ALERT_CONFIG_HASH, + NDF_ALERT_NAME, + NDF_ALERT_CLASS, + NDF_ALERT_COMPONENT, + NDF_ALERT_TYPE, + NDF_ALERT_EXEC, + NDF_ALERT_RECIPIENT, + NDF_ALERT_DURATION, + NDF_ALERT_VALUE, + NDF_ALERT_VALUE_OLD, + NDF_ALERT_STATUS, + NDF_ALERT_STATUS_OLD, + NDF_ALERT_SOURCE, + NDF_ALERT_UNITS, + NDF_ALERT_SUMMARY, + NDF_ALERT_INFO, + NDF_ALERT_NOTIFICATION_REALTIME_USEC, + // NDF_ALERT_FLAGS, + + // put new items here + // leave the request URL and the message last + + NDF_REQUEST, // the request we are currently working on + NDF_MESSAGE, // the log message, if any + + // terminator + _NDF_MAX, +} ND_LOG_FIELD_ID; + +typedef enum __attribute__((__packed__)) { + NDFT_UNSET = 0, + NDFT_TXT, + NDFT_STR, + NDFT_BFR, + NDFT_U64, + NDFT_I64, + NDFT_DBL, + NDFT_UUID, + NDFT_CALLBACK, +} ND_LOG_STACK_FIELD_TYPE; + +void nd_log_set_user_settings(ND_LOG_SOURCES source, const char *setting); +void nd_log_set_facility(const char *facility); +void nd_log_set_priority_level(const char *setting); +void nd_log_initialize(void); +void nd_log_reopen_log_files(void); +void chown_open_file(int fd, uid_t uid, gid_t gid); +void nd_log_chown_log_files(uid_t uid, gid_t gid); +void nd_log_set_flood_protection(size_t logs, time_t period); +void nd_log_initialize_for_external_plugins(const char *name); +bool nd_log_journal_socket_available(void); +ND_LOG_FIELD_ID nd_log_field_id_by_name(const char *field, size_t len); +int nd_log_priority2id(const char *priority); +const char *nd_log_id2priority(ND_LOG_FIELD_PRIORITY priority); +const char *nd_log_method_for_external_plugins(const char *s); + +int nd_log_health_fd(void); +typedef bool (*log_formatter_callback_t)(BUFFER *wb, void *data); + +struct log_stack_entry { + ND_LOG_FIELD_ID id; + ND_LOG_STACK_FIELD_TYPE type; + bool set; + union { + const char *txt; + struct netdata_string *str; + BUFFER *bfr; + uint64_t u64; + int64_t i64; + double dbl; + const nd_uuid_t *uuid; + struct { + log_formatter_callback_t formatter; + void *formatter_data; + } cb; + }; +}; + +#define ND_LOG_STACK _cleanup_(log_stack_pop) struct log_stack_entry +#define ND_LOG_STACK_PUSH(lgs) log_stack_push(lgs) + +#define ND_LOG_FIELD_TXT(field, value) (struct log_stack_entry){ .id = (field), .type = NDFT_TXT, 
.txt = (value), .set = true, } +#define ND_LOG_FIELD_STR(field, value) (struct log_stack_entry){ .id = (field), .type = NDFT_STR, .str = (value), .set = true, } +#define ND_LOG_FIELD_BFR(field, value) (struct log_stack_entry){ .id = (field), .type = NDFT_BFR, .bfr = (value), .set = true, } +#define ND_LOG_FIELD_U64(field, value) (struct log_stack_entry){ .id = (field), .type = NDFT_U64, .u64 = (value), .set = true, } +#define ND_LOG_FIELD_I64(field, value) (struct log_stack_entry){ .id = (field), .type = NDFT_I64, .i64 = (value), .set = true, } +#define ND_LOG_FIELD_DBL(field, value) (struct log_stack_entry){ .id = (field), .type = NDFT_DBL, .dbl = (value), .set = true, } +#define ND_LOG_FIELD_CB(field, func, data) (struct log_stack_entry){ .id = (field), .type = NDFT_CALLBACK, .cb = { .formatter = (func), .formatter_data = (data) }, .set = true, } +#define ND_LOG_FIELD_UUID(field, value) (struct log_stack_entry){ .id = (field), .type = NDFT_UUID, .uuid = (value), .set = true, } +#define ND_LOG_FIELD_END() (struct log_stack_entry){ .id = NDF_STOP, .type = NDFT_UNSET, .set = false, } + +void log_stack_pop(void *ptr); +void log_stack_push(struct log_stack_entry *lgs); + +#define D_WEB_BUFFER 0x0000000000000001 +#define D_WEB_CLIENT 0x0000000000000002 +#define D_LISTENER 0x0000000000000004 +#define D_WEB_DATA 0x0000000000000008 +#define D_OPTIONS 0x0000000000000010 +#define D_PROCNETDEV_LOOP 0x0000000000000020 +#define D_RRD_STATS 0x0000000000000040 +#define D_WEB_CLIENT_ACCESS 0x0000000000000080 +#define D_TC_LOOP 0x0000000000000100 +#define D_DEFLATE 0x0000000000000200 +#define D_CONFIG 0x0000000000000400 +#define D_PLUGINSD 0x0000000000000800 +#define D_CHILDS 0x0000000000001000 +#define D_EXIT 0x0000000000002000 +#define D_CHECKS 0x0000000000004000 +#define D_NFACCT_LOOP 0x0000000000008000 +#define D_PROCFILE 0x0000000000010000 +#define D_RRD_CALLS 0x0000000000020000 +#define D_DICTIONARY 0x0000000000040000 +#define D_MEMORY 0x0000000000080000 +#define D_CGROUP 0x0000000000100000 +#define D_REGISTRY 0x0000000000200000 +#define D_VARIABLES 0x0000000000400000 +#define D_HEALTH 0x0000000000800000 +#define D_CONNECT_TO 0x0000000001000000 +#define D_RRDHOST 0x0000000002000000 +#define D_LOCKS 0x0000000004000000 +#define D_EXPORTING 0x0000000008000000 +#define D_STATSD 0x0000000010000000 +#define D_POLLFD 0x0000000020000000 +#define D_STREAM 0x0000000040000000 +#define D_ANALYTICS 0x0000000080000000 +#define D_RRDENGINE 0x0000000100000000 +#define D_ACLK 0x0000000200000000 +#define D_REPLICATION 0x0000002000000000 +#define D_SYSTEM 0x8000000000000000 + +extern uint64_t debug_flags; + +extern const char *program_name; + +#ifdef ENABLE_ACLK +extern int aclklog_enabled; +#endif + +#define LOG_DATE_LENGTH 26 +void log_date(char *buffer, size_t len, time_t now); + +static inline void debug_dummy(void) {} + +void nd_log_limits_reset(void); +void nd_log_limits_unlimited(void); + +#define NDLP_INFO_STR "info" + +#ifdef NETDATA_INTERNAL_CHECKS +#define netdata_log_debug(type, args...) do { if(unlikely(debug_flags & type)) netdata_logger(NDLS_DEBUG, NDLP_DEBUG, __FILE__, __FUNCTION__, __LINE__, ##args); } while(0) +#define internal_error(condition, args...) do { if(unlikely(condition)) netdata_logger(NDLS_DAEMON, NDLP_DEBUG, __FILE__, __FUNCTION__, __LINE__, ##args); } while(0) +#define internal_fatal(condition, args...) do { if(unlikely(condition)) netdata_logger_fatal(__FILE__, __FUNCTION__, __LINE__, ##args); } while(0) +#else +#define netdata_log_debug(type, args...) 
debug_dummy() +#define internal_error(args...) debug_dummy() +#define internal_fatal(args...) debug_dummy() +#endif + +#define fatal(args...) netdata_logger_fatal(__FILE__, __FUNCTION__, __LINE__, ##args) +#define fatal_assert(expr) ((expr) ? (void)(0) : netdata_logger_fatal(__FILE__, __FUNCTION__, __LINE__, "Assertion `%s' failed", #expr)) + +// ---------------------------------------------------------------------------- +// normal logging + +void netdata_logger(ND_LOG_SOURCES source, ND_LOG_FIELD_PRIORITY priority, const char *file, const char *function, unsigned long line, const char *fmt, ... ) PRINTFLIKE(6, 7); +#define nd_log(NDLS, NDLP, args...) netdata_logger(NDLS, NDLP, __FILE__, __FUNCTION__, __LINE__, ##args) +#define nd_log_daemon(NDLP, args...) netdata_logger(NDLS_DAEMON, NDLP, __FILE__, __FUNCTION__, __LINE__, ##args) +#define nd_log_collector(NDLP, args...) netdata_logger(NDLS_COLLECTORS, NDLP, __FILE__, __FUNCTION__, __LINE__, ##args) + +#define netdata_log_info(args...) netdata_logger(NDLS_DAEMON, NDLP_INFO, __FILE__, __FUNCTION__, __LINE__, ##args) +#define netdata_log_error(args...) netdata_logger(NDLS_DAEMON, NDLP_ERR, __FILE__, __FUNCTION__, __LINE__, ##args) +#define collector_info(args...) netdata_logger(NDLS_COLLECTORS, NDLP_INFO, __FILE__, __FUNCTION__, __LINE__, ##args) +#define collector_error(args...) netdata_logger(NDLS_COLLECTORS, NDLP_ERR, __FILE__, __FUNCTION__, __LINE__, ##args) + +#define log_aclk_message_bin(__data, __data_len, __tx, __mqtt_topic, __message_name) \ + nd_log(NDLS_ACLK, NDLP_INFO, \ + "direction:%s message:'%s' topic:'%s' json:'%.*s'", \ + (__tx) ? "OUTGOING" : "INCOMING", __message_name, __mqtt_topic, (int)(__data_len), __data) + +// ---------------------------------------------------------------------------- +// logging with limits + +typedef struct error_with_limit { + SPINLOCK spinlock; + time_t log_every; + size_t count; + time_t last_logged; + usec_t sleep_ut; +} ERROR_LIMIT; + +#define nd_log_limit_static_global_var(var, log_every_secs, sleep_usecs) static ERROR_LIMIT var = { .last_logged = 0, .count = 0, .log_every = (log_every_secs), .sleep_ut = (sleep_usecs) } +#define nd_log_limit_static_thread_var(var, log_every_secs, sleep_usecs) static __thread ERROR_LIMIT var = { .last_logged = 0, .count = 0, .log_every = (log_every_secs), .sleep_ut = (sleep_usecs) } +void netdata_logger_with_limit(ERROR_LIMIT *erl, ND_LOG_SOURCES source, ND_LOG_FIELD_PRIORITY priority, const char *file, const char *function, unsigned long line, const char *fmt, ... ) PRINTFLIKE(7, 8); +#define nd_log_limit(erl, NDLS, NDLP, args...) netdata_logger_with_limit(erl, NDLS, NDLP, __FILE__, __FUNCTION__, __LINE__, ##args) + +// ---------------------------------------------------------------------------- + +void netdata_logger_fatal( const char *file, const char *function, unsigned long line, const char *fmt, ... 
) NORETURN PRINTFLIKE(4, 5); + +# ifdef __cplusplus +} +# endif + +#endif /* NETDATA_LOG_H */ diff --git a/src/libnetdata/log/systemd-cat-native.c b/src/libnetdata/log/systemd-cat-native.c new file mode 100644 index 00000000..74d3728a --- /dev/null +++ b/src/libnetdata/log/systemd-cat-native.c @@ -0,0 +1,820 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "systemd-cat-native.h" +#include "../required_dummies.h" + +#ifdef __FreeBSD__ +#include <sys/endian.h> +#endif + +#ifdef __APPLE__ +#include <machine/endian.h> +#endif + +static inline void log_message_to_stderr(BUFFER *msg) { + CLEAN_BUFFER *tmp = buffer_create(0, NULL); + + for(size_t i = 0; i < msg->len ;i++) { + if(isprint(msg->buffer[i])) + buffer_putc(tmp, msg->buffer[i]); + else { + buffer_putc(tmp, '['); + buffer_print_uint64_hex(tmp, msg->buffer[i]); + buffer_putc(tmp, ']'); + } + } + + fprintf(stderr, "SENDING: %s\n", buffer_tostring(tmp)); +} + +static inline buffered_reader_ret_t get_next_line(struct buffered_reader *reader, BUFFER *line, int timeout_ms) { + while(true) { + if(unlikely(!buffered_reader_next_line(reader, line))) { + buffered_reader_ret_t ret = buffered_reader_read_timeout(reader, STDIN_FILENO, timeout_ms, false); + if(unlikely(ret != BUFFERED_READER_READ_OK)) + return ret; + + continue; + } + else { + // make sure the buffer is NULL terminated + line->buffer[line->len] = '\0'; + + // remove the trailing newlines + while(line->len && line->buffer[line->len - 1] == '\n') + line->buffer[--line->len] = '\0'; + + return BUFFERED_READER_READ_OK; + } + } +} + +static inline size_t copy_replacing_newlines(char *dst, size_t dst_len, const char *src, size_t src_len, const char *newline) { + if (!dst || !src) return 0; + + const char *current_src = src; + const char *src_end = src + src_len; // Pointer to the end of src + char *current_dst = dst; + size_t remaining_dst_len = dst_len; + size_t newline_len = newline && *newline ? 
strlen(newline) : 0; + + size_t bytes_copied = 0; // To track the number of bytes copied + + while (remaining_dst_len > 1 && current_src < src_end) { + if (newline_len > 0) { + const char *found = strstr(current_src, newline); + if (found && found < src_end) { + size_t copy_len = found - current_src; + if (copy_len >= remaining_dst_len) copy_len = remaining_dst_len - 1; + + memcpy(current_dst, current_src, copy_len); + current_dst += copy_len; + *current_dst++ = '\n'; + remaining_dst_len -= (copy_len + 1); + bytes_copied += copy_len + 1; // +1 for the newline character + current_src = found + newline_len; + continue; + } + } + + // Copy the remaining part of src to dst + size_t copy_len = src_end - current_src; + if (copy_len >= remaining_dst_len) copy_len = remaining_dst_len - 1; + + memcpy(current_dst, current_src, copy_len); + current_dst += copy_len; + remaining_dst_len -= copy_len; + bytes_copied += copy_len; + break; + } + + // Ensure the string is null-terminated + *current_dst = '\0'; + + return bytes_copied; +} + +static inline void buffer_memcat_replacing_newlines(BUFFER *wb, const char *src, size_t src_len, const char *newline) { + if(!src) return; + + const char *equal; + if(!newline || !*newline || !strstr(src, newline) || !(equal = strchr(src, '='))) { + buffer_memcat(wb, src, src_len); + buffer_putc(wb, '\n'); + return; + } + + size_t key_len = equal - src; + buffer_memcat(wb, src, key_len); + buffer_putc(wb, '\n'); + + // keep the offset of the length placeholder, not a pointer - the buffer may be reallocated below + size_t length_offset = wb->len; + uint64_t le_size = 0; + buffer_memcat(wb, &le_size, sizeof(le_size)); + + const char *value = ++equal; + size_t value_len = src_len - key_len - 1; + buffer_need_bytes(wb, value_len + 1); + size_t size = copy_replacing_newlines(&wb->buffer[wb->len], value_len + 1, value, value_len, newline); + wb->len += size; + buffer_putc(wb, '\n'); + + le_size = htole64(size); + memcpy(&wb->buffer[length_offset], &le_size, sizeof(le_size)); +} + +// ---------------------------------------------------------------------------- +// log to a systemd-journal-remote + +#ifdef HAVE_CURL +#include <curl/curl.h> + +#ifndef HOST_NAME_MAX +#define HOST_NAME_MAX 256 +#endif + +char global_hostname[HOST_NAME_MAX] = ""; +char global_boot_id[UUID_COMPACT_STR_LEN] = ""; +char global_machine_id[UUID_COMPACT_STR_LEN] = ""; +char global_stream_id[UUID_COMPACT_STR_LEN] = ""; +char global_namespace[1024] = ""; +char global_systemd_invocation_id[1024] = ""; +#define BOOT_ID_PATH "/proc/sys/kernel/random/boot_id" +#define MACHINE_ID_PATH "/etc/machine-id" + +#define DEFAULT_PRIVATE_KEY "/etc/ssl/private/journal-upload.pem" +#define DEFAULT_PUBLIC_KEY "/etc/ssl/certs/journal-upload.pem" +#define DEFAULT_CA_CERT "/etc/ssl/ca/trusted.pem" + +struct upload_data { + char *data; + size_t length; +}; + +static size_t systemd_journal_remote_read_callback(void *ptr, size_t size, size_t nmemb, void *userp) { + struct upload_data *upload = (struct upload_data *)userp; + size_t buffer_size = size * nmemb; + + if (upload->length) { + size_t copy_size = upload->length < buffer_size ? 
upload->length : buffer_size; + memcpy(ptr, upload->data, copy_size); + upload->data += copy_size; + upload->length -= copy_size; + return copy_size; + } + + return 0; +} + +CURL* initialize_connection_to_systemd_journal_remote(const char* url, const char* private_key, const char* public_key, const char* ca_cert, struct curl_slist **headers) { + CURL *curl = curl_easy_init(); + if (!curl) { + fprintf(stderr, "Failed to initialize curl\n"); + return NULL; + } + + *headers = curl_slist_append(*headers, "Content-Type: application/vnd.fdo.journal"); + *headers = curl_slist_append(*headers, "Transfer-Encoding: chunked"); + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, *headers); + curl_easy_setopt(curl, CURLOPT_URL, url); + curl_easy_setopt(curl, CURLOPT_POST, 1L); + curl_easy_setopt(curl, CURLOPT_READFUNCTION, systemd_journal_remote_read_callback); + + if (strncmp(url, "https://", 8) == 0) { + if (private_key) curl_easy_setopt(curl, CURLOPT_SSLKEY, private_key); + if (public_key) curl_easy_setopt(curl, CURLOPT_SSLCERT, public_key); + + if (strcmp(ca_cert, "all") != 0) { + curl_easy_setopt(curl, CURLOPT_CAINFO, ca_cert); + } else { + curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L); + } + } + // curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L); // Remove for less verbose output + + return curl; +} + +static void journal_remote_complete_event(BUFFER *msg, usec_t *monotonic_ut) { + usec_t ut = now_monotonic_usec(); + + if(monotonic_ut) + *monotonic_ut = ut; + + buffer_sprintf(msg, + "" + "__REALTIME_TIMESTAMP=%llu\n" + "__MONOTONIC_TIMESTAMP=%llu\n" + "_MACHINE_ID=%s\n" + "_BOOT_ID=%s\n" + "_HOSTNAME=%s\n" + "_TRANSPORT=stdout\n" + "_LINE_BREAK=nul\n" + "_STREAM_ID=%s\n" + "_RUNTIME_SCOPE=system\n" + "%s%s\n" + , now_realtime_usec() + , ut + , global_machine_id + , global_boot_id + , global_hostname + , global_stream_id + , global_namespace + , global_systemd_invocation_id + ); +} + +static CURLcode journal_remote_send_buffer(CURL* curl, BUFFER *msg) { + + // log_message_to_stderr(msg); + + struct upload_data upload = {0}; + + if (!curl || !buffer_strlen(msg)) + return CURLE_FAILED_INIT; + + upload.data = (char *) buffer_tostring(msg); + upload.length = buffer_strlen(msg); + + curl_easy_setopt(curl, CURLOPT_READDATA, &upload); + curl_easy_setopt(curl, CURLOPT_INFILESIZE_LARGE, (curl_off_t)upload.length); + + return curl_easy_perform(curl); +} + +typedef enum { + LOG_TO_JOURNAL_REMOTE_BAD_PARAMS = -1, + LOG_TO_JOURNAL_REMOTE_CANNOT_INITIALIZE = -2, + LOG_TO_JOURNAL_REMOTE_CANNOT_SEND = -3, + LOG_TO_JOURNAL_REMOTE_CANNOT_READ = -4, +} log_to_journal_remote_ret_t; + +static log_to_journal_remote_ret_t log_input_to_journal_remote(const char *url, const char *key, const char *cert, const char *trust, const char *newline, int timeout_ms) { + if(!url || !*url) { + fprintf(stderr, "No URL is given.\n"); + return LOG_TO_JOURNAL_REMOTE_BAD_PARAMS; + } + + if(timeout_ms < 10) + timeout_ms = 10; + + global_boot_id[0] = '\0'; + char buffer[1024]; + if(read_file(BOOT_ID_PATH, buffer, sizeof(buffer)) == 0) { + uuid_t uuid; + if(uuid_parse_flexi(buffer, uuid) == 0) + uuid_unparse_lower_compact(uuid, global_boot_id); + else + fprintf(stderr, "WARNING: cannot parse the UUID found in '%s'.\n", BOOT_ID_PATH); + } + + if(global_boot_id[0] == '\0') { + fprintf(stderr, "WARNING: cannot read '%s'. 
Will generate a random _BOOT_ID.\n", BOOT_ID_PATH); + uuid_t uuid; + uuid_generate_random(uuid); + uuid_unparse_lower_compact(uuid, global_boot_id); + } + + if(read_file(MACHINE_ID_PATH, buffer, sizeof(buffer)) == 0) { + uuid_t uuid; + if(uuid_parse_flexi(buffer, uuid) == 0) + uuid_unparse_lower_compact(uuid, global_machine_id); + else + fprintf(stderr, "WARNING: cannot parse the UUID found in '%s'.\n", MACHINE_ID_PATH); + } + + if(global_machine_id[0] == '\0') { + fprintf(stderr, "WARNING: cannot read '%s'. Will generate a random _MACHINE_ID.\n", MACHINE_ID_PATH); + uuid_t uuid; + uuid_generate_random(uuid); + uuid_unparse_lower_compact(uuid, global_machine_id); + } + + if(global_stream_id[0] == '\0') { + uuid_t uuid; + uuid_generate_random(uuid); + uuid_unparse_lower_compact(uuid, global_stream_id); + } + + if(global_hostname[0] == '\0') { + if(gethostname(global_hostname, sizeof(global_hostname)) != 0) { + fprintf(stderr, "WARNING: cannot get system's hostname. Will use internal default.\n"); + snprintfz(global_hostname, sizeof(global_hostname), "systemd-cat-native-unknown-hostname"); + } + } + + if(global_systemd_invocation_id[0] == '\0' && getenv("INVOCATION_ID")) + snprintfz(global_systemd_invocation_id, sizeof(global_systemd_invocation_id), "_SYSTEMD_INVOCATION_ID=%s\n", getenv("INVOCATION_ID")); + + if(!key) + key = DEFAULT_PRIVATE_KEY; + + if(!cert) + cert = DEFAULT_PUBLIC_KEY; + + if(!trust) + trust = DEFAULT_CA_CERT; + + char full_url[4096]; + snprintfz(full_url, sizeof(full_url), "%s/upload", url); + + CURL *curl; + CURLcode res = CURLE_OK; + struct curl_slist *headers = NULL; + + curl_global_init(CURL_GLOBAL_ALL); + curl = initialize_connection_to_systemd_journal_remote(full_url, key, cert, trust, &headers); + + if(!curl) + return LOG_TO_JOURNAL_REMOTE_CANNOT_INITIALIZE; + + struct buffered_reader reader; + buffered_reader_init(&reader); + CLEAN_BUFFER *line = buffer_create(sizeof(reader.read_buffer), NULL); + CLEAN_BUFFER *msg = buffer_create(sizeof(reader.read_buffer), NULL); + + size_t msg_full_events = 0; + size_t msg_partial_fields = 0; + usec_t msg_started_ut = 0; + size_t failures = 0; + size_t messages_logged = 0; + + log_to_journal_remote_ret_t ret = 0; + + while(true) { + buffered_reader_ret_t rc = get_next_line(&reader, line, timeout_ms); + if(rc == BUFFERED_READER_READ_POLL_TIMEOUT) { + if(msg_full_events && !msg_partial_fields) { + res = journal_remote_send_buffer(curl, msg); + if(res != CURLE_OK) { + fprintf(stderr, "journal_remote_send_buffer() failed: %s\n", curl_easy_strerror(res)); + failures++; + ret = LOG_TO_JOURNAL_REMOTE_CANNOT_SEND; + goto cleanup; + } + else + messages_logged++; + + msg_full_events = 0; + buffer_flush(msg); + } + } + else if(rc == BUFFERED_READER_READ_OK) { + if(!line->len) { + // an empty line - we are done for this message + if(msg_partial_fields) { + msg_partial_fields = 0; + + usec_t ut; + journal_remote_complete_event(msg, &ut); + if(!msg_full_events) + msg_started_ut = ut; + + msg_full_events++; + + if(ut - msg_started_ut >= USEC_PER_SEC / 2) { + res = journal_remote_send_buffer(curl, msg); + if(res != CURLE_OK) { + fprintf(stderr, "journal_remote_send_buffer() failed: %s\n", curl_easy_strerror(res)); + failures++; + ret = LOG_TO_JOURNAL_REMOTE_CANNOT_SEND; + goto cleanup; + } + else + messages_logged++; + + msg_full_events = 0; + buffer_flush(msg); + } + } + } + else { + buffer_memcat_replacing_newlines(msg, line->buffer, line->len, newline); + msg_partial_fields++; + } + + buffer_flush(line); + } + else { + fprintf(stderr, "cannot 
read input data, failed with code %d\n", rc); + ret = LOG_TO_JOURNAL_REMOTE_CANNOT_READ; + break; + } + } + + if (msg_full_events || msg_partial_fields) { + if(msg_partial_fields) { + msg_partial_fields = 0; + msg_full_events++; + journal_remote_complete_event(msg, NULL); + } + + if(msg_full_events) { + res = journal_remote_send_buffer(curl, msg); + if(res != CURLE_OK) { + fprintf(stderr, "journal_remote_send_buffer() failed: %s\n", curl_easy_strerror(res)); + failures++; + } + else + messages_logged++; + + msg_full_events = 0; + buffer_flush(msg); + } + } + +cleanup: + curl_easy_cleanup(curl); + curl_slist_free_all(headers); + curl_global_cleanup(); + + return ret; +} + +#endif + +static int help(void) { + fprintf(stderr, + "\n" + "Netdata systemd-cat-native " NETDATA_VERSION "\n" + "\n" + "This program reads from its standard input, lines in the format:\n" + "\n" + "KEY1=VALUE1\\n\n" + "KEY2=VALUE2\\n\n" + "KEYN=VALUEN\\n\n" + "\\n\n" + "\n" + "and sends them to systemd-journal.\n" + "\n" + " - Binary journal fields are not accepted at its input\n" + " - Binary journal fields can be generated after newline processing\n" + " - Messages have to be separated by an empty line\n" + " - Keys starting with underscore are not accepted (by journald)\n" + " - Other rules imposed by systemd-journald are imposed (by journald)\n" + "\n" + "Usage:\n" + "\n" + " %s\n" + " [--newline=STRING]\n" + " [--log-as-netdata|-N]\n" + " [--namespace=NAMESPACE] [--socket=PATH]\n" +#ifdef HAVE_CURL + " [--url=URL [--key=FILENAME] [--cert=FILENAME] [--trust=FILENAME|all]]\n" +#endif + "\n" + "The program has the following modes of logging:\n" + "\n" + " * Log to a local systemd-journald or stderr\n" + "\n" + " This is the default mode. If systemd-journald is available, logs will be\n" + " sent to systemd, otherwise logs will be printed on stderr, using logfmt\n" + " formatting. Options --socket and --namespace are available to configure\n" + " the journal destination:\n" + "\n" + " --socket=PATH\n" + " The path of a systemd-journald UNIX socket.\n" + " The program will use the default systemd-journald socket when this\n" + " option is not used.\n" + "\n" + " --namespace=NAMESPACE\n" + " The name of a configured and running systemd-journald namespace.\n" + " The program will produce the socket path based on its internal\n" + " defaults, to send the messages to the systemd journal namespace.\n" + "\n" + " * Log as Netdata, enabled with --log-as-netdata or -N\n" + "\n" + " In this mode the program uses environment variables set by Netdata for\n" + " the log destination. 
Only log fields defined by Netdata are accepted.\n" + " If the environment variables expected by Netdata are not found, it\n" + " falls back to stderr logging in logfmt format.\n" +#ifdef HAVE_CURL + "\n" + " * Log to a systemd-journal-remote TCP socket, enabled with --url=URL\n" + "\n" + " In this mode, the program will directly send logs to a remote systemd\n" + " journal (systemd-journal-remote expected at the destination)\n" + " This mode is available even when the local system does not support\n" + " systemd, or even when it is not Linux, allowing a remote Linux systemd\n" + " journald to become the logs database of the local system.\n" + "\n" + " Unfortunately systemd-journal-remote does not accept compressed\n" + " data over the network, so the stream will be uncompressed.\n" + "\n" + " --url=URL\n" + " The destination systemd-journal-remote address and port, similarly\n" + " to what /etc/systemd/journal-upload.conf accepts.\n" + " Usually it is in the form: https://ip.address:19532\n" + " Both http and https URLs are accepted. When using https, the\n" + " following additional options are accepted:\n" + "\n" + " --key=FILENAME\n" + " The filename of the private key of the server.\n" + " The default is: " DEFAULT_PRIVATE_KEY "\n" + "\n" + " --cert=FILENAME\n" + " The filename of the public key of the server.\n" + " The default is: " DEFAULT_PUBLIC_KEY "\n" + "\n" + " --trust=FILENAME | all\n" + " The filename of the trusted CA public key.\n" + " The default is: " DEFAULT_CA_CERT "\n" + " The keyword 'all' can be used to trust all CAs.\n" + "\n" + " --namespace=NAMESPACE\n" + " Set the namespace of the messages sent.\n" + "\n" + " --keep-trying\n" + " Keep trying to send the message, if the remote journal is not there.\n" +#endif + "\n" + " NEWLINES PROCESSING\n" + " systemd-journal log entries may have newlines in them. However the\n" + " Journal Export Format uses binary formatted data to achieve this,\n" + " making it hard for text processing.\n" + "\n" + " To overcome this limitation, this program allows single-line text\n" + " formatted values at its input, to be binary formatted multi-line Journal\n" + " Export Format at its output.\n" + "\n" + " To achieve that, it allows replacing a given string with a newline.\n" + " The parameter --newline=STRING allows setting the string to be replaced\n" + " with newlines.\n" + "\n" + " For example by setting --newline='--NEWLINE--', the program will replace\n" + " all occurrences of --NEWLINE-- with the newline character, within each\n" + " VALUE of the KEY=VALUE lines. 
Once this is done, the program will\n" + " switch the field to the binary Journal Export Format before sending the\n" + " log event to systemd-journal.\n" + "\n", + program_name); + + return 1; +} + +// ---------------------------------------------------------------------------- +// log as Netdata + +static void lgs_reset(struct log_stack_entry *lgs) { + for(size_t i = 1; i < _NDF_MAX ;i++) { + if(lgs[i].type == NDFT_TXT && lgs[i].set && lgs[i].txt) + freez((void *)lgs[i].txt); + + lgs[i] = ND_LOG_FIELD_TXT(i, NULL); + } + + lgs[0] = ND_LOG_FIELD_TXT(NDF_MESSAGE, NULL); + lgs[_NDF_MAX] = ND_LOG_FIELD_END(); +} + +static const char *strdupz_replacing_newlines(const char *src, const char *newline) { + if(!src) src = ""; + + size_t src_len = strlen(src); + char *buffer = mallocz(src_len + 1); + copy_replacing_newlines(buffer, src_len + 1, src, src_len, newline); + return buffer; +} + +static int log_input_as_netdata(const char *newline, int timeout_ms) { + struct buffered_reader reader; + buffered_reader_init(&reader); + CLEAN_BUFFER *line = buffer_create(sizeof(reader.read_buffer), NULL); + + ND_LOG_STACK lgs[_NDF_MAX + 1] = { 0 }; + ND_LOG_STACK_PUSH(lgs); + lgs_reset(lgs); + + size_t fields_added = 0; + size_t messages_logged = 0; + ND_LOG_FIELD_PRIORITY priority = NDLP_INFO; + + while(get_next_line(&reader, line, timeout_ms) == BUFFERED_READER_READ_OK) { + if(!line->len) { + // an empty line - we are done for this message + + nd_log(NDLS_HEALTH, priority, + "added %zu fields", // if the user supplied a MESSAGE, this will be ignored + fields_added); + + lgs_reset(lgs); + fields_added = 0; + messages_logged++; + } + else { + char *equal = strchr(line->buffer, '='); + if(equal) { + const char *field = line->buffer; + size_t field_len = equal - line->buffer; + ND_LOG_FIELD_ID id = nd_log_field_id_by_name(field, field_len); + if(id != NDF_STOP) { + const char *value = ++equal; + + if(lgs[id].txt) + freez((void *) lgs[id].txt); + + lgs[id].txt = strdupz_replacing_newlines(value, newline); + lgs[id].set = true; + + fields_added++; + + if(id == NDF_PRIORITY) + priority = nd_log_priority2id(value); + } + else { + struct log_stack_entry backup = lgs[NDF_MESSAGE]; + lgs[NDF_MESSAGE] = ND_LOG_FIELD_TXT(NDF_MESSAGE, NULL); + + nd_log(NDLS_COLLECTORS, NDLP_ERR, + "Field '%.*s' is not a Netdata field. Ignoring it.", + (int)field_len, field); + + lgs[NDF_MESSAGE] = backup; + } + } + else { + struct log_stack_entry backup = lgs[NDF_MESSAGE]; + lgs[NDF_MESSAGE] = ND_LOG_FIELD_TXT(NDF_MESSAGE, NULL); + + nd_log(NDLS_COLLECTORS, NDLP_ERR, + "Line does not contain an = sign; ignoring it: %s", + line->buffer); + + lgs[NDF_MESSAGE] = backup; + } + } + + buffer_flush(line); + } + + if(fields_added) { + nd_log(NDLS_HEALTH, priority, "added %zu fields", fields_added); + messages_logged++; + } + + return messages_logged ? 
0 : 1; +} + +// ---------------------------------------------------------------------------- +// log to a local systemd-journald + +static bool journal_local_send_buffer(int fd, BUFFER *msg) { + // log_message_to_stderr(msg); + + bool ret = journal_direct_send(fd, msg->buffer, msg->len); + if (!ret) + fprintf(stderr, "Cannot send message to systemd journal.\n"); + + return ret; +} + +static int log_input_to_journal(const char *socket, const char *namespace, const char *newline, int timeout_ms) { + char path[FILENAME_MAX + 1]; + int fd = -1; + + if(socket) + snprintfz(path, sizeof(path), "%s", socket); + else + journal_construct_path(path, sizeof(path), NULL, namespace); + + fd = journal_direct_fd(path); + if (fd == -1) { + fprintf(stderr, "Cannot open '%s' as a UNIX socket (errno = %d)\n", + path, errno); + return 1; + } + + struct buffered_reader reader; + buffered_reader_init(&reader); + CLEAN_BUFFER *line = buffer_create(sizeof(reader.read_buffer), NULL); + CLEAN_BUFFER *msg = buffer_create(sizeof(reader.read_buffer), NULL); + + size_t messages_logged = 0; + size_t failed_messages = 0; + + while(get_next_line(&reader, line, timeout_ms) == BUFFERED_READER_READ_OK) { + if (!line->len) { + // an empty line - we are done for this message + if (msg->len) { + if(journal_local_send_buffer(fd, msg)) + messages_logged++; + else { + failed_messages++; + goto cleanup; + } + } + + buffer_flush(msg); + } + else + buffer_memcat_replacing_newlines(msg, line->buffer, line->len, newline); + + buffer_flush(line); + } + + if (msg && msg->len) { + if(journal_local_send_buffer(fd, msg)) + messages_logged++; + else + failed_messages++; + } + +cleanup: + return !failed_messages && messages_logged ? 0 : 1; +} + +int main(int argc, char *argv[]) { + clocks_init(); + nd_log_initialize_for_external_plugins(argv[0]); + + int timeout_ms = -1; // wait forever + bool log_as_netdata = false; + const char *newline = NULL; + const char *namespace = NULL; + const char *socket = getenv("NETDATA_SYSTEMD_JOURNAL_PATH"); +#ifdef HAVE_CURL + const char *url = NULL; + const char *key = NULL; + const char *cert = NULL; + const char *trust = NULL; + bool keep_trying = false; +#endif + + for(int i = 1; i < argc ;i++) { + const char *k = argv[i]; + + if(strcmp(k, "--help") == 0 || strcmp(k, "-h") == 0) + return help(); + + else if(strcmp(k, "--log-as-netdata") == 0 || strcmp(k, "-N") == 0) + log_as_netdata = true; + + else if(strncmp(k, "--namespace=", 12) == 0) + namespace = &k[12]; + + else if(strncmp(k, "--socket=", 9) == 0) + socket = &k[9]; + + else if(strncmp(k, "--newline=", 10) == 0) + newline = &k[10]; + +#ifdef HAVE_CURL + else if (strncmp(k, "--url=", 6) == 0) + url = &k[6]; + + else if (strncmp(k, "--key=", 6) == 0) + key = &k[6]; + + else if (strncmp(k, "--cert=", 7) == 0) + cert = &k[7]; + + else if (strncmp(k, "--trust=", 8) == 0) + trust = &k[8]; + + else if (strcmp(k, "--keep-trying") == 0) + keep_trying = true; +#endif + else { + fprintf(stderr, "Unknown parameter '%s'\n", k); + return 1; + } + } + +#ifdef HAVE_CURL + if(log_as_netdata && url) { + fprintf(stderr, "Cannot log to a systemd-journal-remote URL as Netdata. " + "Please either give --url or --log-as-netdata, not both.\n"); + return 1; + } + + if(socket && url) { + fprintf(stderr, "Cannot log to a systemd-journal-remote URL using a UNIX socket. " + "Please either give --url or --socket, not both.\n"); + return 1; + } + +#endif + + if(log_as_netdata && namespace) { + fprintf(stderr, "Cannot log as netdata using a namespace. 
" + "Please either give --log-as-netdata or --namespace, not both.\n"); + return 1; + } + + if(log_as_netdata) + return log_input_as_netdata(newline, timeout_ms); + +#ifdef HAVE_CURL + if(url) { + if(url && namespace && *namespace) + snprintfz(global_namespace, sizeof(global_namespace), "_NAMESPACE=%s\n", namespace); + + log_to_journal_remote_ret_t rc; + do { + rc = log_input_to_journal_remote(url, key, cert, trust, newline, timeout_ms); + } while(keep_trying && rc == LOG_TO_JOURNAL_REMOTE_CANNOT_SEND); + } +#endif + + return log_input_to_journal(socket, namespace, newline, timeout_ms); +} diff --git a/src/libnetdata/log/systemd-cat-native.h b/src/libnetdata/log/systemd-cat-native.h new file mode 100644 index 00000000..34e7a361 --- /dev/null +++ b/src/libnetdata/log/systemd-cat-native.h @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +#ifndef NETDATA_SYSTEMD_CAT_NATIVE_H +#define NETDATA_SYSTEMD_CAT_NATIVE_H + +#endif //NETDATA_SYSTEMD_CAT_NATIVE_H diff --git a/src/libnetdata/log/systemd-cat-native.md b/src/libnetdata/log/systemd-cat-native.md new file mode 100644 index 00000000..b0b15f40 --- /dev/null +++ b/src/libnetdata/log/systemd-cat-native.md @@ -0,0 +1,209 @@ +# systemd-cat-native + +`systemd` includes a utility called `systemd-cat`. This utility reads log lines from its standard input and sends them +to the local systemd journal. Its key limitation is that despite the fact that systemd journals support structured logs, +this command does not support sending structured logs to it. + +`systemd-cat-native` is a Netdata supplied utility to push structured logs to systemd journals. Key features: + +- reads [Journal Export Format](https://systemd.io/JOURNAL_EXPORT_FORMATS/) formatted log entries +- converts text fields into binary journal multiline log fields +- sends logs to any of these: + - local default `systemd-journald`, + - local namespace `systemd-journald`, + - remote `systemd-journal-remote` using HTTP or HTTPS, the same way `systemd-journal-upload` does. +- is the standard external logger of Netdata shell scripts + +## Simple use: + +```bash +printf "MESSAGE=hello world\nPRIORITY=6\n\n" | systemd-cat-native +``` + +The result: + +![image](https://github.com/netdata/netdata/assets/2662304/689d5e03-97ee-40a8-a690-82b7710cef7c) + + +Sending `PRIORITY=3` (error): + +```bash +printf "MESSAGE=hey, this is error\nPRIORITY=3\n\n" | systemd-cat-native +``` + +The result: +![image](https://github.com/netdata/netdata/assets/2662304/faf3eaa5-ac56-415b-9de8-16e6ceed9280) + +Sending multi-line log entries (in this example we replace the text `--NEWLINE--` with a newline in the log entry): + +```bash +printf "MESSAGE=hello--NEWLINE--world\nPRIORITY=6\n\n" | systemd-cat-native --newline='--NEWLINE--' +``` + +The result: + +![image](https://github.com/netdata/netdata/assets/2662304/d6037b4a-87da-4693-ae67-e07df0decdd9) + + +Processing the standard `\n` string can be tricky due to shell escaping. This works, but note that +we have to add a lot of backslashes to printf. 
+ +## Best practices + +These are the rules about fields, enforced by `systemd-journald`: + +- field names can be up to **64 characters**, +- field values can be up to **48k characters**, +- the only allowed field characters are **A-Z**, **0-9** and **underscore**, +- the **first** character of fields cannot be a **digit** +- **protected** journal fields start with underscore: + * they are accepted by `systemd-journal-remote`, + * they are **NOT** accepted by a local `systemd-journald`. + +For best results, always include these fields: + +- `MESSAGE=TEXT`<br/> + The `MESSAGE` is the body of the log entry. + This field is what we usually see in our logs. + +- `PRIORITY=NUMBER`<br/> + `PRIORITY` sets the severity of the log entry.<br/> + `0=emerg, 1=alert, 2=crit, 3=err, 4=warn, 5=notice, 6=info, 7=debug` + - Emergency events (0) are usually broadcast to all terminals. + - Emergency, alert, critical, and error (0-3) are usually colored red. + - Warning (4) entries are usually colored yellow. + - Notice (5) entries are usually bold or have a brighter white color. + - Info (6) entries are the default. + - Debug (7) entries are usually grayed or dimmed. + +- `SYSLOG_IDENTIFIER=NAME`<br/> + `SYSLOG_IDENTIFIER` sets the name of the application. + Use something descriptive, like: `SYSLOG_IDENTIFIER=myapp` + +You can find the most common fields in `man systemd.journal-fields`. + + +## Usage + +``` +Netdata systemd-cat-native v1.43.0-333-g5af71b875 + +This program reads from its standard input, lines in the format: + +KEY1=VALUE1\n +KEY2=VALUE2\n +KEYN=VALUEN\n +\n + +and sends them to systemd-journal. + + - Binary journal fields are not accepted at its input + - Binary journal fields can be generated after newline processing + - Messages have to be separated by an empty line + - Keys starting with underscore are not accepted (by journald) + - Other rules imposed by systemd-journald are imposed (by journald) + +Usage: + + systemd-cat-native + [--newline=STRING] + [--log-as-netdata|-N] + [--namespace=NAMESPACE] [--socket=PATH] + [--url=URL [--key=FILENAME] [--cert=FILENAME] [--trust=FILENAME|all]] + +The program has the following modes of logging: + + * Log to a local systemd-journald or stderr + + This is the default mode. If systemd-journald is available, logs will be + sent to systemd, otherwise logs will be printed on stderr, using logfmt + formatting. Options --socket and --namespace are available to configure + the journal destination: + + --socket=PATH + The path of a systemd-journald UNIX socket. + The program will use the default systemd-journald socket when this + option is not used. + + --namespace=NAMESPACE + The name of a configured and running systemd-journald namespace. + The program will produce the socket path based on its internal + defaults, to send the messages to the systemd journal namespace. + + * Log as Netdata, enabled with --log-as-netdata or -N + + In this mode the program uses environment variables set by Netdata for + the log destination. Only log fields defined by Netdata are accepted. + If the environment variables expected by Netdata are not found, it + falls back to stderr logging in logfmt format. 
+ + * Log to a systemd-journal-remote TCP socket, enabled with --url=URL + + In this mode, the program will directly send logs to a remote systemd + journal (systemd-journal-remote expected at the destination) + This mode is available even when the local system does not support + systemd, or even when it is not Linux, allowing a remote Linux systemd + journald to become the logs database of the local system. + + Unfortunately systemd-journal-remote does not accept compressed + data over the network, so the stream will be uncompressed. + + --url=URL + The destination systemd-journal-remote address and port, similarly + to what /etc/systemd/journal-upload.conf accepts. + Usually it is in the form: https://ip.address:19532 + Both http and https URLs are accepted. When using https, the + following additional options are accepted: + + --key=FILENAME + The filename of the private key of the server. + The default is: /etc/ssl/private/journal-upload.pem + + --cert=FILENAME + The filename of the public key of the server. + The default is: /etc/ssl/certs/journal-upload.pem + + --trust=FILENAME | all + The filename of the trusted CA public key. + The default is: /etc/ssl/ca/trusted.pem + The keyword 'all' can be used to trust all CAs. + + --namespace=NAMESPACE + Set the namespace of the messages sent. + + --keep-trying + Keep trying to send the message, if the remote journal is not there. + + NEWLINES PROCESSING + systemd-journal log entries may have newlines in them. However the + Journal Export Format uses binary formatted data to achieve this, + making it hard for text processing. + + To overcome this limitation, this program allows single-line text + formatted values at its input, to be binary formatted multi-line Journal + Export Format at its output. + + To achieve that, it allows replacing a given string with a newline. + The parameter --newline=STRING allows setting the string to be replaced + with newlines. + + For example by setting --newline='--NEWLINE--', the program will replace + all occurrences of --NEWLINE-- with the newline character, within each + VALUE of the KEY=VALUE lines. Once this is done, the program will + switch the field to the binary Journal Export Format before sending the + log event to systemd-journal. + +```
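 + +## Example: logging to a remote journal + +A minimal sketch of the remote mode described above. The address `https://192.0.2.1:19532` is a placeholder for a real `systemd-journal-remote` endpoint, and `--trust=all` disables CA verification, so prefer real certificates outside of testing: + +```bash +printf "MESSAGE=hello from a remote host\nPRIORITY=6\nSYSLOG_IDENTIFIER=mytest\n\n" | systemd-cat-native --url=https://192.0.2.1:19532 --trust=all --keep-trying +``` + +With `--keep-trying`, the program keeps retrying the submission while the remote journal is unreachable, which helps when the logger starts before the journal endpoint does.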
\ No newline at end of file diff --git a/src/libnetdata/maps/local-sockets.h b/src/libnetdata/maps/local-sockets.h new file mode 100644 index 00000000..d407e6be --- /dev/null +++ b/src/libnetdata/maps/local-sockets.h @@ -0,0 +1,1283 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_LOCAL_SOCKETS_H +#define NETDATA_LOCAL_SOCKETS_H + +#include "libnetdata/libnetdata.h" + +// disable libmnl for the moment +#undef HAVE_LIBMNL + +#ifdef HAVE_LIBMNL +#include <linux/inet_diag.h> +#include <linux/sock_diag.h> +#include <linux/unix_diag.h> +#include <linux/netlink.h> +#include <libmnl/libmnl.h> +#endif + +#define UID_UNSET (uid_t)(UINT32_MAX) + +// -------------------------------------------------------------------------------------------------------------------- +// hashtable for keeping the namespaces +// key and value is the namespace inode + +#define SIMPLE_HASHTABLE_VALUE_TYPE uint64_t +#define SIMPLE_HASHTABLE_NAME _NET_NS +#include "libnetdata/simple_hashtable.h" + +// -------------------------------------------------------------------------------------------------------------------- +// hashtable for keeping the sockets of PIDs +// key is the inode + +struct pid_socket; +#define SIMPLE_HASHTABLE_VALUE_TYPE struct pid_socket +#define SIMPLE_HASHTABLE_NAME _PID_SOCKET +#include "libnetdata/simple_hashtable.h" + +// -------------------------------------------------------------------------------------------------------------------- +// hashtable for keeping all the sockets +// key is the inode + +struct local_socket; +#define SIMPLE_HASHTABLE_VALUE_TYPE struct local_socket +#define SIMPLE_HASHTABLE_NAME _LOCAL_SOCKET +#include "libnetdata/simple_hashtable.h" + +// -------------------------------------------------------------------------------------------------------------------- +// hashtable for keeping all local IPs +// key is XXH3_64bits hash of the IP + +union ipv46; +#define SIMPLE_HASHTABLE_VALUE_TYPE union ipv46 +#define SIMPLE_HASHTABLE_NAME _LOCAL_IP +#include "libnetdata/simple_hashtable.h" + +// -------------------------------------------------------------------------------------------------------------------- +// hashtable for keeping all listening ports +// key is XXH3_64bits hash of the family, protocol, port number, namespace + +struct local_port; +#define SIMPLE_HASHTABLE_VALUE_TYPE struct local_port +#define SIMPLE_HASHTABLE_NAME _LISTENING_PORT +#include "libnetdata/simple_hashtable.h" + +// -------------------------------------------------------------------------------------------------------------------- + +struct local_socket_state; +typedef void (*local_sockets_cb_t)(struct local_socket_state *state, struct local_socket *n, void *data); + +typedef struct local_socket_state { + struct { + bool listening; + bool inbound; + bool outbound; + bool local; + bool tcp4; + bool tcp6; + bool udp4; + bool udp6; + bool pid; + bool cmdline; + bool comm; + bool uid; + bool namespaces; + size_t max_errors; + + local_sockets_cb_t cb; + void *data; + + const char *host_prefix; + } config; + + struct { + size_t pid_fds_processed; + size_t pid_fds_opendir_failed; + size_t pid_fds_readlink_failed; + size_t pid_fds_parse_failed; + size_t errors_encountered; + } stats; + +#ifdef HAVE_LIBMNL + bool use_nl; + struct mnl_socket *nl; + uint16_t tmp_protocol; +#endif + + ARAL *local_socket_aral; + ARAL *pid_socket_aral; + + uint64_t proc_self_net_ns_inode; + + SIMPLE_HASHTABLE_NET_NS ns_hashtable; + SIMPLE_HASHTABLE_PID_SOCKET pid_sockets_hashtable; + SIMPLE_HASHTABLE_LOCAL_SOCKET 
sockets_hashtable; + SIMPLE_HASHTABLE_LOCAL_IP local_ips_hashtable; + SIMPLE_HASHTABLE_LISTENING_PORT listening_ports_hashtable; +} LS_STATE; + +// -------------------------------------------------------------------------------------------------------------------- + +typedef enum __attribute__((packed)) { + SOCKET_DIRECTION_NONE = 0, + SOCKET_DIRECTION_LISTEN = (1 << 0), // a listening socket + SOCKET_DIRECTION_INBOUND = (1 << 1), // an inbound socket connecting a remote system to a local listening socket + SOCKET_DIRECTION_OUTBOUND = (1 << 2), // a socket initiated by this system, connecting to another system + SOCKET_DIRECTION_LOCAL_INBOUND = (1 << 3), // the socket connecting 2 localhost applications + SOCKET_DIRECTION_LOCAL_OUTBOUND = (1 << 4), // the socket connecting 2 localhost applications +} SOCKET_DIRECTION; + +#ifndef TASK_COMM_LEN +#define TASK_COMM_LEN 16 +#endif + +struct pid_socket { + uint64_t inode; + pid_t pid; + uid_t uid; + uint64_t net_ns_inode; + char *cmdline; + char comm[TASK_COMM_LEN]; +}; + +struct local_port { + uint16_t protocol; + uint16_t family; + uint16_t port; + uint64_t net_ns_inode; +}; + +union ipv46 { + uint32_t ipv4; + struct in6_addr ipv6; +}; + +struct socket_endpoint { + uint16_t protocol; + uint16_t family; + uint16_t port; + union ipv46 ip; +}; + +static inline void ipv6_to_in6_addr(const char *ipv6_str, struct in6_addr *d) { + char buf[9]; + + for (size_t k = 0; k < 4; ++k) { + memcpy(buf, ipv6_str + (k * 8), 8); + buf[sizeof(buf) - 1] = '\0'; + d->s6_addr32[k] = str2uint32_hex(buf, NULL); + } +} + +typedef struct local_socket { + uint64_t inode; + uint64_t net_ns_inode; + + int state; + struct socket_endpoint local; + struct socket_endpoint remote; + pid_t pid; + + SOCKET_DIRECTION direction; + + uint8_t timer; + uint8_t retransmits; + uint32_t expires; + uint32_t rqueue; + uint32_t wqueue; + uid_t uid; + + char comm[TASK_COMM_LEN]; + STRING *cmdline; + + struct local_port local_port_key; + + XXH64_hash_t local_ip_hash; + XXH64_hash_t remote_ip_hash; + XXH64_hash_t local_port_hash; + +#ifdef LOCAL_SOCKETS_EXTENDED_MEMBERS + LOCAL_SOCKETS_EXTENDED_MEMBERS +#endif +} LOCAL_SOCKET; + +// -------------------------------------------------------------------------------------------------------------------- + +static inline void local_sockets_log(LS_STATE *ls, const char *format, ...) PRINTFLIKE(2, 3); +static inline void local_sockets_log(LS_STATE *ls, const char *format, ...) { + if(++ls->stats.errors_encountered == ls->config.max_errors) { + nd_log(NDLS_COLLECTORS, NDLP_ERR, "LOCAL-SOCKETS: max number of logs reached. 
Not logging anymore"); + return; + } + + if(ls->stats.errors_encountered > ls->config.max_errors) + return; + + char buf[16384]; + va_list args; + va_start(args, format); + vsnprintf(buf, sizeof(buf), format, args); + va_end(args); + + nd_log(NDLS_COLLECTORS, NDLP_ERR, "LOCAL-SOCKETS: %s", buf); +} + +// -------------------------------------------------------------------------------------------------------------------- + +static void local_sockets_foreach_local_socket_call_cb(LS_STATE *ls) { + for(SIMPLE_HASHTABLE_SLOT_LOCAL_SOCKET *sl = simple_hashtable_first_read_only_LOCAL_SOCKET(&ls->sockets_hashtable); + sl; + sl = simple_hashtable_next_read_only_LOCAL_SOCKET(&ls->sockets_hashtable, sl)) { + LOCAL_SOCKET *n = SIMPLE_HASHTABLE_SLOT_DATA(sl); + if(!n) continue; + + if((ls->config.listening && n->direction & SOCKET_DIRECTION_LISTEN) || + (ls->config.local && n->direction & (SOCKET_DIRECTION_LOCAL_INBOUND|SOCKET_DIRECTION_LOCAL_OUTBOUND)) || + (ls->config.inbound && n->direction & SOCKET_DIRECTION_INBOUND) || + (ls->config.outbound && n->direction & SOCKET_DIRECTION_OUTBOUND) + ) { + // we have to call the callback for this socket + if (ls->config.cb) + ls->config.cb(ls, n, ls->config.data); + } + } +} + +// -------------------------------------------------------------------------------------------------------------------- + +static inline void local_sockets_fix_cmdline(char* str) { + char *s = str; + + // map invalid characters to underscores + while(*s) { + if(*s == '|' || iscntrl(*s)) *s = '_'; + s++; + } +} + +// -------------------------------------------------------------------------------------------------------------------- + +static inline bool +local_sockets_read_proc_inode_link(LS_STATE *ls, const char *filename, uint64_t *inode, const char *type) { + char link_target[FILENAME_MAX + 1]; + + *inode = 0; + + ssize_t len = readlink(filename, link_target, sizeof(link_target) - 1); + if (len == -1) { + local_sockets_log(ls, "cannot read '%s' link '%s'", type, filename); + + ls->stats.pid_fds_readlink_failed++; + return false; + } + link_target[len] = '\0'; + + len = strlen(type); + if(strncmp(link_target, type, len) == 0 && link_target[len] == ':' && link_target[len + 1] == '[' && isdigit(link_target[len + 2])) { + *inode = strtoull(&link_target[len + 2], NULL, 10); + // ll_log(ls, "read link of type '%s' '%s' from '%s', inode = %"PRIu64, type, link_target, filename, *inode); + return true; + } + else { + // ll_log(ls, "cannot read '%s' link '%s' from '%s'", type, link_target, filename); + ls->stats.pid_fds_processed++; + return false; + } +} + +static inline bool local_sockets_is_path_a_pid(const char *s) { + if(!s || !*s) return false; + + while(*s) { + if(!isdigit(*s++)) + return false; + } + + return true; +} + +static inline bool local_sockets_find_all_sockets_in_proc(LS_STATE *ls, const char *proc_filename) { + DIR *proc_dir; + struct dirent *proc_entry; + char filename[FILENAME_MAX + 1]; + char comm[TASK_COMM_LEN]; + char cmdline[8192]; + const char *cmdline_trimmed; + uint64_t net_ns_inode; + + proc_dir = opendir(proc_filename); + if (proc_dir == NULL) { + local_sockets_log(ls, "cannot opendir() '%s'", proc_filename); + ls->stats.pid_fds_readlink_failed++; + return false; + } + + while ((proc_entry = readdir(proc_dir)) != NULL) { + if(proc_entry->d_type != DT_DIR) + continue; + + if(!strcmp(proc_entry->d_name, ".") || !strcmp(proc_entry->d_name, "..")) + continue; + + if(!local_sockets_is_path_a_pid(proc_entry->d_name)) + continue; + + // Build the path to the fd directory 
of the process + snprintfz(filename, FILENAME_MAX, "%s/%s/fd/", proc_filename, proc_entry->d_name); + DIR *fd_dir = opendir(filename); + if (fd_dir == NULL) { + local_sockets_log(ls, "cannot opendir() '%s'", filename); + ls->stats.pid_fds_opendir_failed++; + continue; + } + + comm[0] = '\0'; + cmdline[0] = '\0'; + cmdline_trimmed = NULL; + pid_t pid = (pid_t)strtoul(proc_entry->d_name, NULL, 10); + if(!pid) { + local_sockets_log(ls, "cannot parse pid of '%s'", proc_entry->d_name); + closedir(fd_dir); + continue; + } + net_ns_inode = 0; + uid_t uid = UID_UNSET; + + struct dirent *fd_entry; + while ((fd_entry = readdir(fd_dir)) != NULL) { + if(fd_entry->d_type != DT_LNK) + continue; + + snprintfz(filename, sizeof(filename), "%s/%s/fd/%s", proc_filename, proc_entry->d_name, fd_entry->d_name); + uint64_t inode = 0; + if(!local_sockets_read_proc_inode_link(ls, filename, &inode, "socket")) + continue; + + SIMPLE_HASHTABLE_SLOT_PID_SOCKET *sl = simple_hashtable_get_slot_PID_SOCKET(&ls->pid_sockets_hashtable, inode, &inode, true); + struct pid_socket *ps = SIMPLE_HASHTABLE_SLOT_DATA(sl); + if(!ps || (ps->pid == 1 && pid != 1)) { + if(uid == UID_UNSET && ls->config.uid) { + char status_buf[512]; + snprintfz(filename, sizeof(filename), "%s/%s/status", proc_filename, proc_entry->d_name); + if (read_txt_file(filename, status_buf, sizeof(status_buf))) + local_sockets_log(ls, "cannot open file: %s\n", filename); + else { + char *u = strstr(status_buf, "Uid:"); + if(u) { + u += 4; + while(isspace(*u)) u++; // skip spaces + while(*u >= '0' && *u <= '9') u++; // skip the first number (real uid) + while(isspace(*u)) u++; // skip spaces again + uid = strtol(u, NULL, 10); // parse the 2nd number (effective uid) + } + } + } + if(!comm[0] && ls->config.comm) { + snprintfz(filename, sizeof(filename), "%s/%s/comm", proc_filename, proc_entry->d_name); + if (read_txt_file(filename, comm, sizeof(comm))) + local_sockets_log(ls, "cannot open file: %s\n", filename); + else { + size_t clen = strlen(comm); + if(comm[clen - 1] == '\n') + comm[clen - 1] = '\0'; + } + } + if(!cmdline[0] && ls->config.cmdline) { + snprintfz(filename, sizeof(filename), "%s/%s/cmdline", proc_filename, proc_entry->d_name); + if (read_proc_cmdline(filename, cmdline, sizeof(cmdline))) + local_sockets_log(ls, "cannot open file: %s\n", filename); + else { + local_sockets_fix_cmdline(cmdline); + cmdline_trimmed = trim(cmdline); + } + } + if(!net_ns_inode && ls->config.namespaces) { + snprintfz(filename, sizeof(filename), "%s/%s/ns/net", proc_filename, proc_entry->d_name); + if(local_sockets_read_proc_inode_link(ls, filename, &net_ns_inode, "net")) { + SIMPLE_HASHTABLE_SLOT_NET_NS *sl_ns = simple_hashtable_get_slot_NET_NS(&ls->ns_hashtable, net_ns_inode, (uint64_t *)net_ns_inode, true); + simple_hashtable_set_slot_NET_NS(&ls->ns_hashtable, sl_ns, net_ns_inode, (uint64_t *)net_ns_inode); + } + } + + if(!ps) + ps = aral_callocz(ls->pid_socket_aral); + + ps->inode = inode; + ps->pid = pid; + ps->uid = uid; + ps->net_ns_inode = net_ns_inode; + strncpyz(ps->comm, comm, sizeof(ps->comm) - 1); + + if(ps->cmdline) + freez(ps->cmdline); + + ps->cmdline = cmdline_trimmed ? 
strdupz(cmdline_trimmed) : NULL; + simple_hashtable_set_slot_PID_SOCKET(&ls->pid_sockets_hashtable, sl, inode, ps); + } + } + + closedir(fd_dir); + } + + closedir(proc_dir); + return true; +} + +// -------------------------------------------------------------------------------------------------------------------- + +static bool local_sockets_is_ipv4_mapped_ipv6_address(const struct in6_addr *addr) { + // An IPv4-mapped IPv6 address starts with 80 bits of zeros followed by 16 bits of ones + static const unsigned char ipv4_mapped_prefix[12] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF }; + return memcmp(addr->s6_addr, ipv4_mapped_prefix, 12) == 0; +} + +static bool local_sockets_is_loopback_address(struct socket_endpoint *se) { + if (se->family == AF_INET) { + // For IPv4, loopback addresses are in the 127.0.0.0/8 range + return (ntohl(se->ip.ipv4) >> 24) == 127; // Check if the first byte is 127 + } else if (se->family == AF_INET6) { + // Check if the address is an IPv4-mapped IPv6 address + if (local_sockets_is_ipv4_mapped_ipv6_address(&se->ip.ipv6)) { + // Extract the last 32 bits (IPv4 address) and check if it's in the 127.0.0.0/8 range + uint8_t *ip6 = (uint8_t *)&se->ip.ipv6; + const uint32_t ipv4_addr = *((const uint32_t *)(ip6 + 12)); + return (ntohl(ipv4_addr) >> 24) == 127; + } + + // For IPv6, loopback address is ::1 + return memcmp(&se->ip.ipv6, &in6addr_loopback, sizeof(se->ip.ipv6)) == 0; + } + + return false; +} + +static inline bool local_sockets_is_ipv4_reserved_address(uint32_t ip) { + // Check for the reserved address ranges + ip = ntohl(ip); + return ( + (ip >> 24 == 10) || // Private IP range (A class) + (ip >> 20 == (172 << 4) + 1) || // Private IP range (B class) + (ip >> 16 == (192 << 8) + 168) || // Private IP range (C class) + (ip >> 24 == 127) || // Loopback address (127.0.0.0) + (ip >> 24 == 0) || // Reserved (0.0.0.0) + (ip >> 16 == (169 << 8) + 254) || // Link-local address (169.254.0.0/16) + (ip >> 16 == (192 << 8) + 0) // Test-Net (192.0.0.0) + ); +} + +static inline bool local_sockets_is_private_address(struct socket_endpoint *se) { + if (se->family == AF_INET) { + return local_sockets_is_ipv4_reserved_address(se->ip.ipv4); + } + else if (se->family == AF_INET6) { + uint8_t *ip6 = (uint8_t *)&se->ip.ipv6; + + // Check if the address is an IPv4-mapped IPv6 address + if (local_sockets_is_ipv4_mapped_ipv6_address(&se->ip.ipv6)) { + // Extract the last 32 bits (IPv4 address) and check it against the reserved IPv4 ranges + const uint32_t ipv4_addr = *((const uint32_t *)(ip6 + 12)); + return local_sockets_is_ipv4_reserved_address(ipv4_addr); + } + + // Check for link-local addresses (fe80::/10) + if ((ip6[0] == 0xFE) && ((ip6[1] & 0xC0) == 0x80)) + return true; + + // Check for Unique Local Addresses (ULA) (fc00::/7) + if ((ip6[0] & 0xFE) == 0xFC) + return true; + + // Check for multicast addresses (ff00::/8) + if (ip6[0] == 0xFF) + return true; + + // For IPv6, match the unspecified (::) and loopback (::1) addresses + return memcmp(&se->ip.ipv6, &in6addr_any, sizeof(se->ip.ipv6)) == 0 || + memcmp(&se->ip.ipv6, &in6addr_loopback, sizeof(se->ip.ipv6)) == 0; + } + + return false; +} + +static bool local_sockets_is_multicast_address(struct socket_endpoint *se) { + if (se->family == AF_INET) { + // For IPv4, check if the address is in the multicast range 224.0.0.0/4 + uint32_t ip = htonl(se->ip.ipv4); + return (ip >= 0xE0000000 && ip <= 0xEFFFFFFF); // Multicast address range (224.0.0.0/4) + } + else if (se->family == AF_INET6) { + // For IPv6, check if the address is ff00::/8 + uint8_t *ip6 = (uint8_t *)&se->ip.ipv6; 
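+ // RFC 4291, section 2.7: the whole ff00::/8 block is multicast, so only the first byte needs to be checked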
+ return ip6[0] == 0xff; + } + + return false; +} + +static bool local_sockets_is_zero_address(struct socket_endpoint *se) { + if (se->family == AF_INET) { + // For IPv4, check if the address is 0.0.0.0 + return se->ip.ipv4 == 0; + } + else if (se->family == AF_INET6) { + // For IPv6, check if the address is :: + return memcmp(&se->ip.ipv6, &in6addr_any, sizeof(se->ip.ipv6)) == 0; + } + + return false; +} + +static inline const char *local_sockets_address_space(struct socket_endpoint *se) { + if(local_sockets_is_zero_address(se)) + return "zero"; + else if(local_sockets_is_loopback_address(se)) + return "loopback"; + else if(local_sockets_is_multicast_address(se)) + return "multicast"; + else if(local_sockets_is_private_address(se)) + return "private"; + else + return "public"; +} + +// -------------------------------------------------------------------------------------------------------------------- + +static inline void local_sockets_index_listening_port(LS_STATE *ls, LOCAL_SOCKET *n) { + if(n->direction & SOCKET_DIRECTION_LISTEN) { + // for the listening sockets, keep a hashtable with all the local ports + // so that we will be able to detect INBOUND sockets + + SIMPLE_HASHTABLE_SLOT_LISTENING_PORT *sl_port = + simple_hashtable_get_slot_LISTENING_PORT(&ls->listening_ports_hashtable, n->local_port_hash, &n->local_port_key, true); + + struct local_port *port = SIMPLE_HASHTABLE_SLOT_DATA(sl_port); + if(!port) + simple_hashtable_set_slot_LISTENING_PORT(&ls->listening_ports_hashtable, sl_port, n->local_port_hash, &n->local_port_key); + } +} + +static inline bool local_sockets_add_socket(LS_STATE *ls, LOCAL_SOCKET *tmp) { + if(!tmp->inode) return false; + + SIMPLE_HASHTABLE_SLOT_LOCAL_SOCKET *sl = simple_hashtable_get_slot_LOCAL_SOCKET(&ls->sockets_hashtable, tmp->inode, &tmp->inode, true); + LOCAL_SOCKET *n = SIMPLE_HASHTABLE_SLOT_DATA(sl); + if(n) { + local_sockets_log(ls, "inode %" PRIu64" already exists in hashtable - ignoring duplicate", tmp->inode); + return false; + } + + n = aral_mallocz(ls->local_socket_aral); + *n = *tmp; // copy all contents + + // fix the key + n->local_port_key.port = n->local.port; + n->local_port_key.family = n->local.family; + n->local_port_key.protocol = n->local.protocol; + n->local_port_key.net_ns_inode = ls->proc_self_net_ns_inode; + + n->local_ip_hash = XXH3_64bits(&n->local.ip, sizeof(n->local.ip)); + n->remote_ip_hash = XXH3_64bits(&n->remote.ip, sizeof(n->remote.ip)); + n->local_port_hash = XXH3_64bits(&n->local_port_key, sizeof(n->local_port_key)); + + // --- look up a pid for it ----------------------------------------------------------------------------------- + + SIMPLE_HASHTABLE_SLOT_PID_SOCKET *sl_pid = simple_hashtable_get_slot_PID_SOCKET(&ls->pid_sockets_hashtable, n->inode, &n->inode, false); + struct pid_socket *ps = SIMPLE_HASHTABLE_SLOT_DATA(sl_pid); + if(ps) { + n->net_ns_inode = ps->net_ns_inode; + n->pid = ps->pid; + + if(ps->uid != UID_UNSET && n->uid == UID_UNSET) + n->uid = ps->uid; + + if(ps->cmdline) + n->cmdline = string_strdupz(ps->cmdline); + + strncpyz(n->comm, ps->comm, sizeof(n->comm) - 1); + } + + // --- index it ----------------------------------------------------------------------------------------------- + + simple_hashtable_set_slot_LOCAL_SOCKET(&ls->sockets_hashtable, sl, n->inode, n); + + if(!local_sockets_is_zero_address(&n->local)) { + // put all the local IPs into the local_ips hashtable + // so, we learn all local IPs the system has + + SIMPLE_HASHTABLE_SLOT_LOCAL_IP *sl_ip = + 
simple_hashtable_get_slot_LOCAL_IP(&ls->local_ips_hashtable, n->local_ip_hash, &n->local.ip, true); + + union ipv46 *ip = SIMPLE_HASHTABLE_SLOT_DATA(sl_ip); + if(!ip) + simple_hashtable_set_slot_LOCAL_IP(&ls->local_ips_hashtable, sl_ip, n->local_ip_hash, &n->local.ip); + } + + // --- 1st phase for direction detection ---------------------------------------------------------------------- + + if((n->local.protocol == IPPROTO_TCP && n->state == TCP_LISTEN) || + local_sockets_is_zero_address(&n->local) || + local_sockets_is_zero_address(&n->remote)) { + // the socket is either in TCP LISTEN state, or + // its local or remote address is zero + n->direction |= SOCKET_DIRECTION_LISTEN; + } + else { + // we can't say yet if it is inbound or outbound + // so, mark it as both inbound and outbound + n->direction |= SOCKET_DIRECTION_INBOUND | SOCKET_DIRECTION_OUTBOUND; + } + + // --- index it in LISTENING_PORT ----------------------------------------------------------------------------- + + local_sockets_index_listening_port(ls, n); + + return true; +} + +#ifdef HAVE_LIBMNL + +static inline void local_sockets_netlink_init(LS_STATE *ls) { + ls->use_nl = true; + ls->nl = mnl_socket_open(NETLINK_INET_DIAG); + if (!ls->nl) { + local_sockets_log(ls, "cannot open netlink socket"); + ls->use_nl = false; + return; // do not attempt to bind a NULL socket + } + + if (mnl_socket_bind(ls->nl, 0, MNL_SOCKET_AUTOPID) < 0) { + local_sockets_log(ls, "cannot bind netlink socket"); + ls->use_nl = false; + } +} + +static inline void local_sockets_netlink_cleanup(LS_STATE *ls) { + if(ls->nl) { + mnl_socket_close(ls->nl); + ls->nl = NULL; + } +} + +static inline int local_sockets_netlink_cb_data(const struct nlmsghdr *nlh, void *data) { + LS_STATE *ls = data; + + struct inet_diag_msg *diag_msg = mnl_nlmsg_get_payload(nlh); + + LOCAL_SOCKET n = { + .inode = diag_msg->idiag_inode, + .direction = SOCKET_DIRECTION_NONE, + .state = diag_msg->idiag_state, + .local = { + .protocol = ls->tmp_protocol, + .family = diag_msg->idiag_family, + .port = diag_msg->id.idiag_sport, + }, + .remote = { + .protocol = ls->tmp_protocol, + .family = diag_msg->idiag_family, + .port = diag_msg->id.idiag_dport, + }, + .timer = diag_msg->idiag_timer, + .retransmits = diag_msg->idiag_retrans, + .expires = diag_msg->idiag_expires, + .rqueue = diag_msg->idiag_rqueue, + .wqueue = diag_msg->idiag_wqueue, + .uid = diag_msg->idiag_uid, + }; + + if (diag_msg->idiag_family == AF_INET) { + memcpy(&n.local.ip.ipv4, diag_msg->id.idiag_src, sizeof(n.local.ip.ipv4)); + memcpy(&n.remote.ip.ipv4, diag_msg->id.idiag_dst, sizeof(n.remote.ip.ipv4)); + } + else if (diag_msg->idiag_family == AF_INET6) { + memcpy(&n.local.ip.ipv6, diag_msg->id.idiag_src, sizeof(n.local.ip.ipv6)); + memcpy(&n.remote.ip.ipv6, diag_msg->id.idiag_dst, sizeof(n.remote.ip.ipv6)); + } + + local_sockets_add_socket(ls, &n); + + return MNL_CB_OK; +} + +static inline bool local_sockets_netlink_get_sockets(LS_STATE *ls, uint16_t family, uint16_t protocol) { + ls->tmp_protocol = protocol; + + char buf[MNL_SOCKET_BUFFER_SIZE]; + struct nlmsghdr *nlh; + struct inet_diag_req_v2 req; + unsigned int seq, portid = mnl_socket_get_portid(ls->nl); + + memset(&req, 0, sizeof(req)); + req.sdiag_family = family; + req.sdiag_protocol = protocol; + req.idiag_states = -1; + + nlh = mnl_nlmsg_put_header(buf); + nlh->nlmsg_type = SOCK_DIAG_BY_FAMILY; + nlh->nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST; + nlh->nlmsg_seq = seq = time(NULL); + mnl_nlmsg_put_extra_header(nlh, sizeof(req)); + memcpy(mnl_nlmsg_get_payload(nlh), &req, sizeof(req)); + + if (mnl_socket_sendto(ls->nl, nlh, 
nlh->nlmsg_len) < 0) { + local_sockets_log(ls, "mnl_socket_sendto failed"); + return false; + } + + ssize_t ret; + while ((ret = mnl_socket_recvfrom(ls->nl, buf, sizeof(buf))) > 0) { + ret = mnl_cb_run(buf, ret, seq, portid, local_sockets_netlink_cb_data, ls); + if (ret <= MNL_CB_STOP) + break; + } + if (ret == -1) { + local_sockets_log(ls, "mnl_socket_recvfrom failed"); + return false; + } + + return true; +} +#endif // HAVE_LIBMNL + +static inline bool local_sockets_read_proc_net_x(LS_STATE *ls, const char *filename, uint16_t family, uint16_t protocol) { + static bool is_space[256] = { + [':'] = true, + [' '] = true, + }; + + if(family != AF_INET && family != AF_INET6) + return false; + + FILE *fp = fopen(filename, "r"); + if (fp == NULL) + return false; + + char *line = malloc(1024); // no mallocz() here because getline() may resize + if(!line) { + fclose(fp); + return false; + } + + size_t len = 1024; + ssize_t read; + + ssize_t min_line_length = (family == AF_INET) ? 105 : 155; + size_t counter = 0; + + // Read line by line + while ((read = getline(&line, &len, fp)) != -1) { + if(counter++ == 0) continue; // skip the first line + + if(read < min_line_length) { + local_sockets_log(ls, "line No %zu of filename '%s' is too small: %s", counter, filename, line); + continue; + } + + LOCAL_SOCKET n = { + .direction = SOCKET_DIRECTION_NONE, + .local = { + .family = family, + .protocol = protocol, + }, + .remote = { + .family = family, + .protocol = protocol, + }, + .uid = UID_UNSET, + }; + + char *words[32]; + size_t num_words = quoted_strings_splitter(line, words, 32, is_space); + // char *sl_txt = get_word(words, num_words, 0); + char *local_ip_txt = get_word(words, num_words, 1); + char *local_port_txt = get_word(words, num_words, 2); + char *remote_ip_txt = get_word(words, num_words, 3); + char *remote_port_txt = get_word(words, num_words, 4); + char *state_txt = get_word(words, num_words, 5); + char *tx_queue_txt = get_word(words, num_words, 6); + char *rx_queue_txt = get_word(words, num_words, 7); + char *tr_txt = get_word(words, num_words, 8); + char *tm_when_txt = get_word(words, num_words, 9); + char *retrans_txt = get_word(words, num_words, 10); + char *uid_txt = get_word(words, num_words, 11); + // char *timeout_txt = get_word(words, num_words, 12); + char *inode_txt = get_word(words, num_words, 13); + + if(!local_ip_txt || !local_port_txt || !remote_ip_txt || !remote_port_txt || !state_txt || + !tx_queue_txt || !rx_queue_txt || !tr_txt || !tm_when_txt || !retrans_txt || !uid_txt || !inode_txt) { + local_sockets_log(ls, "cannot parse line No %zu of filename '%s'", counter, filename); + continue; + } + + n.local.port = str2uint32_hex(local_port_txt, NULL); + n.remote.port = str2uint32_hex(remote_port_txt, NULL); + n.state = str2uint32_hex(state_txt, NULL); + n.wqueue = str2uint32_hex(tx_queue_txt, NULL); + n.rqueue = str2uint32_hex(rx_queue_txt, NULL); + n.timer = str2uint32_hex(tr_txt, NULL); + n.expires = str2uint32_hex(tm_when_txt, NULL); + n.retransmits = str2uint32_hex(retrans_txt, NULL); + n.uid = str2uint32_t(uid_txt, NULL); + n.inode = str2uint64_t(inode_txt, NULL); + + if(family == AF_INET) { + n.local.ip.ipv4 = str2uint32_hex(local_ip_txt, NULL); + n.remote.ip.ipv4 = str2uint32_hex(remote_ip_txt, NULL); + } + else if(family == AF_INET6) { + ipv6_to_in6_addr(local_ip_txt, &n.local.ip.ipv6); + ipv6_to_in6_addr(remote_ip_txt, &n.remote.ip.ipv6); + } + + local_sockets_add_socket(ls, &n); + } + + fclose(fp); + + if (line) + free(line); // no freez() here because getline() may resize + 
+ return true; +} + +// -------------------------------------------------------------------------------------------------------------------- + +static inline void local_sockets_detect_directions(LS_STATE *ls) { + for(SIMPLE_HASHTABLE_SLOT_LOCAL_SOCKET *sl = simple_hashtable_first_read_only_LOCAL_SOCKET(&ls->sockets_hashtable); + sl ; + sl = simple_hashtable_next_read_only_LOCAL_SOCKET(&ls->sockets_hashtable, sl)) { + LOCAL_SOCKET *n = SIMPLE_HASHTABLE_SLOT_DATA(sl); + if (!n) continue; + + if ((n->direction & (SOCKET_DIRECTION_INBOUND|SOCKET_DIRECTION_OUTBOUND)) != + (SOCKET_DIRECTION_INBOUND|SOCKET_DIRECTION_OUTBOUND)) + continue; + + // check if the local port is one of our listening ports + { + SIMPLE_HASHTABLE_SLOT_LISTENING_PORT *sl_port = + simple_hashtable_get_slot_LISTENING_PORT(&ls->listening_ports_hashtable, n->local_port_hash, &n->local_port_key, false); + + struct local_port *port = SIMPLE_HASHTABLE_SLOT_DATA(sl_port); // do not dereference this pointer - we only check for existence + if(port) { + // the local port of this socket is a port we listen to + n->direction &= ~SOCKET_DIRECTION_OUTBOUND; + } + else + n->direction &= ~SOCKET_DIRECTION_INBOUND; + } + + // check if the remote IP is one of our local IPs + { + SIMPLE_HASHTABLE_SLOT_LOCAL_IP *sl_ip = + simple_hashtable_get_slot_LOCAL_IP(&ls->local_ips_hashtable, n->remote_ip_hash, &n->remote.ip, false); + + union ipv46 *d = SIMPLE_HASHTABLE_SLOT_DATA(sl_ip); + if (d) { + // the remote IP of this socket is one of our local IPs + if(n->direction & SOCKET_DIRECTION_INBOUND) { + n->direction &= ~SOCKET_DIRECTION_INBOUND; + n->direction |= SOCKET_DIRECTION_LOCAL_INBOUND; + } + else if(n->direction & SOCKET_DIRECTION_OUTBOUND) { + n->direction &= ~SOCKET_DIRECTION_OUTBOUND; + n->direction |= SOCKET_DIRECTION_LOCAL_OUTBOUND; + } + continue; + } + } + + if (local_sockets_is_loopback_address(&n->local) || + local_sockets_is_loopback_address(&n->remote)) { + // at least one of the two endpoints is a loopback address + if(n->direction & SOCKET_DIRECTION_INBOUND) { + n->direction &= ~SOCKET_DIRECTION_INBOUND; + n->direction |= SOCKET_DIRECTION_LOCAL_INBOUND; + } + else if(n->direction & SOCKET_DIRECTION_OUTBOUND) { + n->direction &= ~SOCKET_DIRECTION_OUTBOUND; + n->direction |= SOCKET_DIRECTION_LOCAL_OUTBOUND; + } + } + } +} + +// -------------------------------------------------------------------------------------------------------------------- + +static inline void local_sockets_init(LS_STATE *ls) { + simple_hashtable_init_NET_NS(&ls->ns_hashtable, 1024); + simple_hashtable_init_PID_SOCKET(&ls->pid_sockets_hashtable, 65535); + simple_hashtable_init_LOCAL_SOCKET(&ls->sockets_hashtable, 65535); + simple_hashtable_init_LOCAL_IP(&ls->local_ips_hashtable, 4096); + simple_hashtable_init_LISTENING_PORT(&ls->listening_ports_hashtable, 4096); + + ls->local_socket_aral = aral_create( + "local-sockets", + sizeof(LOCAL_SOCKET), + 65536, + 65536, + NULL, NULL, NULL, false, true); + + ls->pid_socket_aral = aral_create( + "pid-sockets", + sizeof(struct pid_socket), + 65536, + 65536, + NULL, NULL, NULL, false, true); +} + +static inline void local_sockets_cleanup(LS_STATE *ls) { + // free the sockets hashtable data + for(SIMPLE_HASHTABLE_SLOT_LOCAL_SOCKET *sl = simple_hashtable_first_read_only_LOCAL_SOCKET(&ls->sockets_hashtable); + sl; + sl = simple_hashtable_next_read_only_LOCAL_SOCKET(&ls->sockets_hashtable, sl)) { + LOCAL_SOCKET *n = SIMPLE_HASHTABLE_SLOT_DATA(sl); + if(!n) continue; + + string_freez(n->cmdline); + aral_freez(ls->local_socket_aral, n); + } + + // free the pid_socket 
hashtable data + for(SIMPLE_HASHTABLE_SLOT_PID_SOCKET *sl = simple_hashtable_first_read_only_PID_SOCKET(&ls->pid_sockets_hashtable); + sl; + sl = simple_hashtable_next_read_only_PID_SOCKET(&ls->pid_sockets_hashtable, sl)) { + struct pid_socket *ps = SIMPLE_HASHTABLE_SLOT_DATA(sl); + if(!ps) continue; + + freez(ps->cmdline); + aral_freez(ls->pid_socket_aral, ps); + } + + // free the hashtable + simple_hashtable_destroy_NET_NS(&ls->ns_hashtable); + simple_hashtable_destroy_PID_SOCKET(&ls->pid_sockets_hashtable); + simple_hashtable_destroy_LISTENING_PORT(&ls->listening_ports_hashtable); + simple_hashtable_destroy_LOCAL_IP(&ls->local_ips_hashtable); + simple_hashtable_destroy_LOCAL_SOCKET(&ls->sockets_hashtable); + + aral_destroy(ls->local_socket_aral); + aral_destroy(ls->pid_socket_aral); +} + +// -------------------------------------------------------------------------------------------------------------------- + +static inline void local_sockets_do_family_protocol(LS_STATE *ls, const char *filename, uint16_t family, uint16_t protocol) { +#ifdef HAVE_LIBMNL + if(ls->use_nl) { + ls->use_nl = local_sockets_netlink_get_sockets(ls, family, protocol); + + if(ls->use_nl) + return; + } +#endif + + local_sockets_read_proc_net_x(ls, filename, family, protocol); +} + +static inline void local_sockets_read_sockets_from_proc(LS_STATE *ls) { + char path[FILENAME_MAX + 1]; + + if(ls->config.namespaces) { + snprintfz(path, sizeof(path), "%s/proc/self/ns/net", ls->config.host_prefix); + local_sockets_read_proc_inode_link(ls, path, &ls->proc_self_net_ns_inode, "net"); + } + + if(ls->config.cmdline || ls->config.comm || ls->config.pid || ls->config.namespaces) { + snprintfz(path, sizeof(path), "%s/proc", ls->config.host_prefix); + local_sockets_find_all_sockets_in_proc(ls, path); + } + + if(ls->config.tcp4) { + snprintfz(path, sizeof(path), "%s/proc/net/tcp", ls->config.host_prefix); + local_sockets_do_family_protocol(ls, path, AF_INET, IPPROTO_TCP); + } + + if(ls->config.udp4) { + snprintfz(path, sizeof(path), "%s/proc/net/udp", ls->config.host_prefix); + local_sockets_do_family_protocol(ls, path, AF_INET, IPPROTO_UDP); + } + + if(ls->config.tcp6) { + snprintfz(path, sizeof(path), "%s/proc/net/tcp6", ls->config.host_prefix); + local_sockets_do_family_protocol(ls, path, AF_INET6, IPPROTO_TCP); + } + + if(ls->config.udp6) { + snprintfz(path, sizeof(path), "%s/proc/net/udp6", ls->config.host_prefix); + local_sockets_do_family_protocol(ls, path, AF_INET6, IPPROTO_UDP); + } +} + +// -------------------------------------------------------------------------------------------------------------------- + +struct local_sockets_child_work { + int fd; + uint64_t net_ns_inode; +}; + +static inline void local_sockets_send_to_parent(struct local_socket_state *ls __maybe_unused, struct local_socket *n, void *data) { + struct local_sockets_child_work *cw = data; + int fd = cw->fd; + + if(n->net_ns_inode != cw->net_ns_inode) + return; + + // local_sockets_log(ls, "child is sending inode %"PRIu64" of namespace %"PRIu64, n->inode, n->net_ns_inode); + + if(write(fd, n, sizeof(*n)) != sizeof(*n)) + local_sockets_log(ls, "failed to write local socket to pipe"); + + size_t len = n->cmdline ? 
string_strlen(n->cmdline) + 1 : 0; + if(write(fd, &len, sizeof(len)) != sizeof(len)) + local_sockets_log(ls, "failed to write cmdline length to pipe"); + + if(len) + if(write(fd, string2str(n->cmdline), len) != (ssize_t)len) + local_sockets_log(ls, "failed to write cmdline to pipe"); +} + +static inline bool local_sockets_get_namespace_sockets(LS_STATE *ls, struct pid_socket *ps, pid_t *pid) { + char filename[1024]; + snprintfz(filename, sizeof(filename), "%s/proc/%d/ns/net", ls->config.host_prefix, ps->pid); + + // verify the pid is in the target namespace + int fd = open(filename, O_RDONLY | O_CLOEXEC); + if (fd == -1) { + local_sockets_log(ls, "cannot open file '%s'", filename); + return false; + } + + struct stat statbuf; + if (fstat(fd, &statbuf) == -1) { + close(fd); + local_sockets_log(ls, "failed to get file statistics for '%s'", filename); + return false; + } + + if (statbuf.st_ino != ps->net_ns_inode) { + close(fd); + local_sockets_log(ls, "pid %d is not in the wanted network namespace", ps->pid); + return false; + } + + int pipefd[2]; + if (pipe(pipefd) != 0) { + local_sockets_log(ls, "cannot create pipe"); + close(fd); + return false; + } + + *pid = fork(); + if (*pid == 0) { + // Child process + close(pipefd[0]); + + // local_sockets_log(ls, "child is here for inode %"PRIu64" and namespace %"PRIu64, ps->inode, ps->net_ns_inode); + + struct local_sockets_child_work cw = { + .net_ns_inode = ps->net_ns_inode, + .fd = pipefd[1], + }; + + ls->config.host_prefix = ""; // we need the /proc of the container + ls->config.cb = local_sockets_send_to_parent; + ls->config.data = &cw; + ls->config.cmdline = false; // we have these already + ls->config.comm = false; // we have these already + ls->config.pid = false; // we have these already + ls->config.namespaces = false; + ls->proc_self_net_ns_inode = ps->net_ns_inode; + + + // switch namespace + if (setns(fd, CLONE_NEWNET) == -1) { + local_sockets_log(ls, "failed to switch network namespace at child process"); + exit(EXIT_FAILURE); + } + +#ifdef HAVE_LIBMNL + local_sockets_netlink_cleanup(ls); + local_sockets_netlink_init(ls); +#endif + + // read all sockets from /proc + local_sockets_read_sockets_from_proc(ls); + + // send all sockets to parent + local_sockets_foreach_local_socket_call_cb(ls); + + // send the terminating socket + struct local_socket zero = { + .net_ns_inode = ps->net_ns_inode, + }; + local_sockets_send_to_parent(ls, &zero, &cw); + +#ifdef HAVE_LIBMNL + local_sockets_netlink_cleanup(ls); +#endif + + close(pipefd[1]); // Close write end of pipe + exit(EXIT_SUCCESS); + } + // parent + + close(fd); + close(pipefd[1]); + + size_t received = 0; + struct local_socket buf; + while(read(pipefd[0], &buf, sizeof(buf)) == sizeof(buf)) { + size_t len = 0; + if(read(pipefd[0], &len, sizeof(len)) != sizeof(len)) + local_sockets_log(ls, "failed to read cmdline length from pipe"); + + if(len) { + char cmdline[len + 1]; + if(read(pipefd[0], cmdline, len) != (ssize_t)len) + local_sockets_log(ls, "failed to read cmdline from pipe"); + else { + cmdline[len] = '\0'; + buf.cmdline = string_strdupz(cmdline); + } + } + else + buf.cmdline = NULL; + + received++; + + struct local_socket zero = { + .net_ns_inode = ps->net_ns_inode, + }; + if(memcmp(&buf, &zero, sizeof(buf)) == 0) { + // the terminator + break; + } + + SIMPLE_HASHTABLE_SLOT_LOCAL_SOCKET *sl = simple_hashtable_get_slot_LOCAL_SOCKET(&ls->sockets_hashtable, buf.inode, &buf, true); + LOCAL_SOCKET *n = SIMPLE_HASHTABLE_SLOT_DATA(sl); + if(n) { + string_freez(buf.cmdline); + +// 
local_sockets_log(ls, +// "ns inode %" PRIu64" (comm: '%s', pid: %u, ns: %"PRIu64") already exists in hashtable (comm: '%s', pid: %u, ns: %"PRIu64") - ignoring duplicate", +// buf.inode, buf.comm, buf.pid, buf.net_ns_inode, n->comm, n->pid, n->net_ns_inode); + continue; + } + else { + n = aral_mallocz(ls->local_socket_aral); + memcpy(n, &buf, sizeof(*n)); + simple_hashtable_set_slot_LOCAL_SOCKET(&ls->sockets_hashtable, sl, n->inode, n); + + local_sockets_index_listening_port(ls, n); + } + } + + close(pipefd[0]); + + return received > 0; +} + +static inline void local_socket_waitpid(LS_STATE *ls, pid_t pid) { + if(!pid) return; + + int status; + waitpid(pid, &status, 0); + + if (WIFEXITED(status) && WEXITSTATUS(status) != 0) + local_sockets_log(ls, "Child exited with status %d", WEXITSTATUS(status)); + else if (WIFSIGNALED(status)) + local_sockets_log(ls, "Child terminated by signal %d", WTERMSIG(status)); +} + +static inline void local_sockets_namespaces(LS_STATE *ls) { + pid_t children[5] = { 0 }; + size_t last_child = 0; + + for(SIMPLE_HASHTABLE_SLOT_NET_NS *sl = simple_hashtable_first_read_only_NET_NS(&ls->ns_hashtable); + sl; + sl = simple_hashtable_next_read_only_NET_NS(&ls->ns_hashtable, sl)) { + uint64_t inode = (uint64_t)SIMPLE_HASHTABLE_SLOT_DATA(sl); + + if(inode == ls->proc_self_net_ns_inode) + continue; + + // find a pid_socket that has this namespace + for(SIMPLE_HASHTABLE_SLOT_PID_SOCKET *sl_pid = simple_hashtable_first_read_only_PID_SOCKET(&ls->pid_sockets_hashtable) ; + sl_pid ; + sl_pid = simple_hashtable_next_read_only_PID_SOCKET(&ls->pid_sockets_hashtable, sl_pid)) { + struct pid_socket *ps = SIMPLE_HASHTABLE_SLOT_DATA(sl_pid); + if(!ps || ps->net_ns_inode != inode) continue; + + if(++last_child >= 5) + last_child = 0; + + local_socket_waitpid(ls, children[last_child]); + children[last_child] = 0; + + // now we have a pid that has the same namespace inode + if(local_sockets_get_namespace_sockets(ls, ps, &children[last_child])) + break; + } + } + + for(size_t i = 0; i < 5 ;i++) + local_socket_waitpid(ls, children[i]); +} + +// -------------------------------------------------------------------------------------------------------------------- + +static inline void local_sockets_process(LS_STATE *ls) { + +#ifdef HAVE_LIBMNL + local_sockets_netlink_init(ls); +#endif + + ls->config.host_prefix = netdata_configured_host_prefix; + + // initialize our hashtables + local_sockets_init(ls); + + // read all sockets from /proc + local_sockets_read_sockets_from_proc(ls); + + // check all socket namespaces + if(ls->config.namespaces) + local_sockets_namespaces(ls); + + // detect the directions of the sockets + if(ls->config.inbound || ls->config.outbound || ls->config.local) + local_sockets_detect_directions(ls); + + // call the callback for each socket + local_sockets_foreach_local_socket_call_cb(ls); + + // free all memory + local_sockets_cleanup(ls); + +#ifdef HAVE_LIBMNL + local_sockets_netlink_cleanup(ls); +#endif +} + +static inline void ipv6_address_to_txt(struct in6_addr *in6_addr, char *dst) { + struct sockaddr_in6 sa = { 0 }; + + sa.sin6_family = AF_INET6; + sa.sin6_port = htons(0); + sa.sin6_addr = *in6_addr; + + // Convert to human-readable format + if (inet_ntop(AF_INET6, &(sa.sin6_addr), dst, INET6_ADDRSTRLEN) == NULL) + *dst = '\0'; +} + +static inline void ipv4_address_to_txt(uint32_t ip, char *dst) { + uint8_t octets[4]; + octets[0] = ip & 0xFF; + octets[1] = (ip >> 8) & 0xFF; + octets[2] = (ip >> 16) & 0xFF; + octets[3] = (ip >> 24) & 0xFF; + sprintf(dst, 
"%u.%u.%u.%u", octets[0], octets[1], octets[2], octets[3]); +} + +#endif //NETDATA_LOCAL_SOCKETS_H diff --git a/src/libnetdata/maps/system-groups.h b/src/libnetdata/maps/system-groups.h new file mode 100644 index 00000000..fd042cd4 --- /dev/null +++ b/src/libnetdata/maps/system-groups.h @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_SYSTEM_GROUPS_H +#define NETDATA_SYSTEM_GROUPS_H + +#include "libnetdata/libnetdata.h" + +// -------------------------------------------------------------------------------------------------------------------- +// hashtable for caching uid to username mappings +// key is the uid, value is username (STRING) + +#define SIMPLE_HASHTABLE_VALUE_TYPE STRING +#define SIMPLE_HASHTABLE_NAME _GROUPNAMES_CACHE +#include "libnetdata/simple_hashtable.h" + +typedef struct groupnames_cache { + SPINLOCK spinlock; + SIMPLE_HASHTABLE_GROUPNAMES_CACHE ht; +} GROUPNAMES_CACHE; + +static inline STRING *system_groupnames_cache_lookup_gid(GROUPNAMES_CACHE *gc, gid_t gid) { + spinlock_lock(&gc->spinlock); + + SIMPLE_HASHTABLE_SLOT_GROUPNAMES_CACHE *sl = simple_hashtable_get_slot_GROUPNAMES_CACHE(&gc->ht, gid, &gid, true); + STRING *g = SIMPLE_HASHTABLE_SLOT_DATA(sl); + if(!g) { + char tmp[1024 + 1]; + struct group grp, *result = NULL; + + if (getgrgid_r(gid, &grp, tmp, sizeof(tmp), &result) != 0 || !result || !grp.gr_name || !(*grp.gr_name)) { + char name[50]; + snprintfz(name, sizeof(name), "%u", gid); + g = string_strdupz(name); + } + else + g = string_strdupz(grp.gr_name); + + simple_hashtable_set_slot_GROUPNAMES_CACHE(&gc->ht, sl, gid, g); + } + + g = string_dup(g); + spinlock_unlock(&gc->spinlock); + return g; +} + +static inline GROUPNAMES_CACHE *system_groupnames_cache_init(void) { + GROUPNAMES_CACHE *gc = callocz(1, sizeof(*gc)); + spinlock_init(&gc->spinlock); + simple_hashtable_init_GROUPNAMES_CACHE(&gc->ht, 100); + return gc; +} + +static inline void system_groupnames_cache_destroy(GROUPNAMES_CACHE *gc) { + spinlock_lock(&gc->spinlock); + + for(SIMPLE_HASHTABLE_SLOT_GROUPNAMES_CACHE *sl = simple_hashtable_first_read_only_GROUPNAMES_CACHE(&gc->ht); + sl; + sl = simple_hashtable_next_read_only_GROUPNAMES_CACHE(&gc->ht, sl)) { + STRING *u = SIMPLE_HASHTABLE_SLOT_DATA(sl); + string_freez(u); + } + + simple_hashtable_destroy_GROUPNAMES_CACHE(&gc->ht); + freez(gc); +} + +#endif //NETDATA_SYSTEM_GROUPS_H diff --git a/src/libnetdata/maps/system-users.h b/src/libnetdata/maps/system-users.h new file mode 100644 index 00000000..5f7dfae1 --- /dev/null +++ b/src/libnetdata/maps/system-users.h @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_SYSTEM_USERS_H +#define NETDATA_SYSTEM_USERS_H + +#include "libnetdata/libnetdata.h" + +// -------------------------------------------------------------------------------------------------------------------- +// hashtable for caching uid to username mappings +// key is the uid, value is username (STRING) + +#define SIMPLE_HASHTABLE_VALUE_TYPE STRING +#define SIMPLE_HASHTABLE_NAME _USERNAMES_CACHE +#include "libnetdata/simple_hashtable.h" + +typedef struct usernames_cache { + SPINLOCK spinlock; + SIMPLE_HASHTABLE_USERNAMES_CACHE ht; +} USERNAMES_CACHE; + +static inline STRING *system_usernames_cache_lookup_uid(USERNAMES_CACHE *uc, uid_t uid) { + spinlock_lock(&uc->spinlock); + + SIMPLE_HASHTABLE_SLOT_USERNAMES_CACHE *sl = simple_hashtable_get_slot_USERNAMES_CACHE(&uc->ht, uid, &uid, true); + STRING *u = SIMPLE_HASHTABLE_SLOT_DATA(sl); + if(!u) { + char tmp[1024 + 1]; + struct passwd 
pw, *result = NULL; + + if (getpwuid_r(uid, &pw, tmp, sizeof(tmp), &result) != 0 || !result || !pw.pw_name || !(*pw.pw_name)) { + char name[50]; + snprintfz(name, sizeof(name), "%u", uid); + u = string_strdupz(name); + } + else + u = string_strdupz(pw.pw_name); + + simple_hashtable_set_slot_USERNAMES_CACHE(&uc->ht, sl, uid, u); + } + + u = string_dup(u); + spinlock_unlock(&uc->spinlock); + return u; +} + +static inline USERNAMES_CACHE *system_usernames_cache_init(void) { + USERNAMES_CACHE *uc = callocz(1, sizeof(*uc)); + spinlock_init(&uc->spinlock); + simple_hashtable_init_USERNAMES_CACHE(&uc->ht, 100); + return uc; +} + +static inline void system_usernames_cache_destroy(USERNAMES_CACHE *uc) { + spinlock_lock(&uc->spinlock); + + for(SIMPLE_HASHTABLE_SLOT_USERNAMES_CACHE *sl = simple_hashtable_first_read_only_USERNAMES_CACHE(&uc->ht); + sl; + sl = simple_hashtable_next_read_only_USERNAMES_CACHE(&uc->ht, sl)) { + STRING *u = SIMPLE_HASHTABLE_SLOT_DATA(sl); + string_freez(u); + } + + simple_hashtable_destroy_USERNAMES_CACHE(&uc->ht); + freez(uc); +} + +#endif //NETDATA_SYSTEM_USERS_H diff --git a/src/libnetdata/onewayalloc/README.md new file mode 100644 index 00000000..082085db --- /dev/null +++ b/src/libnetdata/onewayalloc/README.md @@ -0,0 +1,75 @@ +<!-- +title: "One Way Allocator" +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/libnetdata/onewayalloc/README.md" +sidebar_label: "One way allocator" +learn_status: "Published" +learn_topic_type: "Tasks" +learn_rel_path: "Developers/libnetdata" +--> + +# One Way Allocator + +This is a very fast, single-threaded-only memory allocator that minimizes system calls +when a lot of memory allocations need to be made to perform a task, all of which +can be freed together when the task finishes. + +It has been designed to be used for netdata context queries. + +For netdata to perform a context query, it builds a virtual chart, a chart that contains +all the dimensions of the charts having the same context. This process requires allocating +several structures for each of the dimensions to attach them to the virtual chart. All +this data can be freed immediately after the query finishes. + +## How it works + +1. The caller calls `ONEWAYALLOC *owa = onewayalloc_create(sizehint)` to create an OWA. + Internally this allocates the first memory buffer with size >= `sizehint`. + If `sizehint` is zero, it will allocate 1 hardware page (usually 4kb). + No need to check for success or failure. As with `mallocz()` in netdata, a `fatal()` + will be called if the allocation fails - although this will never fail, since Linux + does not really check if there is memory available for `mmap()` calls. + +2. The caller can then perform any number of the following calls to acquire memory: + - `onewayalloc_mallocz(owa, size)`, similar to `mallocz()` + - `onewayalloc_callocz(owa, nmemb, size)`, similar to `callocz()` + - `onewayalloc_strdupz(owa, string)`, similar to `strdupz()` + - `onewayalloc_memdupz(owa, ptr, size)`, similar to `mallocz()` and then `memcpy()` + +3. Once the caller has done all the work with the allocated buffers, all memory allocated + can be freed with `onewayalloc_destroy(owa)`. + +## How much faster is it? + +On modern hardware, for any single query the performance improvement is marginal and not +noticeable at all. + +We performed the following tests using the same huge context query (1000 charts, +100 dimensions each = 100k dimensions): + +1. 
using `mallocz()`, 1 caller, 256 queries (sequential) +2. using `mallocz()`, 256 callers, 1 query each (parallel) +3. using `OWA`, 1 caller, 256 queries (sequential) +4. using `OWA`, 256 callers, 1 query each (parallel) + +Netdata was configured to use 24 web threads on the 24-core server we used. + +The results are as follows: + +### sequential test + +branch|transactions|time to complete|transaction rate|average response time|min response time|max response time +:---:|:---:|:---:|:---:|:---:|:---:|:---:| +`malloc()`|256|322.35s|0.79/sec|1.26s|1.01s|1.87s +`OWA`|256|310.19s|0.83/sec|1.21s|1.04s|1.63s + +For a single query, the improvement is just marginal and not noticeable at all. + +### parallel test + +branch|transactions|time to complete|transaction rate|average response time|min response time|max response time +:---:|:---:|:---:|:---:|:---:|:---:|:---:| +`malloc()`|256|84.72s|3.02/sec|68.43s|50.20s|84.71s +`OWA`|256|39.35s|6.51/sec|34.48s|20.55s|39.34s + +For parallel workloads, like the one executed by netdata.cloud, `OWA` provides a 54% overall speed improvement (more than double the overall +user-experienced speed, including the data query itself). diff --git a/src/libnetdata/onewayalloc/onewayalloc.c new file mode 100644 index 00000000..98ab1835 --- /dev/null +++ b/src/libnetdata/onewayalloc/onewayalloc.c @@ -0,0 +1,213 @@ +#include "onewayalloc.h" + +// https://www.gnu.org/software/libc/manual/html_node/Aligned-Memory-Blocks.html +#define OWA_NATURAL_ALIGNMENT (sizeof(uintptr_t) * 2) + +typedef struct owa_page { + size_t stats_pages; + size_t stats_pages_size; + size_t stats_mallocs_made; + size_t stats_mallocs_size; + size_t size; // the total size of the page + size_t offset; // the first free byte of the page + struct owa_page *next; // the next page on the list + struct owa_page *last; // the last page on the list - we currently allocate on this +} OWA_PAGE; + +static size_t onewayalloc_total_memory = 0; + +size_t onewayalloc_allocated_memory(void) { + return __atomic_load_n(&onewayalloc_total_memory, __ATOMIC_RELAXED); +} + +// allocations need to be aligned to CPU register width +// https://en.wikipedia.org/wiki/Data_structure_alignment +static inline size_t natural_alignment(size_t size) { + if(unlikely(size % OWA_NATURAL_ALIGNMENT)) + size = size + OWA_NATURAL_ALIGNMENT - (size % OWA_NATURAL_ALIGNMENT); + + return size; +} + +// Create an OWA +// Once it is created, the caller may call onewayalloc_mallocz() +// any number of times, for any amount of memory. 
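+// +// A minimal usage sketch (illustrative only - allocation failures are handled inside the allocator via fatal()): +// ONEWAYALLOC *owa = onewayalloc_create(0); // 0 = start with one hardware page +// char *name = onewayalloc_strdupz(owa, "cpu"); // allocate as needed... +// uint32_t *ids = onewayalloc_callocz(owa, 100, sizeof(uint32_t)); +// // ... use the allocations for the duration of the task ... +// onewayalloc_destroy(owa); // ...and free everything at once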
+ +static OWA_PAGE *onewayalloc_create_internal(OWA_PAGE *head, size_t size_hint) { + static size_t OWA_NATURAL_PAGE_SIZE = 0; + + if(unlikely(!OWA_NATURAL_PAGE_SIZE)) { + long int page_size = sysconf(_SC_PAGE_SIZE); + if (unlikely(page_size == -1)) + OWA_NATURAL_PAGE_SIZE = 4096; + else + OWA_NATURAL_PAGE_SIZE = page_size; + } + + // our default page size + size_t size = OWA_NATURAL_PAGE_SIZE; + + // make sure the new page will fit both the requested size + // and the OWA_PAGE structure at its beginning + size_hint += natural_alignment(sizeof(OWA_PAGE)); + + // prefer the user size if it is bigger than our size + if(size_hint > size) size = size_hint; + + // try to allocate half of the total we have allocated already + if(likely(head)) { + size_t optimal_size = head->stats_pages_size / 2; + if(optimal_size > size) size = optimal_size; + } + + // Make sure our allocations are always a multiple of the hardware page size + if(size % OWA_NATURAL_PAGE_SIZE) size = size + OWA_NATURAL_PAGE_SIZE - (size % OWA_NATURAL_PAGE_SIZE); + + // OWA_PAGE *page = (OWA_PAGE *)netdata_mmap(NULL, size, MAP_ANONYMOUS|MAP_PRIVATE, 0); + // if(unlikely(!page)) fatal("Cannot allocate onewayalloc buffer of size %zu", size); + OWA_PAGE *page = (OWA_PAGE *)mallocz(size); + __atomic_add_fetch(&onewayalloc_total_memory, size, __ATOMIC_RELAXED); + + page->size = size; + page->offset = natural_alignment(sizeof(OWA_PAGE)); + page->next = page->last = NULL; + + if(unlikely(!head)) { + // this is the first time we are called + head = page; + head->stats_pages = 0; + head->stats_pages_size = 0; + head->stats_mallocs_made = 0; + head->stats_mallocs_size = 0; + } + else { + // link this page into our existing linked list + head->last->next = page; + } + + head->last = page; + head->stats_pages++; + head->stats_pages_size += size; + + return page; +} + +ONEWAYALLOC *onewayalloc_create(size_t size_hint) { + return (ONEWAYALLOC *)onewayalloc_create_internal(NULL, size_hint); +} + +void *onewayalloc_mallocz(ONEWAYALLOC *owa, size_t size) { +#ifdef FSANITIZE_ADDRESS + return mallocz(size); +#endif + + OWA_PAGE *head = (OWA_PAGE *)owa; + OWA_PAGE *page = head->last; + + // update stats + head->stats_mallocs_made++; + head->stats_mallocs_size += size; + + // make sure the size is aligned + size = natural_alignment(size); + + if(unlikely(page->size - page->offset < size)) { + // we don't have enough space to fit the data + // let's get another page + page = onewayalloc_create_internal(head, (size > page->size)?size:page->size); + } + + char *mem = (char *)page; + mem = &mem[page->offset]; + page->offset += size; + + return (void *)mem; +} + +void *onewayalloc_callocz(ONEWAYALLOC *owa, size_t nmemb, size_t size) { + size_t total = nmemb * size; + void *mem = onewayalloc_mallocz(owa, total); + memset(mem, 0, total); + return mem; +} + +char *onewayalloc_strdupz(ONEWAYALLOC *owa, const char *s) { + size_t size = strlen(s) + 1; + char *d = onewayalloc_mallocz((OWA_PAGE *)owa, size); + memcpy(d, s, size); + return d; +} + +void *onewayalloc_memdupz(ONEWAYALLOC *owa, const void *src, size_t size) { + void *mem = onewayalloc_mallocz((OWA_PAGE *)owa, size); + // memcpy() is way faster than strcpy() since it does not check for '\0' + memcpy(mem, src, size); + return mem; +} + +void onewayalloc_freez(ONEWAYALLOC *owa __maybe_unused, const void *ptr __maybe_unused) { +#ifdef FSANITIZE_ADDRESS + freez((void *)ptr); + return; +#endif + +#ifdef NETDATA_INTERNAL_CHECKS + // allow the caller to call us for a mallocz() allocation + // so try to find 
it in our memory and if it is not there + // log an error + + if (unlikely(!ptr)) + return; + + OWA_PAGE *head = (OWA_PAGE *)owa; + OWA_PAGE *page; + uintptr_t seeking = (uintptr_t)ptr; + + for(page = head; page ;page = page->next) { + uintptr_t start = (uintptr_t)page; + uintptr_t end = start + page->size; + + if(seeking >= start && seeking <= end) { + // found it - it is ours + // just return to let the caller think we actually did something + return; + } + } + + // not found - it is not ours + // let's free it with the system allocator + netdata_log_error("ONEWAYALLOC: request to free address 0x%p that is not allocated by this OWA", ptr); +#endif +} + +void *onewayalloc_doublesize(ONEWAYALLOC *owa, const void *src, size_t oldsize) { + size_t newsize = oldsize * 2; + void *dst = onewayalloc_mallocz(owa, newsize); + memcpy(dst, src, oldsize); + onewayalloc_freez(owa, src); + return dst; +} + +void onewayalloc_destroy(ONEWAYALLOC *owa) { + if(!owa) return; + + OWA_PAGE *head = (OWA_PAGE *)owa; + + //netdata_log_info("OWA: %zu allocations of %zu total bytes, in %zu pages of %zu total bytes", + // head->stats_mallocs_made, head->stats_mallocs_size, + // head->stats_pages, head->stats_pages_size); + + size_t total_size = 0; + OWA_PAGE *page = head; + while(page) { + total_size += page->size; + + OWA_PAGE *p = page; + page = page->next; + + // munmap(p, p->size); + freez(p); + } + + __atomic_sub_fetch(&onewayalloc_total_memory, total_size, __ATOMIC_RELAXED); +} diff --git a/src/libnetdata/onewayalloc/onewayalloc.h b/src/libnetdata/onewayalloc/onewayalloc.h new file mode 100644 index 00000000..a415b063 --- /dev/null +++ b/src/libnetdata/onewayalloc/onewayalloc.h @@ -0,0 +1,21 @@ +#ifndef ONEWAYALLOC_H +#define ONEWAYALLOC_H 1 + +#include "../libnetdata.h" + +typedef void ONEWAYALLOC; + +ONEWAYALLOC *onewayalloc_create(size_t size_hint); +void onewayalloc_destroy(ONEWAYALLOC *owa); + +void *onewayalloc_mallocz(ONEWAYALLOC *owa, size_t size); +void *onewayalloc_callocz(ONEWAYALLOC *owa, size_t nmemb, size_t size); +char *onewayalloc_strdupz(ONEWAYALLOC *owa, const char *s); +void *onewayalloc_memdupz(ONEWAYALLOC *owa, const void *src, size_t size); +void onewayalloc_freez(ONEWAYALLOC *owa, const void *ptr); + +void *onewayalloc_doublesize(ONEWAYALLOC *owa, const void *src, size_t oldsize); + +size_t onewayalloc_allocated_memory(void); + +#endif // ONEWAYALLOC_H diff --git a/src/libnetdata/os/adjtimex.c b/src/libnetdata/os/adjtimex.c new file mode 100644 index 00000000..b7a91009 --- /dev/null +++ b/src/libnetdata/os/adjtimex.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +int os_adjtimex(struct timex *buf __maybe_unused) { +#if defined(OS_MACOS) || defined(OS_FREEBSD) + return ntp_adjtime(buf); +#endif + +#if defined(OS_LINUX) + return adjtimex(buf); +#endif + + errno = ENOSYS; + return -1; +} diff --git a/src/libnetdata/os/adjtimex.h b/src/libnetdata/os/adjtimex.h new file mode 100644 index 00000000..d37ebf88 --- /dev/null +++ b/src/libnetdata/os/adjtimex.h @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_ADJTIMEX_H +#define NETDATA_ADJTIMEX_H + +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_MACOS) +#include <sys/timex.h> +#endif + +struct timex; +int os_adjtimex(struct timex *buf); + +#endif //NETDATA_ADJTIMEX_H diff --git a/src/libnetdata/os/byteorder.h b/src/libnetdata/os/byteorder.h new file mode 100644 index 00000000..28fc9e66 --- /dev/null +++ b/src/libnetdata/os/byteorder.h @@ -0,0 +1,32 @@ 
+#ifndef LIBNETDATA_BYTE_ORDER_H +#define LIBNETDATA_BYTE_ORDER_H + +/** compatibility header for endian.h + * This is a simple compatibility shim to convert + * BSD/Linux endian macros to the Mac OS X equivalents. + * It is public domain. + * */ + +#ifndef __APPLE__ +#error "This header file (byteorder.h, an endian.h shim) is macOS specific.\n" +#endif /* __APPLE__ */ + + +#include <libkern/OSByteOrder.h> + +#define htobe16(x) OSSwapHostToBigInt16(x) +#define htole16(x) OSSwapHostToLittleInt16(x) +#define be16toh(x) OSSwapBigToHostInt16(x) +#define le16toh(x) OSSwapLittleToHostInt16(x) + +#define htobe32(x) OSSwapHostToBigInt32(x) +#define htole32(x) OSSwapHostToLittleInt32(x) +#define be32toh(x) OSSwapBigToHostInt32(x) +#define le32toh(x) OSSwapLittleToHostInt32(x) + +#define htobe64(x) OSSwapHostToBigInt64(x) +#define htole64(x) OSSwapHostToLittleInt64(x) +#define be64toh(x) OSSwapBigToHostInt64(x) +#define le64toh(x) OSSwapLittleToHostInt64(x) + +#endif /* LIBNETDATA_BYTE_ORDER_H */ diff --git a/src/libnetdata/os/get_pid_max.c new file mode 100644 index 00000000..45027961 --- /dev/null +++ b/src/libnetdata/os/get_pid_max.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +pid_t pid_max = 32768; +pid_t os_get_system_pid_max(void) { +#if defined(OS_MACOS) + + // As we currently do not know a way to query pid_max from the OS, + // we use the number defined in bsd/sys/proc_internal.h in the XNU sources + pid_max = 99999; + return pid_max; + +#elif defined(OS_FREEBSD) + + int32_t tmp_pid_max; + + if (unlikely(GETSYSCTL_BY_NAME("kern.pid_max", tmp_pid_max))) { + pid_max = 99999; + netdata_log_error("Assuming system's maximum pid is %d.", pid_max); + } else { + pid_max = tmp_pid_max; + } + + return pid_max; + +#elif defined(OS_LINUX) + + static char read = 0; + if(unlikely(read)) return pid_max; + read = 1; + + char filename[FILENAME_MAX + 1]; + snprintfz(filename, FILENAME_MAX, "%s/proc/sys/kernel/pid_max", netdata_configured_host_prefix?netdata_configured_host_prefix:""); + + unsigned long long max = 0; + if(read_single_number_file(filename, &max) != 0) { + netdata_log_error("Cannot open file '%s'. Assuming system supports %d pids.", filename, pid_max); + return pid_max; + } + + if(!max) { + netdata_log_error("Cannot parse file '%s'. Assuming system supports %d pids.", filename, pid_max); + return pid_max; + } + + pid_max = (pid_t) max; + return pid_max; + +#else + + // just a big default + + pid_max = 4194304; + return pid_max; + +#endif +} diff --git a/src/libnetdata/os/get_pid_max.h new file mode 100644 index 00000000..a6da0a4d --- /dev/null +++ b/src/libnetdata/os/get_pid_max.h @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_GET_PID_MAX_H +#define NETDATA_GET_PID_MAX_H + +#include <unistd.h> + +extern pid_t pid_max; +pid_t os_get_system_pid_max(void); + +#endif //NETDATA_GET_PID_MAX_H diff --git a/src/libnetdata/os/get_system_cpus.c new file mode 100644 index 00000000..5a76d8aa --- /dev/null +++ b/src/libnetdata/os/get_system_cpus.c @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +#if defined(OS_WINDOWS) +#include <windows.h> +#endif + +#define CPUS_FOR_COLLECTORS 0 +#define CPUS_FOR_NETDATA 1 + +long os_get_system_cpus_cached(bool cache, bool for_netdata) { + static long processors[2] = { 0, 0 }; + + int index = for_netdata ? 
CPUS_FOR_NETDATA : CPUS_FOR_COLLECTORS; + + if(likely(cache && processors[index] > 0)) + return processors[index]; + +#if defined(OS_FREEBSD) || defined(OS_MACOS) +#if defined(OS_MACOS) +#define HW_CPU_NAME "hw.logicalcpu" +#else +#define HW_CPU_NAME "hw.ncpu" +#endif + + int32_t tmp_processors; + bool error = false; + + if (unlikely(GETSYSCTL_BY_NAME(HW_CPU_NAME, tmp_processors))) + error = true; + else + processors[index] = tmp_processors; + + if(processors[index] < 1) { + processors[index] = 1; + + if(error) + netdata_log_error("Assuming system has %ld processors.", processors[index]); + } + + return processors[index]; +#elif defined(OS_LINUX) + + char filename[FILENAME_MAX + 1]; + snprintfz(filename, FILENAME_MAX, "%s/proc/stat", + (!for_netdata && netdata_configured_host_prefix) ? netdata_configured_host_prefix : ""); + + procfile *ff = procfile_open(filename, NULL, PROCFILE_FLAG_DEFAULT); + if(!ff) { + processors[index] = 1; + netdata_log_error("Cannot open file '%s'. Assuming system has %ld processors.", filename, processors[index]); + return processors[index]; + } + + ff = procfile_readall(ff); + if(!ff) { + processors[index] = 1; + netdata_log_error("Cannot read file '%s'. Assuming system has %ld processors.", filename, processors[index]); + return processors[index]; + } + + long tmp_processors = 0; + unsigned int i; + for(i = 0; i < procfile_lines(ff); i++) { + if(!procfile_linewords(ff, i)) continue; + + if(strncmp(procfile_lineword(ff, i, 0), "cpu", 3) == 0) + tmp_processors++; + } + procfile_close(ff); + + processors[index] = --tmp_processors; // exclude the first "cpu" line, which is the aggregate of all cores + + if(processors[index] < 1) + processors[index] = 1; + + netdata_log_debug(D_SYSTEM, "System has %ld processors.", processors[index]); + return processors[index]; + +#elif defined(OS_WINDOWS) + + SYSTEM_INFO sysInfo; + GetSystemInfo(&sysInfo); + processors[index] = (long) sysInfo.dwNumberOfProcessors; + return processors[index]; + +#else + + processors[index] = 1; + return processors[index]; + +#endif +} diff --git a/src/libnetdata/os/get_system_cpus.h new file mode 100644 index 00000000..3c608df8 --- /dev/null +++ b/src/libnetdata/os/get_system_cpus.h @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_GET_SYSTEM_CPUS_H +#define NETDATA_GET_SYSTEM_CPUS_H + +#include "../libnetdata.h" + +long os_get_system_cpus_cached(bool cache, bool for_netdata); + +#endif //NETDATA_GET_SYSTEM_CPUS_H diff --git a/src/libnetdata/os/getgrouplist.c new file mode 100644 index 00000000..7f32faf3 --- /dev/null +++ b/src/libnetdata/os/getgrouplist.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +int os_getgrouplist(const char *username __maybe_unused, gid_t gid __maybe_unused, gid_t *supplementary_groups __maybe_unused, int *ngroups __maybe_unused) { +#if defined(OS_LINUX) || defined(OS_FREEBSD) + return getgrouplist(username, gid, supplementary_groups, ngroups); +#endif + +#if defined(OS_MACOS) + return getgrouplist(username, gid, (int *)supplementary_groups, ngroups); +#endif + + errno = ENOSYS; + return -1; +} diff --git a/src/libnetdata/os/getgrouplist.h new file mode 100644 index 00000000..33646255 --- /dev/null +++ b/src/libnetdata/os/getgrouplist.h @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_GETGROUPLIST_H +#define NETDATA_GETGROUPLIST_H + +#include <unistd.h> +int os_getgrouplist(const char *username, gid_t gid, gid_t *supplementary_groups, int *ngroups); + +#endif 
//NETDATA_GETGROUPLIST_H diff --git a/src/libnetdata/os/gettid.c b/src/libnetdata/os/gettid.c new file mode 100644 index 00000000..273c428f --- /dev/null +++ b/src/libnetdata/os/gettid.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +#if defined(OS_WINDOWS) +#include <windows.h> +#endif + +pid_t os_gettid(void) { +#if defined(HAVE_GETTID) + return gettid(); +#elif defined(HAVE_PTHREAD_GETTHREADID_NP) + return (pid_t)pthread_getthreadid_np(); +#elif defined(HAVE_PTHREAD_THREADID_NP) + uint64_t curthreadid; + pthread_threadid_np(NULL, &curthreadid); + return curthreadid; +#elif defined(OS_WINDOWS) + return (pid_t)GetCurrentThreadId(); +#elif defined(OS_LINUX) + return (pid_t)syscall(SYS_gettid); +#else + return (pid_t)pthread_self(); +#endif +} + +static __thread pid_t gettid_cached_tid = 0; +pid_t gettid_cached(void) { + if(unlikely(gettid_cached_tid == 0)) + gettid_cached_tid = os_gettid(); + + return gettid_cached_tid; +}
\ No newline at end of file diff --git a/src/libnetdata/os/gettid.h b/src/libnetdata/os/gettid.h new file mode 100644 index 00000000..f04d9c36 --- /dev/null +++ b/src/libnetdata/os/gettid.h @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_GETTID_H +#define NETDATA_GETTID_H + +#include <unistd.h> + +pid_t os_gettid(void); +pid_t gettid_cached(void); + +#endif //NETDATA_GETTID_H diff --git a/src/libnetdata/os/os-freebsd-wrappers.c b/src/libnetdata/os/os-freebsd-wrappers.c new file mode 100644 index 00000000..c3d1bda0 --- /dev/null +++ b/src/libnetdata/os/os-freebsd-wrappers.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +#if defined(OS_FREEBSD) + +int getsysctl_by_name(const char *name, void *ptr, size_t len) { + size_t nlen = len; + + if (unlikely(sysctlbyname(name, ptr, &nlen, NULL, 0) == -1)) { + netdata_log_error("FREEBSD: sysctl(%s...) failed: %s", name, strerror(errno)); + return 1; + } + if (unlikely(nlen != len)) { + netdata_log_error("FREEBSD: sysctl(%s...) expected %lu, got %lu", name, (unsigned long)len, (unsigned long)nlen); + return 1; + } + return 0; +} + +int getsysctl_simple(const char *name, int *mib, size_t miblen, void *ptr, size_t len) { + size_t nlen = len; + + if (unlikely(!mib[0])) + if (unlikely(getsysctl_mib(name, mib, miblen))) + return 1; + + if (unlikely(sysctl(mib, miblen, ptr, &nlen, NULL, 0) == -1)) { + netdata_log_error("FREEBSD: sysctl(%s...) failed: %s", name, strerror(errno)); + return 1; + } + if (unlikely(nlen != len)) { + netdata_log_error("FREEBSD: sysctl(%s...) expected %lu, got %lu", name, (unsigned long)len, (unsigned long)nlen); + return 1; + } + + return 0; +} + +int getsysctl(const char *name, int *mib, size_t miblen, void *ptr, size_t *len) { + size_t nlen = *len; + + if (unlikely(!mib[0])) + if (unlikely(getsysctl_mib(name, mib, miblen))) + return 1; + + if (unlikely(sysctl(mib, miblen, ptr, len, NULL, 0) == -1)) { + netdata_log_error("FREEBSD: sysctl(%s...) failed: %s", name, strerror(errno)); + return 1; + } + if (unlikely(ptr != NULL && nlen != *len)) { + netdata_log_error("FREEBSD: sysctl(%s...) expected %lu, got %lu", name, (unsigned long)*len, (unsigned long)nlen); + return 1; + } + + return 0; +} + +int getsysctl_mib(const char *name, int *mib, size_t len) { + size_t nlen = len; + + if (unlikely(sysctlnametomib(name, mib, &nlen) == -1)) { + netdata_log_error("FREEBSD: sysctl(%s...) failed: %s", name, strerror(errno)); + return 1; + } + if (unlikely(nlen != len)) { + netdata_log_error("FREEBSD: sysctl(%s...) 
expected %lu, got %lu", name, (unsigned long)len, (unsigned long)nlen); + return 1; + } + return 0; +} + +#endif diff --git a/src/libnetdata/os/os-freebsd-wrappers.h b/src/libnetdata/os/os-freebsd-wrappers.h new file mode 100644 index 00000000..6f54a2e9 --- /dev/null +++ b/src/libnetdata/os/os-freebsd-wrappers.h @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_OS_FREEBSD_WRAPPERS_H +#define NETDATA_OS_FREEBSD_WRAPPERS_H + +#include "../libnetdata.h" + +#if defined(OS_FREEBSD) +#include <sys/sysctl.h> + +#define GETSYSCTL_BY_NAME(name, var) getsysctl_by_name(name, &(var), sizeof(var)) +int getsysctl_by_name(const char *name, void *ptr, size_t len); + +#define GETSYSCTL_MIB(name, mib) getsysctl_mib(name, mib, sizeof(mib)/sizeof(int)) + +int getsysctl_mib(const char *name, int *mib, size_t len); + +#define GETSYSCTL_SIMPLE(name, mib, var) getsysctl_simple(name, mib, sizeof(mib)/sizeof(int), &(var), sizeof(var)) +#define GETSYSCTL_WSIZE(name, mib, var, size) getsysctl_simple(name, mib, sizeof(mib)/sizeof(int), var, size) + +int getsysctl_simple(const char *name, int *mib, size_t miblen, void *ptr, size_t len); + +#define GETSYSCTL_SIZE(name, mib, size) getsysctl(name, mib, sizeof(mib)/sizeof(int), NULL, &(size)) +#define GETSYSCTL(name, mib, var, size) getsysctl(name, mib, sizeof(mib)/sizeof(int), &(var), &(size)) + +int getsysctl(const char *name, int *mib, size_t miblen, void *ptr, size_t *len); +#endif + +#endif //NETDATA_OS_FREEBSD_WRAPPERS_H diff --git a/src/libnetdata/os/os-macos-wrappers.c b/src/libnetdata/os/os-macos-wrappers.c new file mode 100644 index 00000000..b3d3ee4e --- /dev/null +++ b/src/libnetdata/os/os-macos-wrappers.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +#if defined(OS_MACOS) + +int getsysctl_by_name(const char *name, void *ptr, size_t len) { + size_t nlen = len; + + if (unlikely(sysctlbyname(name, ptr, &nlen, NULL, 0) == -1)) { + netdata_log_error("MACOS: sysctl(%s...) failed: %s", name, strerror(errno)); + return 1; + } + if (unlikely(nlen != len)) { + netdata_log_error("MACOS: sysctl(%s...) 
expected %lu, got %lu", name, (unsigned long)len, (unsigned long)nlen); + return 1; + } + return 0; +} + +#endif diff --git a/src/libnetdata/os/os-macos-wrappers.h new file mode 100644 index 00000000..ec863c66 --- /dev/null +++ b/src/libnetdata/os/os-macos-wrappers.h @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_OS_MACOS_WRAPPERS_H +#define NETDATA_OS_MACOS_WRAPPERS_H + +#include "../libnetdata.h" + +#if defined(OS_MACOS) +#include <sys/sysctl.h> +#include "byteorder.h" + +#define GETSYSCTL_BY_NAME(name, var) getsysctl_by_name(name, &(var), sizeof(var)) +int getsysctl_by_name(const char *name, void *ptr, size_t len); + +#endif + +#endif //NETDATA_OS_MACOS_WRAPPERS_H diff --git a/src/libnetdata/os/os.c new file mode 100644 index 00000000..1caa25f8 --- /dev/null +++ b/src/libnetdata/os/os.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +// ---------------------------------------------------------------------------- +// system functions +// to retrieve settings of the system + +unsigned int system_hz; +void os_get_system_HZ(void) { + long ticks; + + if ((ticks = sysconf(_SC_CLK_TCK)) == -1) { + netdata_log_error("Cannot get system clock ticks. Assuming 100."); + ticks = 100; // the traditional UNIX default, instead of storing (unsigned int)-1 + } + + system_hz = (unsigned int) ticks; +} + +static inline unsigned long cpuset_str2ul(char **s) { + unsigned long n = 0; + char c; + for(c = **s; c >= '0' && c <= '9' ; c = *(++*s)) { + n *= 10; + n += c - '0'; + } + return n; +} + +unsigned long os_read_cpuset_cpus(const char *filename, long system_cpus) { + static char *buf = NULL; + static size_t buf_size = 0; + + if(!buf) { + buf_size = 100U + 6 * system_cpus + 1; // taken from kernel/cgroup/cpuset.c + buf = mallocz(buf_size); + } + + int ret = read_txt_file(filename, buf, buf_size); + + if(!ret) { + char *s = buf; + unsigned long ncpus = 0; + + // parse the cpuset string and calculate the number of cpus the cgroup is allowed to use + while (*s) { + if (isspace((uint8_t)*s)) { + s++; + continue; + } + unsigned long n = cpuset_str2ul(&s); + ncpus++; + if(*s == ',') { + s++; + continue; + } + if(*s == '-') { + s++; + unsigned long m = cpuset_str2ul(&s); + ncpus += m - n; // calculate the number of cpus in the region + } + s++; + } + + if(!ncpus) + return 0; + + return ncpus; + } + + return 0; +} + +// ===================================================================================================================== +// os_type + +#if defined(OS_LINUX) +const char *os_type = "linux"; +#endif + +#if defined(OS_FREEBSD) +const char *os_type = "freebsd"; +#endif + +#if defined(OS_MACOS) +const char *os_type = "macos"; +#endif + +#if defined(OS_WINDOWS) +const char *os_type = "windows"; +#endif + diff --git a/src/libnetdata/os/os.h new file mode 100644 index 00000000..35009615 --- /dev/null +++ b/src/libnetdata/os/os.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_OS_H +#define NETDATA_OS_H + +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_MACOS) +#include <sys/syscall.h> +#endif + +#include "setresuid.h" +#include "setresgid.h" +#include "getgrouplist.h" +#include "adjtimex.h" +#include "gettid.h" +#include "waitid.h" +#include "get_pid_max.h" +#include "get_system_cpus.h" +#include "tinysleep.h" +#include "uuid_generate.h" +#include "setenv.h" +#include "os-freebsd-wrappers.h" +#include "os-macos-wrappers.h" + +// 
===================================================================================================================== +// common defs for Apple/FreeBSD/Linux + +extern const char *os_type; + +#define os_get_system_cpus() os_get_system_cpus_cached(true, false) +#define os_get_system_cpus_uncached() os_get_system_cpus_cached(false, false) +long os_get_system_cpus_cached(bool cache, bool for_netdata); +unsigned long os_read_cpuset_cpus(const char *filename, long system_cpus); + +extern unsigned int system_hz; +void os_get_system_HZ(void); + +#endif //NETDATA_OS_H diff --git a/src/libnetdata/os/setenv.c b/src/libnetdata/os/setenv.c new file mode 100644 index 00000000..5aa4302b --- /dev/null +++ b/src/libnetdata/os/setenv.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-3.0-or-later
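+
+// Fallback setenv() for platforms that do not provide one. It is compiled
+// only when the build system has not defined HAVE_SETENV; setenv.h then
+// maps setenv() to os_setenv() below.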
+
+#include "config.h"
+
+#ifndef HAVE_SETENV
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+int os_setenv(const char *name, const char *value, int overwrite) {
+ char *env_var;
+ int result;
+
+ if (!overwrite) {
+ env_var = getenv(name);
+ if (env_var) return 0; // Already set
+ }
+
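+ // Build the "NAME=value" string on the heap. POSIX putenv() keeps a
+ // reference to this buffer in the environment, so it must stay allocated.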
+ size_t len = strlen(name) + strlen(value) + 2; // +2 for '=' and '\0'
+ env_var = malloc(len);
+ if (!env_var) return -1; // Allocation failure
+ snprintf(env_var, len, "%s=%s", name, value);
+
+ result = putenv(env_var);
+ // do not free(env_var): POSIX putenv() keeps the pointer in the environment,
+ // while Windows _putenv() makes its own copy of the string
+ return result;
+}
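+
+// Illustrative use (assuming HAVE_SETENV is not defined, so the setenv()
+// macro from setenv.h resolves to this fallback; the variable names are
+// examples only):
+//
+//     setenv("NETDATA_DEBUG", "1", 1); // always overwrite
+//     setenv("TZ", "UTC", 0);          // kept as-is if TZ is already set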
+
+#endif
diff --git a/src/libnetdata/os/setenv.h b/src/libnetdata/os/setenv.h new file mode 100644 index 00000000..3ed63714 --- /dev/null +++ b/src/libnetdata/os/setenv.h @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef NETDATA_SETENV_H
+#define NETDATA_SETENV_H
+
+#include "config.h"
+
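+// When the platform lacks setenv() (HAVE_SETENV undefined), declare the
+// fallback from setenv.c and map setenv() to it.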
+#ifndef HAVE_SETENV
+int os_setenv(const char *name, const char *value, int overwrite);
+#define setenv(name, value, overwrite) os_setenv(name, value, overwrite)
+#endif
+
+#endif //NETDATA_SETENV_H
diff --git a/src/libnetdata/os/setresgid.c new file mode 100644 index 00000000..e9f1b186 --- /dev/null +++ b/src/libnetdata/os/setresgid.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +int os_setresgid(gid_t gid __maybe_unused, gid_t egid __maybe_unused, gid_t sgid __maybe_unused) { +#if defined(OS_LINUX) || defined(OS_FREEBSD) + return setresgid(gid, egid, sgid); +#endif + +#if defined(OS_MACOS) + // macOS has no setresgid() - fall back to setregid(), dropping the saved gid + return setregid(gid, egid); +#endif + + errno = ENOSYS; + return -1; +} diff --git a/src/libnetdata/os/setresgid.h new file mode 100644 index 00000000..fc6d41f9 --- /dev/null +++ b/src/libnetdata/os/setresgid.h @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_SETRESGID_H +#define NETDATA_SETRESGID_H + +#include <unistd.h> +int os_setresgid(gid_t gid, gid_t egid, gid_t sgid); + +#endif //NETDATA_SETRESGID_H diff --git a/src/libnetdata/os/setresuid.c new file mode 100644 index 00000000..08169029 --- /dev/null +++ b/src/libnetdata/os/setresuid.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +int os_setresuid(uid_t uid __maybe_unused, uid_t euid __maybe_unused, uid_t suid __maybe_unused) { +#if defined(OS_LINUX) || defined(OS_FREEBSD) + return setresuid(uid, euid, suid); +#endif + +#if defined(OS_MACOS) + // macOS has no setresuid() - fall back to setreuid(), dropping the saved uid + return setreuid(uid, euid); +#endif + + errno = ENOSYS; + return -1; +} diff --git a/src/libnetdata/os/setresuid.h new file mode 100644 index 00000000..9f95d5d6 --- /dev/null +++ b/src/libnetdata/os/setresuid.h @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_SETRESUID_H +#define NETDATA_SETRESUID_H + +#include <unistd.h> + +int os_setresuid(uid_t uid, uid_t euid, uid_t suid); + +#endif //NETDATA_SETRESUID_H diff --git a/src/libnetdata/os/strndup.c new file mode 100644 index 00000000..17210f12 --- /dev/null +++ b/src/libnetdata/os/strndup.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef HAVE_STRNDUP +#include "../libnetdata.h" + +char *os_strndup(const char *s1, size_t n) { + // like strndup(): copy at most n bytes of s1, stopping at the terminating '\0' + size_t len = strnlen(s1, n); + char *copy = (char *)malloc(len + 1); + if (!copy) return NULL; + memcpy(copy, s1, len); + copy[len] = '\0'; + return copy; +} +#endif diff --git a/src/libnetdata/os/strndup.h new file mode 100644 index 00000000..9e51c8fd --- /dev/null +++ b/src/libnetdata/os/strndup.h @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef STRNDUP_H +#define STRNDUP_H + +#include "config.h" + +#ifndef HAVE_STRNDUP +#include <stddef.h> +char *os_strndup(const char *s, size_t n); +#define strndup(s, n) os_strndup(s, n) +#endif + +#endif //STRNDUP_H diff --git a/src/libnetdata/os/tinysleep.c new file mode 100644 index 00000000..f04cbdad --- /dev/null +++ b/src/libnetdata/os/tinysleep.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +#ifdef OS_WINDOWS +#include <windows.h> + +void tinysleep(void) { + // Improve the system timer resolution to 1 ms + timeBeginPeriod(1); + + // Sleep for the desired duration + Sleep(1); + + // Reset the system timer resolution + timeEndPeriod(1); +} +#else +void tinysleep(void) { + // request a 1 nanosecond sleep - the kernel rounds it up to its minimum + // resolution, so this effectively just yields the CPU for a moment + static const struct timespec ns = { .tv_sec = 0, .tv_nsec = 1 }; + nanosleep(&ns, NULL); +} +#endif diff --git a/src/libnetdata/os/tinysleep.h new file mode 100644 index 00000000..480575a3 --- /dev/null +++ 
b/src/libnetdata/os/tinysleep.h @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef NETDATA_TINYSLEEP_H
+#define NETDATA_TINYSLEEP_H
+
+void tinysleep(void);
+
+#endif //NETDATA_TINYSLEEP_H
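`tinysleep()` issues the shortest sleep each platform allows: a one-nanosecond `nanosleep()` on POSIX systems (which the kernel rounds up to the scheduler granularity), and a 1 ms `Sleep()` on Windows, after temporarily raising the timer resolution with `timeBeginPeriod(1)`. Its intended use is as a cheap backoff inside retry loops; the UUID wrapper in the next file uses it exactly this way while retrying `UuidCreate()`. A hedged sketch of the pattern, where `try_lock()` is a hypothetical non-blocking operation:

```c
// Illustrative only, not part of the diff: use tinysleep() to back off
// between attempts instead of spinning at full speed on one CPU core.
#include <stdbool.h>

void tinysleep(void);
bool try_lock(void); // hypothetical non-blocking attempt, assumed for the example

static void lock_blocking(void) {
    while (!try_lock())
        tinysleep(); // yield the CPU briefly, then try again
}
```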
diff --git a/src/libnetdata/os/uuid_generate.c b/src/libnetdata/os/uuid_generate.c new file mode 100644 index 00000000..4a7a9b6b --- /dev/null +++ b/src/libnetdata/os/uuid_generate.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" +#undef uuid_generate +#undef uuid_generate_random +#undef uuid_generate_time + +#ifdef OS_WINDOWS +#include <windows.h> + +void os_uuid_generate(void *out) { + RPC_STATUS status = UuidCreate(out); + while (status != RPC_S_OK && status != RPC_S_UUID_LOCAL_ONLY) { + tinysleep(); + status = UuidCreate(out); + } +} + +void os_uuid_generate_random(void *out) { + os_uuid_generate(out); +} + +void os_uuid_generate_time(void *out) { + os_uuid_generate(out); +} + +#else + +#if !defined(OS_MACOS) +#include <uuid.h> +#endif + +void os_uuid_generate(void *out) { + uuid_generate(out); +} + +void os_uuid_generate_random(void *out) { + uuid_generate_random(out); +} + +void os_uuid_generate_time(void *out) { + uuid_generate_time(out); +} + +#endif diff --git a/src/libnetdata/os/uuid_generate.h b/src/libnetdata/os/uuid_generate.h new file mode 100644 index 00000000..95f07c79 --- /dev/null +++ b/src/libnetdata/os/uuid_generate.h @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef NETDATA_UUID_GENERATE_H
+#define NETDATA_UUID_GENERATE_H
+
+void os_uuid_generate(void *out);
+void os_uuid_generate_random(void *out);
+void os_uuid_generate_time(void *out);
+
+#endif //NETDATA_UUID_GENERATE_H
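Note that on Windows all three wrappers collapse to `UuidCreate()`, so callers should not rely on `os_uuid_generate_time()` producing time-ordered (version 1) UUIDs there. A minimal usage sketch for the POSIX side, assuming libuuid's `uuid_unparse_lower()` is available (it ships with the same library that provides `uuid_generate()`):

```c
// Illustrative only, not part of the diff: generate a UUID through the
// wrapper and print its canonical lowercase text form.
#include <stdio.h>
#include <uuid/uuid.h>

void os_uuid_generate(void *out);

int main(void) {
    uuid_t id;      // unsigned char[16]
    char text[37];  // 36 characters plus the terminating NUL

    os_uuid_generate(id);
    uuid_unparse_lower(id, text);
    puts(text);
    return 0;
}
```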
diff --git a/src/libnetdata/os/waitid.c b/src/libnetdata/os/waitid.c new file mode 100644 index 00000000..b78d704e --- /dev/null +++ b/src/libnetdata/os/waitid.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +int os_waitid(idtype_t idtype, id_t id, siginfo_t *infop, int options) { +#if defined(HAVE_WAITID) + return waitid(idtype, id, infop, options); +#else + // emulate waitid() using waitpid() + + // a cache for WNOWAIT + static const struct pid_status empty = { 0, 0 }; + static __thread struct pid_status last = { 0, 0 }; // the cache + struct pid_status current = { 0, 0 }; + + // zero the infop structure + memset(infop, 0, sizeof(*infop)); + + // from the infop structure we use only 3 fields: + // - si_pid + // - si_code + // - si_status + // so, we update only these 3 + + switch(idtype) { + case P_ALL: + current.pid = waitpid((pid_t)-1, &current.status, options); + if(options & WNOWAIT) + last = current; + else + last = empty; + break; + + case P_PID: + if(last.pid == (pid_t)id) { + current = last; + last = empty; + } + else + current.pid = waitpid((pid_t)id, &current.status, options); + + break; + + default: + errno = ENOSYS; + return -1; + } + + if (current.pid > 0) { + if (WIFEXITED(current.status)) { + infop->si_code = CLD_EXITED; + infop->si_status = WEXITSTATUS(current.status); + } else if (WIFSIGNALED(current.status)) { + infop->si_code = WTERMSIG(current.status) == SIGABRT ? CLD_DUMPED : CLD_KILLED; + infop->si_status = WTERMSIG(current.status); + } else if (WIFSTOPPED(current.status)) { + infop->si_code = CLD_STOPPED; + infop->si_status = WSTOPSIG(current.status); + } else if (WIFCONTINUED(current.status)) { + infop->si_code = CLD_CONTINUED; + infop->si_status = SIGCONT; + } + infop->si_pid = current.pid; + return 0; + } else if (current.pid == 0) { + // No change in state, depends on WNOHANG + return 0; + } + + return -1; +#endif +} diff --git a/src/libnetdata/os/waitid.h b/src/libnetdata/os/waitid.h new file mode 100644 index 00000000..9e1fd6be --- /dev/null +++ b/src/libnetdata/os/waitid.h @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_WAITID_H +#define NETDATA_WAITID_H + +#include "config.h" +#include <sys/types.h> +#include <signal.h> + +#ifdef HAVE_SYS_WAIT_H +#include <sys/wait.h> +#endif + +#ifndef WNOWAIT +#define WNOWAIT 0x01000000 +#endif + +#ifndef WEXITED +#define WEXITED 4 +#endif + +#if !defined(HAVE_WAITID) +typedef enum +{ + P_ALL, /* Wait for any child. */ + P_PID, /* Wait for specified process. */ + P_PGID, /* Wait for members of process group. */ + P_PIDFD, /* Wait for the child referred by the PID file descriptor. */ +} idtype_t; + +struct pid_status { + pid_t pid; + int status; +}; + +#if defined(OS_WINDOWS) && !defined(__CYGWIN__) +typedef uint32_t id_t; +typedef struct { + int si_code; /* Signal code. */ + int si_status; /* Exit value or signal. */ + pid_t si_pid; /* Sending process ID. 
*/ +} siginfo_t; +#endif +#endif + +int os_waitid(idtype_t idtype, id_t id, siginfo_t *infop, int options); + +#endif //NETDATA_WAITID_H diff --git a/src/libnetdata/popen/README.md b/src/libnetdata/popen/README.md new file mode 100644 index 00000000..ca4877c1 --- /dev/null +++ b/src/libnetdata/popen/README.md @@ -0,0 +1,15 @@ +<!-- +title: "popen" +custom_edit_url: https://github.com/netdata/netdata/edit/master/src/libnetdata/popen/README.md +sidebar_label: "popen" +learn_status: "Published" +learn_topic_type: "Tasks" +learn_rel_path: "Developers/libnetdata" +--> + +# popen + +Process management library. It provides a `popen()`-like API (`netdata_popen()` / `netdata_pclose()`) implemented on top of `posix_spawn()`, and it tracks the child processes it spawns so they can be reaped reliably. + + + diff --git a/src/libnetdata/popen/popen.c b/src/libnetdata/popen/popen.c new file mode 100644 index 00000000..c1721e9b --- /dev/null +++ b/src/libnetdata/popen/popen.c @@ -0,0 +1,446 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +// ---------------------------------------------------------------------------- +// popen with tracking + +static pthread_mutex_t netdata_popen_tracking_mutex = NETDATA_MUTEX_INITIALIZER; + +struct netdata_popen { + pid_t pid; + bool reaped; + siginfo_t infop; + int waitid_ret; + struct netdata_popen *next; + struct netdata_popen *prev; +}; + +static struct netdata_popen *netdata_popen_root = NULL; + +// netdata_popen_tracking_lock() acquires the lock protecting the tracking list. +static void netdata_popen_tracking_lock(void) { + netdata_mutex_lock(&netdata_popen_tracking_mutex); +} + +// netdata_popen_tracking_unlock() releases the lock protecting the tracking list. +static void netdata_popen_tracking_unlock(void) { + netdata_mutex_unlock(&netdata_popen_tracking_mutex); +} + +// netdata_popen_tracking_add_pid_unsafe() adds a pid to the tracking list. +// The tracking lock must have been taken previously. +static void netdata_popen_tracking_add_pid_unsafe(pid_t pid) { + struct netdata_popen *mp; + + mp = callocz(1, sizeof(struct netdata_popen)); + mp->pid = pid; + + DOUBLE_LINKED_LIST_PREPEND_ITEM_UNSAFE(netdata_popen_root, mp, prev, next); +} + +// netdata_popen_tracking_del_pid() removes a pid from the tracking list. +static void netdata_popen_tracking_del_pid(pid_t pid) { + struct netdata_popen *mp; + + netdata_popen_tracking_lock(); + + DOUBLE_LINKED_LIST_FOREACH_FORWARD(netdata_popen_root, mp, prev, next) { + if(unlikely(mp->pid == pid)) + break; + } + + if(mp) { + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(netdata_popen_root, mp, prev, next); + freez(mp); + } + else + netdata_log_error("POPEN: Cannot find pid %d.", pid); + + netdata_popen_tracking_unlock(); +} + +// netdata_popen_tracking_cleanup() frees all resources allocated for process +// tracking. 
+void netdata_popen_tracking_cleanup(void) { + netdata_popen_tracking_lock(); + + while(netdata_popen_root) { + struct netdata_popen *mp = netdata_popen_root; + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(netdata_popen_root, mp, prev, next); + freez(mp); + } + + netdata_popen_tracking_unlock(); +} + +int netdata_waitid(idtype_t idtype, id_t id, siginfo_t *infop, int options) { + struct netdata_popen *mp = NULL; + + if(idtype == P_PID && id != 0) { + // the caller is asking to waitid() for a specific child pid + + netdata_popen_tracking_lock(); + DOUBLE_LINKED_LIST_FOREACH_FORWARD(netdata_popen_root, mp, prev, next) { + if(unlikely(mp->pid == (pid_t)id)) + break; + } + + if(!mp) + netdata_popen_tracking_unlock(); + } + + int ret; + if(mp && mp->reaped) { + // we have already reaped this child + ret = mp->waitid_ret; + *infop = mp->infop; + } + else { + // we haven't reaped this child yet + ret = os_waitid(idtype, id, infop, options); + + if(mp && !mp->reaped) { + mp->reaped = true; + mp->infop = *infop; + mp->waitid_ret = ret; + } + } + + if(mp) + netdata_popen_tracking_unlock(); + + return ret; +} + +// ---------------------------------------------------------------------------- +// helpers + +static inline void convert_argv_to_string(char *dst, size_t size, const char *spawn_argv[]) { + int i; + for(i = 0; spawn_argv[i] ;i++) { + if(i == 0) snprintfz(dst, size, "%s", spawn_argv[i]); + else { + size_t len = strlen(dst); + snprintfz(&dst[len], size - len, " '%s'", spawn_argv[i]); + } + } +} + +// ---------------------------------------------------------------------------- +// the core of netdata popen + +/* + * Returns -1 on failure, 0 on success. On success, when fpp_child_stdin / fpp_child_stdout are given, they are set to FILE pointers connected to the child's stdin / stdout. + */ +#define PIPE_READ 0 +#define PIPE_WRITE 1 + +static int popene_internal(volatile pid_t *pidptr, char **env, uint8_t flags, FILE **fpp_child_stdin, FILE **fpp_child_stdout, const char *command, const char *spawn_argv[]) { + // create a string to be logged about the command we are running + char command_to_be_logged[2048]; + convert_argv_to_string(command_to_be_logged, sizeof(command_to_be_logged), spawn_argv); + // netdata_log_info("custom_popene() running command: %s", command_to_be_logged); + + int ret = 0; // success by default + int attr_rc = 1; // failure by default + + FILE *fp_child_stdin = NULL, *fp_child_stdout = NULL; + int pipefd_stdin[2] = { -1, -1 }; + int pipefd_stdout[2] = { -1, -1 }; + + pid_t pid; + posix_spawnattr_t attr; + posix_spawn_file_actions_t fa; + + unsigned int fds_to_exclude_from_closing = OPEN_FD_EXCLUDE_STDERR; + + if(posix_spawn_file_actions_init(&fa)) { + netdata_log_error("POPEN: posix_spawn_file_actions_init() failed."); + ret = -1; + goto set_return_values_and_return; + } + + if(fpp_child_stdin) { + if (pipe(pipefd_stdin) == -1) { + netdata_log_error("POPEN: stdin pipe() failed"); + ret = -1; + goto cleanup_and_return; + } + + if ((fp_child_stdin = fdopen(pipefd_stdin[PIPE_WRITE], "w")) == NULL) { + netdata_log_error("POPEN: fdopen() stdin failed"); + ret = -1; + goto cleanup_and_return; + } + + if(posix_spawn_file_actions_adddup2(&fa, pipefd_stdin[PIPE_READ], STDIN_FILENO)) { + netdata_log_error("POPEN: posix_spawn_file_actions_adddup2() on stdin failed."); + ret = -1; + goto cleanup_and_return; + } + } + else { + if (posix_spawn_file_actions_addopen(&fa, STDIN_FILENO, "/dev/null", O_RDONLY, 0)) { + netdata_log_error("POPEN: posix_spawn_file_actions_addopen() on stdin to /dev/null failed."); + // this is not a fatal error + 
fds_to_exclude_from_closing |= OPEN_FD_EXCLUDE_STDIN; + } + } + + if (fpp_child_stdout) { + if (pipe(pipefd_stdout) == -1) { + netdata_log_error("POPEN: stdout pipe() failed"); + ret = -1; + goto cleanup_and_return; + } + + if ((fp_child_stdout = fdopen(pipefd_stdout[PIPE_READ], "r")) == NULL) { + netdata_log_error("POPEN: fdopen() stdout failed"); + ret = -1; + goto cleanup_and_return; + } + + if(posix_spawn_file_actions_adddup2(&fa, pipefd_stdout[PIPE_WRITE], STDOUT_FILENO)) { + netdata_log_error("POPEN: posix_spawn_file_actions_adddup2() on stdout failed."); + ret = -1; + goto cleanup_and_return; + } + } + else { + if (posix_spawn_file_actions_addopen(&fa, STDOUT_FILENO, "/dev/null", O_WRONLY, 0)) { + netdata_log_error("POPEN: posix_spawn_file_actions_addopen() on stdout to /dev/null failed."); + // this is not a fatal error + fds_to_exclude_from_closing |= OPEN_FD_EXCLUDE_STDOUT; + } + } + + if(flags & POPEN_FLAG_CLOSE_FD) { + // Mark all files to be closed by the exec() stage of posix_spawn() + for_each_open_fd(OPEN_FD_ACTION_FD_CLOEXEC, fds_to_exclude_from_closing); + } + + attr_rc = posix_spawnattr_init(&attr); + if(attr_rc) { + // failed + netdata_log_error("POPEN: posix_spawnattr_init() failed."); + } + else { + // success + // reset all signals in the child + + if (posix_spawnattr_setflags(&attr, POSIX_SPAWN_SETSIGMASK | POSIX_SPAWN_SETSIGDEF)) + netdata_log_error("POPEN: posix_spawnattr_setflags() failed."); + + sigset_t mask; + sigemptyset(&mask); + + if (posix_spawnattr_setsigmask(&attr, &mask)) + netdata_log_error("POPEN: posix_spawnattr_setsigmask() failed."); + } + + // Take the lock while we fork to ensure we don't race with SIGCHLD + // delivery on a process which exits quickly. + netdata_popen_tracking_lock(); + if (!posix_spawn(&pid, command, &fa, &attr, (char * const*)spawn_argv, env)) { + // success + *pidptr = pid; + netdata_popen_tracking_add_pid_unsafe(pid); + netdata_popen_tracking_unlock(); + } + else { + // failure + netdata_popen_tracking_unlock(); + netdata_log_error("POPEN: failed to spawn command: \"%s\" from parent pid %d.", command_to_be_logged, getpid()); + ret = -1; + goto cleanup_and_return; + } + + // the normal cleanup will run + // but ret == 0 at this point + +cleanup_and_return: + if(!attr_rc) { + // posix_spawnattr_init() succeeded + if (posix_spawnattr_destroy(&attr)) + netdata_log_error("POPEN: posix_spawnattr_destroy() failed"); + } + + if (posix_spawn_file_actions_destroy(&fa)) + netdata_log_error("POPEN: posix_spawn_file_actions_destroy() failed"); + + // the child end - close it + if(pipefd_stdin[PIPE_READ] != -1) + close(pipefd_stdin[PIPE_READ]); + + // our end + if(ret == -1 || !fpp_child_stdin) { + if (fp_child_stdin) + fclose(fp_child_stdin); + else if (pipefd_stdin[PIPE_WRITE] != -1) + close(pipefd_stdin[PIPE_WRITE]); + + fp_child_stdin = NULL; + } + + // the child end - close it + if (pipefd_stdout[PIPE_WRITE] != -1) + close(pipefd_stdout[PIPE_WRITE]); + + // our end + if (ret == -1 || !fpp_child_stdout) { + if (fp_child_stdout) + fclose(fp_child_stdout); + else if (pipefd_stdout[PIPE_READ] != -1) + close(pipefd_stdout[PIPE_READ]); + + fp_child_stdout = NULL; + } + +set_return_values_and_return: + if(fpp_child_stdin) + *fpp_child_stdin = fp_child_stdin; + + if(fpp_child_stdout) + *fpp_child_stdout = fp_child_stdout; + + return ret; +} + +int netdata_popene_variadic_internal_dont_use_directly(volatile pid_t *pidptr, char **env, uint8_t flags, FILE **fpp_child_input, FILE **fpp_child_output, const char *command, ...) 
{ + // convert the variable list arguments into what posix_spawn() needs + // all arguments are expected to be strings + va_list args; + int args_count; + + // count the number of variable parameters + // the variable parameters are expected to be NULL-terminated + { + const char *s; + + va_start(args, command); + args_count = 0; + while ((s = va_arg(args, const char *))) args_count++; + va_end(args); + } + + // create a string pointer array as needed by posix_spawn() + // variable array in the stack + const char *spawn_argv[args_count + 1]; + { + const char *s; + va_start(args, command); + int i; + for (i = 0; i < args_count; i++) { + s = va_arg(args, const char *); + spawn_argv[i] = s; + } + spawn_argv[args_count] = NULL; + va_end(args); + } + + return popene_internal(pidptr, env, flags, fpp_child_input, fpp_child_output, command, spawn_argv); +} + +// See man environ +extern char **environ; + +FILE *netdata_popen(const char *command, volatile pid_t *pidptr, FILE **fpp_child_input) { + FILE *fp_child_output = NULL; + const char *spawn_argv[] = { + "sh", + "-c", + command, + NULL + }; + (void)popene_internal(pidptr, environ, POPEN_FLAG_CLOSE_FD, fpp_child_input, &fp_child_output, "/bin/sh", spawn_argv); + return fp_child_output; +} + +FILE *netdata_popene(const char *command, volatile pid_t *pidptr, char **env, FILE **fpp_child_input) { + FILE *fp_child_output = NULL; + const char *spawn_argv[] = { + "sh", + "-c", + command, + NULL + }; + (void)popene_internal(pidptr, env, POPEN_FLAG_CLOSE_FD, fpp_child_input, &fp_child_output, "/bin/sh", spawn_argv); + return fp_child_output; +} + +// returns 0 on success, -1 on failure +int netdata_spawn(const char *command, volatile pid_t *pidptr) { + const char *spawn_argv[] = { + "sh", + "-c", + command, + NULL + }; + return popene_internal(pidptr, environ, POPEN_FLAG_NONE, NULL, NULL, "/bin/sh", spawn_argv); +} + +int netdata_pclose(FILE *fp_child_input, FILE *fp_child_output, pid_t pid) { + int ret; + siginfo_t info; + + netdata_log_debug(D_EXIT, "Request to netdata_pclose() on pid %d", pid); + + if (fp_child_input) + fclose(fp_child_input); + + if (fp_child_output) + fclose(fp_child_output); + + errno = 0; + + ret = netdata_waitid(P_PID, (id_t) pid, &info, WEXITED); + netdata_popen_tracking_del_pid(pid); + + if (ret != -1) { + switch (info.si_code) { + case CLD_EXITED: + if(info.si_status) + netdata_log_error("child pid %d exited with code %d.", info.si_pid, info.si_status); + return(info.si_status); + + case CLD_KILLED: + if(info.si_status == SIGTERM) { + netdata_log_info("child pid %d killed by SIGTERM", info.si_pid); + return(0); + } + else if(info.si_status == SIGPIPE) { + netdata_log_info("child pid %d killed by SIGPIPE.", info.si_pid); + return(0); + } + else { + netdata_log_error("child pid %d killed by signal %d.", info.si_pid, info.si_status); + return(-1); + } + + case CLD_DUMPED: + netdata_log_error("child pid %d core dumped by signal %d.", info.si_pid, info.si_status); + return(-2); + + case CLD_STOPPED: + netdata_log_error("child pid %d stopped by signal %d.", info.si_pid, info.si_status); + return(0); + + case CLD_TRAPPED: + netdata_log_error("child pid %d trapped by signal %d.", info.si_pid, info.si_status); + return(-4); + + case CLD_CONTINUED: + netdata_log_error("child pid %d continued by signal %d.", info.si_pid, info.si_status); + return(0); + + default: + netdata_log_error("child pid %d gave us a SIGCHLD with code %d and status %d.", info.si_pid, info.si_code, info.si_status); + return(-5); + } + } + else + netdata_log_error("Cannot waitid() 
for pid %d", pid); + + return 0; +} diff --git a/src/libnetdata/popen/popen.h b/src/libnetdata/popen/popen.h new file mode 100644 index 00000000..8f46abbc --- /dev/null +++ b/src/libnetdata/popen/popen.h @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_POPEN_H +#define NETDATA_POPEN_H 1 + +#include "../os/waitid.h" +int netdata_waitid(idtype_t idtype, id_t id, siginfo_t *infop, int options); + +#include "../libnetdata.h" + +#define PIPE_READ 0 +#define PIPE_WRITE 1 + +/* netdata_popene_variadic_internal_dont_use_directly() flag definitions */ +#define POPEN_FLAG_NONE 0 +#define POPEN_FLAG_CLOSE_FD (1 << 0) // Close all file descriptors other than STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO + +// the flags to be used by default +#define POPEN_FLAGS_DEFAULT (POPEN_FLAG_CLOSE_FD) + +// the netdata_popen_raw_*() macros are the interface to use instead of netdata_popene_variadic_internal_dont_use_directly() +// they append the terminating NULL to the arguments list +// the parameter 'command' is passed twice - this is because the underlying call needs both the command to execute and the argv[0] to pass to it +#define netdata_popen_raw_default_flags_and_environment(pidptr, fpp_child_input, fpp_child_output, command, args...) netdata_popene_variadic_internal_dont_use_directly(pidptr, environ, POPEN_FLAGS_DEFAULT, fpp_child_input, fpp_child_output, command, command, ##args, NULL) +#define netdata_popen_raw_default_flags(pidptr, env, fpp_child_input, fpp_child_output, command, args...) netdata_popene_variadic_internal_dont_use_directly(pidptr, env, POPEN_FLAGS_DEFAULT, fpp_child_input, fpp_child_output, command, command, ##args, NULL) +#define netdata_popen_raw(pidptr, env, flags, fpp_child_input, fpp_child_output, command, args...) netdata_popene_variadic_internal_dont_use_directly(pidptr, env, flags, fpp_child_input, fpp_child_output, command, command, ##args, NULL) + +FILE *netdata_popen(const char *command, volatile pid_t *pidptr, FILE **fp_child_input); +FILE *netdata_popene(const char *command, volatile pid_t *pidptr, char **env, FILE **fp_child_input); +int netdata_popene_variadic_internal_dont_use_directly(volatile pid_t *pidptr, char **env, uint8_t flags, FILE **fpp_child_input, FILE **fpp_child_output, const char *command, ...); +int netdata_pclose(FILE *fp_child_input, FILE *fp_child_output, pid_t pid); + +int netdata_spawn(const char *command, volatile pid_t *pidptr); + +#endif /* NETDATA_POPEN_H */ diff --git a/src/libnetdata/procfile/README.md b/src/libnetdata/procfile/README.md new file mode 100644 index 00000000..9e737a51 --- /dev/null +++ b/src/libnetdata/procfile/README.md @@ -0,0 +1,71 @@ +<!-- +title: "PROCFILE" +custom_edit_url: https://github.com/netdata/netdata/edit/master/src/libnetdata/procfile/README.md +sidebar_label: "Procfile" +learn_status: "Published" +learn_topic_type: "Tasks" +learn_rel_path: "Developers/libnetdata" +--> + +# PROCFILE + +procfile is a library for reading text data files (e.g. `/proc` files) in the fastest possible way. + +## How it works + +The library automatically adapts (through the iterations) its memory so that each file +is read with a single `read()` call. + +Then the library splits the file into words, using the supplied separators. +The library also supports quoted words (i.e. strings within which the separators are ignored). + +### Initialization + +Initially the caller: + +- calls `procfile_open()` to open the file and allocate the structures needed. 
+ +### Iterations + +For each iteration, the caller: + +- calls `procfile_readall()` to read updated contents. + This call also rewinds the file (`lseek()` to 0), so that every iteration reads the full, updated contents. + + For every file, a [BUFFER](/src/libnetdata/buffer/README.md) is used that is automatically adjusted to fit the entire + contents of the file. So the file is read with a single `read()` call (providing atomicity / consistency when + the data are read from the kernel). + + Once the data are read, 2 arrays are updated: + + - a `words` array, pointing to each word in the data read + - a `lines` array, pointing to the first word for each line + + This is highly optimized. Both arrays are automatically adjusted to + fit all contents and are updated in a single pass on the data. + + The library provides a number of macros: + + - `procfile_lines()` returns the # of lines read + - `procfile_linewords()` returns the # of words in the given line + - `procfile_word()` returns a pointer to the given word # + - `procfile_line()` returns a pointer to the first word of the given line # + - `procfile_lineword()` returns a pointer to the given word # of the given line # + +### Cleanup + +When the caller exits: + +- calls `procfile_close()` to close the file and free all memory used. + +### Performance + +- A **Raspberry Pi 1** (the oldest single-core one) can process 5,000+ `/proc` files per second. +- A **J1900 Celeron** processor can process 23,000+ `/proc` files per second per core. + +To achieve this kind of performance, the library tries to work in batches so that the code +and the data are inside the processor's caches. + +This library is extensively used in Netdata and its plugins. + + diff --git a/src/libnetdata/procfile/procfile.c b/src/libnetdata/procfile/procfile.c new file mode 100644 index 00000000..d9ebf4c9 --- /dev/null +++ b/src/libnetdata/procfile/procfile.c @@ -0,0 +1,490 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +#define PF_PREFIX "PROCFILE" + +#define PFWORDS_INCREASE_STEP 2000 +#define PFLINES_INCREASE_STEP 200 +#define PROCFILE_INCREMENT_BUFFER 4096 + +int procfile_open_flags = O_RDONLY | O_CLOEXEC; + +int procfile_adaptive_initial_allocation = 0; + +// if adaptive allocation is set, these store the +// max values we have seen so far +size_t procfile_max_lines = PFLINES_INCREASE_STEP; +size_t procfile_max_words = PFWORDS_INCREASE_STEP; +size_t procfile_max_allocation = PROCFILE_INCREMENT_BUFFER; + + +// ---------------------------------------------------------------------------- + +char *procfile_filename(procfile *ff) { + if(ff->filename) + return ff->filename; + + char filename[FILENAME_MAX + 1]; + char buffer[FILENAME_MAX + 1]; + snprintfz(buffer, FILENAME_MAX, "/proc/self/fd/%d", ff->fd); + + ssize_t l = readlink(buffer, filename, FILENAME_MAX); + if(unlikely(l == -1)) + snprintfz(filename, FILENAME_MAX, "unknown filename for fd %d", ff->fd); + else + filename[l] = '\0'; + + + ff->filename = strdupz(filename); + + // on non-linux systems, something like this will be needed + // fcntl(ff->fd, F_GETPATH, ff->filename) + + return ff->filename; +} + +// ---------------------------------------------------------------------------- +// An array of words + +static inline void procfile_words_add(procfile *ff, char *str) { + // netdata_log_debug(D_PROCFILE, PF_PREFIX ": adding word No %d: '%s'", fw->len, str); + + pfwords *fw = ff->words; + if(unlikely(fw->len == fw->size)) { + // netdata_log_debug(D_PROCFILE, PF_PREFIX ": expanding words"); + size_t minimum = 
PFWORDS_INCREASE_STEP; + size_t optimal = fw->size / 2; + size_t wanted = (optimal > minimum)?optimal:minimum; + + ff->words = fw = reallocz(fw, sizeof(pfwords) + (fw->size + wanted) * sizeof(char *)); + fw->size += wanted; + } + + fw->words[fw->len++] = str; +} + +NEVERNULL +static inline pfwords *procfile_words_create(void) { + // netdata_log_debug(D_PROCFILE, PF_PREFIX ": initializing words"); + + size_t size = (procfile_adaptive_initial_allocation) ? procfile_max_words : PFWORDS_INCREASE_STEP; + + pfwords *new = mallocz(sizeof(pfwords) + size * sizeof(char *)); + new->len = 0; + new->size = size; + return new; +} + +static inline void procfile_words_reset(pfwords *fw) { + // netdata_log_debug(D_PROCFILE, PF_PREFIX ": resetting words"); + fw->len = 0; +} + +static inline void procfile_words_free(pfwords *fw) { + // netdata_log_debug(D_PROCFILE, PF_PREFIX ": freeing words"); + + freez(fw); +} + + +// ---------------------------------------------------------------------------- +// An array of lines + +NEVERNULL +static inline size_t *procfile_lines_add(procfile *ff) { + // netdata_log_debug(D_PROCFILE, PF_PREFIX ": adding line %d at word %d", fl->len, first_word); + + pflines *fl = ff->lines; + if(unlikely(fl->len == fl->size)) { + // netdata_log_debug(D_PROCFILE, PF_PREFIX ": expanding lines"); + size_t minimum = PFLINES_INCREASE_STEP; + size_t optimal = fl->size / 2; + size_t wanted = (optimal > minimum)?optimal:minimum; + + ff->lines = fl = reallocz(fl, sizeof(pflines) + (fl->size + wanted) * sizeof(ffline)); + fl->size += wanted; + } + + ffline *ffl = &fl->lines[fl->len++]; + ffl->words = 0; + ffl->first = ff->words->len; + + return &ffl->words; +} + +NEVERNULL +static inline pflines *procfile_lines_create(void) { + // netdata_log_debug(D_PROCFILE, PF_PREFIX ": initializing lines"); + + size_t size = (unlikely(procfile_adaptive_initial_allocation)) ? 
procfile_max_lines : PFLINES_INCREASE_STEP; + + pflines *new = mallocz(sizeof(pflines) + size * sizeof(ffline)); + new->len = 0; + new->size = size; + return new; +} + +static inline void procfile_lines_reset(pflines *fl) { + // netdata_log_debug(D_PROCFILE, PF_PREFIX ": resetting lines"); + + fl->len = 0; +} + +static inline void procfile_lines_free(pflines *fl) { + // netdata_log_debug(D_PROCFILE, PF_PREFIX ": freeing lines"); + + freez(fl); +} + + +// ---------------------------------------------------------------------------- +// The procfile + +void procfile_close(procfile *ff) { + if(unlikely(!ff)) return; + + netdata_log_debug(D_PROCFILE, PF_PREFIX ": Closing file '%s'", procfile_filename(ff)); + + freez(ff->filename); + procfile_lines_free(ff->lines); + procfile_words_free(ff->words); + + if(likely(ff->fd != -1)) close(ff->fd); + freez(ff); +} + +NOINLINE +static void procfile_parser(procfile *ff) { + // netdata_log_debug(D_PROCFILE, PF_PREFIX ": Parsing file '%s'", ff->filename); + + char *s = ff->data // our current position + , *e = &ff->data[ff->len] // the terminating null + , *t = ff->data; // the first character of a word (or quoted / parenthesized string) + + // the look up array to find our type of character + PF_CHAR_TYPE *separators = ff->separators; + + char quote = 0; // the quote character - only when in quoted string + size_t opened = 0; // counts the number of open parenthesis + + size_t *line_words = procfile_lines_add(ff); + + while(s < e) { + PF_CHAR_TYPE ct = separators[(unsigned char)(*s)]; + + // this is faster than a switch() + // read more here: http://lazarenko.me/switch/ + if(likely(ct == PF_CHAR_IS_WORD)) { + s++; + } + else if(likely(ct == PF_CHAR_IS_SEPARATOR)) { + if(!quote && !opened) { + if (s != t) { + // separator, but we have word before it + *s = '\0'; + procfile_words_add(ff, t); + (*line_words)++; + t = ++s; + } + else { + // separator at the beginning + // skip it + t = ++s; + } + } + else { + // we are inside a quote or parenthesized string + s++; + } + } + else if(likely(ct == PF_CHAR_IS_NEWLINE)) { + // end of line + + *s = '\0'; + procfile_words_add(ff, t); + (*line_words)++; + t = ++s; + + // netdata_log_debug(D_PROCFILE, PF_PREFIX ": ended line %d with %d words", l, ff->lines->lines[l].words); + + line_words = procfile_lines_add(ff); + } + else if(likely(ct == PF_CHAR_IS_QUOTE)) { + if(unlikely(!quote && s == t)) { + // quote opened at the beginning + quote = *s; + t = ++s; + } + else if(unlikely(quote && quote == *s)) { + // quote closed + quote = 0; + + *s = '\0'; + procfile_words_add(ff, t); + (*line_words)++; + t = ++s; + } + else + s++; + } + else if(likely(ct == PF_CHAR_IS_OPEN)) { + if(s == t) { + opened++; + t = ++s; + } + else if(opened) { + opened++; + s++; + } + else + s++; + } + else if(likely(ct == PF_CHAR_IS_CLOSE)) { + if(opened) { + opened--; + + if(!opened) { + *s = '\0'; + procfile_words_add(ff, t); + (*line_words)++; + t = ++s; + } + else + s++; + } + else + s++; + } + else + fatal("Internal Error: procfile_readall() does not handle all the cases."); + } + + if(likely(s > t && t < e)) { + // the last word + if(unlikely(ff->len >= ff->size)) { + // we are going to lose the last byte + s = &ff->data[ff->size - 1]; + } + + *s = '\0'; + procfile_words_add(ff, t); + (*line_words)++; + // t = ++s; + } +} + +procfile *procfile_readall(procfile *ff) { + // netdata_log_debug(D_PROCFILE, PF_PREFIX ": Reading file '%s'.", ff->filename); + + ff->len = 0; // zero the used size + ssize_t r = 1; // read at least once + while(r > 0) 
{ + ssize_t s = ff->len; + ssize_t x = ff->size - s; + + if(unlikely(!x)) { + size_t minimum = PROCFILE_INCREMENT_BUFFER; + size_t optimal = ff->size / 2; + size_t wanted = (optimal > minimum)?optimal:minimum; + + netdata_log_debug(D_PROCFILE, PF_PREFIX ": Expanding data buffer for file '%s' by %zu bytes.", procfile_filename(ff), wanted); + ff = reallocz(ff, sizeof(procfile) + ff->size + wanted); + ff->size += wanted; + } + + netdata_log_debug(D_PROCFILE, "Reading file '%s', from position %zd with length %zd", procfile_filename(ff), s, (ssize_t)(ff->size - s)); + r = read(ff->fd, &ff->data[s], ff->size - s); + if(unlikely(r == -1)) { + if(unlikely(!(ff->flags & PROCFILE_FLAG_NO_ERROR_ON_FILE_IO))) collector_error(PF_PREFIX ": Cannot read from file '%s' on fd %d", procfile_filename(ff), ff->fd); + else if(unlikely(ff->flags & PROCFILE_FLAG_ERROR_ON_ERROR_LOG)) + netdata_log_error(PF_PREFIX ": Cannot read from file '%s' on fd %d", procfile_filename(ff), ff->fd); + procfile_close(ff); + return NULL; + } + + ff->len += r; + } + + // netdata_log_debug(D_PROCFILE, "Rewinding file '%s'", ff->filename); + if(unlikely(lseek(ff->fd, 0, SEEK_SET) == -1)) { + if(unlikely(!(ff->flags & PROCFILE_FLAG_NO_ERROR_ON_FILE_IO))) collector_error(PF_PREFIX ": Cannot rewind on file '%s'.", procfile_filename(ff)); + else if(unlikely(ff->flags & PROCFILE_FLAG_ERROR_ON_ERROR_LOG)) + netdata_log_error(PF_PREFIX ": Cannot rewind on file '%s'.", procfile_filename(ff)); + procfile_close(ff); + return NULL; + } + + procfile_lines_reset(ff->lines); + procfile_words_reset(ff->words); + procfile_parser(ff); + + if(unlikely(procfile_adaptive_initial_allocation)) { + if(unlikely(ff->len > procfile_max_allocation)) procfile_max_allocation = ff->len; + if(unlikely(ff->lines->len > procfile_max_lines)) procfile_max_lines = ff->lines->len; + if(unlikely(ff->words->len > procfile_max_words)) procfile_max_words = ff->words->len; + } + + // netdata_log_debug(D_PROCFILE, "File '%s' updated.", ff->filename); + return ff; +} + +static PF_CHAR_TYPE procfile_default_separators[256]; +__attribute__((constructor)) void procfile_initialize_default_separators(void) { + int i = 256; + while(i--) { + if(unlikely(i == '\n' || i == '\r')) + procfile_default_separators[i] = PF_CHAR_IS_NEWLINE; + + else if(unlikely(isspace(i) || !isprint(i))) + procfile_default_separators[i] = PF_CHAR_IS_SEPARATOR; + + else + procfile_default_separators[i] = PF_CHAR_IS_WORD; + } +} + +NOINLINE +static void procfile_set_separators(procfile *ff, const char *separators) { + // set the separators + if(unlikely(!separators)) + separators = " \t=|"; + + // copy the default + memcpy(ff->separators, procfile_default_separators, 256 * sizeof(PF_CHAR_TYPE)); + + PF_CHAR_TYPE *ffs = ff->separators; + const char *s = separators; + while(*s) + ffs[(int)*s++] = PF_CHAR_IS_SEPARATOR; +} + +void procfile_set_quotes(procfile *ff, const char *quotes) { + PF_CHAR_TYPE *ffs = ff->separators; + + // remove all quotes + int i = 256; + while(i--) + if(unlikely(ffs[i] == PF_CHAR_IS_QUOTE)) + ffs[i] = PF_CHAR_IS_WORD; + + // if nothing given, return + if(unlikely(!quotes || !*quotes)) + return; + + // set the quotes + const char *s = quotes; + while(*s) + ffs[(int)*s++] = PF_CHAR_IS_QUOTE; +} + +void procfile_set_open_close(procfile *ff, const char *open, const char *close) { + PF_CHAR_TYPE *ffs = ff->separators; + + // remove all open/close + int i = 256; + while(i--) + if(unlikely(ffs[i] == PF_CHAR_IS_OPEN || ffs[i] == PF_CHAR_IS_CLOSE)) + ffs[i] = PF_CHAR_IS_WORD; + + // if nothing 
given, return + if(unlikely(!open || !*open || !close || !*close)) + return; + + // set the openings + const char *s = open; + while(*s) + ffs[(int)*s++] = PF_CHAR_IS_OPEN; + + // set the closings + s = close; + while(*s) + ffs[(int)*s++] = PF_CHAR_IS_CLOSE; +} + +procfile *procfile_open(const char *filename, const char *separators, uint32_t flags) { + netdata_log_debug(D_PROCFILE, PF_PREFIX ": Opening file '%s'", filename); + + int fd = open(filename, procfile_open_flags, 0666); + if(unlikely(fd == -1)) { + if (unlikely(flags & PROCFILE_FLAG_ERROR_ON_ERROR_LOG)) + netdata_log_error(PF_PREFIX ": Cannot open file '%s'", filename); + else if (unlikely(!(flags & PROCFILE_FLAG_NO_ERROR_ON_FILE_IO))) { + if (errno == ENOENT) + collector_info(PF_PREFIX ": Cannot open file '%s'", filename); + else + collector_error(PF_PREFIX ": Cannot open file '%s'", filename); + } + return NULL; + } + + // netdata_log_info("PROCFILE: opened '%s' on fd %d", filename, fd); + + size_t size = (unlikely(procfile_adaptive_initial_allocation)) ? procfile_max_allocation : PROCFILE_INCREMENT_BUFFER; + procfile *ff = mallocz(sizeof(procfile) + size); + + //strncpyz(ff->filename, filename, FILENAME_MAX); + ff->filename = NULL; + ff->fd = fd; + ff->size = size; + ff->len = 0; + ff->flags = flags; + + ff->lines = procfile_lines_create(); + ff->words = procfile_words_create(); + + procfile_set_separators(ff, separators); + + netdata_log_debug(D_PROCFILE, "File '%s' opened.", filename); + return ff; +} + +procfile *procfile_reopen(procfile *ff, const char *filename, const char *separators, uint32_t flags) { + if(unlikely(!ff)) return procfile_open(filename, separators, flags); + + if(likely(ff->fd != -1)) { + // netdata_log_info("PROCFILE: closing fd %d", ff->fd); + close(ff->fd); + } + + ff->fd = open(filename, procfile_open_flags, 0666); + if(unlikely(ff->fd == -1)) { + procfile_close(ff); + return NULL; + } + + // netdata_log_info("PROCFILE: opened '%s' on fd %d", filename, ff->fd); + + //strncpyz(ff->filename, filename, FILENAME_MAX); + freez(ff->filename); + ff->filename = NULL; + ff->flags = flags; + + // do not do the separators again if NULL is given + if(likely(separators)) procfile_set_separators(ff, separators); + + return ff; +} + +// ---------------------------------------------------------------------------- +// example parsing of procfile data + +void procfile_print(procfile *ff) { + size_t lines = procfile_lines(ff), l; + char *s; + (void)s; + + netdata_log_debug(D_PROCFILE, "File '%s' with %zu lines and %zu words", procfile_filename(ff), ff->lines->len, ff->words->len); + + for(l = 0; likely(l < lines) ;l++) { + size_t words = procfile_linewords(ff, l); + + netdata_log_debug(D_PROCFILE, " line %zu starts at word %zu and has %zu words", l, ff->lines->lines[l].first, ff->lines->lines[l].words); + + size_t w; + for(w = 0; likely(w < words) ;w++) { + s = procfile_lineword(ff, l, w); + netdata_log_debug(D_PROCFILE, " [%zu.%zu] '%s'", l, w, s); + } + } +} diff --git a/src/libnetdata/procfile/procfile.h b/src/libnetdata/procfile/procfile.h new file mode 100644 index 00000000..8db5b45f --- /dev/null +++ b/src/libnetdata/procfile/procfile.h @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_PROCFILE_H +#define NETDATA_PROCFILE_H 1 + +#include "../libnetdata.h" + +// ---------------------------------------------------------------------------- +// An array of words + +typedef struct { + size_t len; // used entries + size_t size; // capacity + char *words[]; // array of pointers +} 
pfwords; + + +// ---------------------------------------------------------------------------- +// An array of lines + +typedef struct { + size_t words; // how many words this line has + size_t first; // the id of the first word of this line + // in the words array +} ffline; + +typedef struct { + size_t len; // used entries + size_t size; // capacity + ffline lines[]; // array of lines +} pflines; + + +// ---------------------------------------------------------------------------- +// The procfile + +#define PROCFILE_FLAG_DEFAULT 0x00000000 // log file IO errors via the collector log (the default) +#define PROCFILE_FLAG_NO_ERROR_ON_FILE_IO 0x00000001 // do not log file IO errors at all +#define PROCFILE_FLAG_ERROR_ON_ERROR_LOG 0x00000002 // log file IO errors via the daemon error log + +typedef enum __attribute__ ((__packed__)) procfile_separator { + PF_CHAR_IS_SEPARATOR, + PF_CHAR_IS_NEWLINE, + PF_CHAR_IS_WORD, + PF_CHAR_IS_QUOTE, + PF_CHAR_IS_OPEN, + PF_CHAR_IS_CLOSE +} PF_CHAR_TYPE; + +typedef struct procfile { + char *filename; // not populated until procfile_filename() is called + uint32_t flags; + int fd; // the file descriptor + size_t len; // the bytes we have placed into data + size_t size; // the bytes we have allocated for data + pflines *lines; + pfwords *words; + PF_CHAR_TYPE separators[256]; + char data[]; // allocated buffer to keep file contents +} procfile; + +// close the proc file and free all related memory +void procfile_close(procfile *ff); + +// (re)read and parse the proc file +procfile *procfile_readall(procfile *ff); + +// open a /proc or /sys file +procfile *procfile_open(const char *filename, const char *separators, uint32_t flags); + +// re-open a file +// if separators == NULL, the last separators are used +procfile *procfile_reopen(procfile *ff, const char *filename, const char *separators, uint32_t flags); + +// example walk-through of a parsed procfile +void procfile_print(procfile *ff); + +void procfile_set_quotes(procfile *ff, const char *quotes); +void procfile_set_open_close(procfile *ff, const char *open, const char *close); + +char *procfile_filename(procfile *ff); + +// ---------------------------------------------------------------------------- + +// set to the O_XXXX flags, to have procfile_open and procfile_reopen use them when opening proc files +extern int procfile_open_flags; + +// set this to 1, to have procfile adapt its initial buffer allocation to the max allocation used so far +extern int procfile_adaptive_initial_allocation; + +// return the number of lines present +#define procfile_lines(ff) ((ff)->lines->len) + +// return the number of words of the Nth line +#define procfile_linewords(ff, line) (((line) < procfile_lines(ff)) ? (ff)->lines->lines[(line)].words : 0) + +// return the Nth word of the file, or empty string +#define procfile_word(ff, word) (((word) < (ff)->words->len) ? (ff)->words->words[(word)] : "") + +// return the first word of the Nth line, or empty string +#define procfile_line(ff, line) (((line) < procfile_lines(ff)) ? procfile_word((ff), (ff)->lines->lines[(line)].first) : "") + +// return the Nth word of the given line +#define procfile_lineword(ff, line, word) (((line) < procfile_lines(ff) && (word) < procfile_linewords((ff), (line))) ? 
procfile_word((ff), (ff)->lines->lines[(line)].first + (word)) : "") + +// Open file without logging file IO error if any +#define procfile_open_no_log(filename, separators, flags) procfile_open(filename, separators, flags | PROCFILE_FLAG_NO_ERROR_ON_FILE_IO) + +#endif /* NETDATA_PROCFILE_H */ diff --git a/src/libnetdata/query_progress/README.md b/src/libnetdata/query_progress/README.md new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/src/libnetdata/query_progress/README.md diff --git a/src/libnetdata/query_progress/progress.c b/src/libnetdata/query_progress/progress.c new file mode 100644 index 00000000..10e083e0 --- /dev/null +++ b/src/libnetdata/query_progress/progress.c @@ -0,0 +1,660 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "progress.h" + +#define PROGRESS_CACHE_SIZE 200 + +// ---------------------------------------------------------------------------- +// hashtable for HASHED_KEY + +// cleanup hashtable defines +#include "../simple_hashtable_undef.h" + +struct query; +#define SIMPLE_HASHTABLE_VALUE_TYPE struct query +#define SIMPLE_HASHTABLE_KEY_TYPE nd_uuid_t +#define SIMPLE_HASHTABLE_NAME _QUERY +#define SIMPLE_HASHTABLE_VALUE2KEY_FUNCTION query_transaction +#define SIMPLE_HASHTABLE_COMPARE_KEYS_FUNCTION query_compare_keys +#include "../simple_hashtable.h" + +// ---------------------------------------------------------------------------- + +typedef struct query { + nd_uuid_t transaction; + + BUFFER *query; + BUFFER *payload; + BUFFER *client; + + usec_t started_ut; + usec_t finished_ut; + + HTTP_REQUEST_MODE mode; + HTTP_ACL acl; + + uint32_t sent_size; + uint32_t response_size; + short response_code; + + bool indexed; + + uint32_t updates; + + usec_t duration_ut; + size_t all; + size_t done; + + struct query *prev, *next; +} QUERY_PROGRESS; + +static inline nd_uuid_t *query_transaction(QUERY_PROGRESS *qp) { + return qp ? 
&qp->transaction : NULL; +} + +static inline bool query_compare_keys(nd_uuid_t *t1, nd_uuid_t *t2) { + if(t1 == t2 || (t1 && t2 && memcmp(t1, t2, sizeof(nd_uuid_t)) == 0)) + return true; + + return false; +} + +static struct progress { + SPINLOCK spinlock; + bool initialized; + + struct { + size_t available; + QUERY_PROGRESS *list; + } cache; + + SIMPLE_HASHTABLE_QUERY hashtable; + +} progress = { + .initialized = false, + .spinlock = NETDATA_SPINLOCK_INITIALIZER, +}; + +SIMPLE_HASHTABLE_HASH query_hash(nd_uuid_t *transaction) { + struct uuid_hi_lo_t { + uint64_t hi; + uint64_t lo; + } *parts = (struct uuid_hi_lo_t *)transaction; + + return parts->lo; +} + +static void query_progress_init_unsafe(void) { + if(!progress.initialized) { + simple_hashtable_init_QUERY(&progress.hashtable, PROGRESS_CACHE_SIZE * 4); + progress.initialized = true; + } +} + +// ---------------------------------------------------------------------------- + +static inline QUERY_PROGRESS *query_progress_find_in_hashtable_unsafe(nd_uuid_t *transaction) { + SIMPLE_HASHTABLE_HASH hash = query_hash(transaction); + SIMPLE_HASHTABLE_SLOT_QUERY *slot = simple_hashtable_get_slot_QUERY(&progress.hashtable, hash, transaction, true); + QUERY_PROGRESS *qp = SIMPLE_HASHTABLE_SLOT_DATA(slot); + + assert(!qp || qp->indexed); + + return qp; +} + +static inline void query_progress_add_to_hashtable_unsafe(QUERY_PROGRESS *qp) { + assert(!qp->indexed); + + SIMPLE_HASHTABLE_HASH hash = query_hash(&qp->transaction); + SIMPLE_HASHTABLE_SLOT_QUERY *slot = + simple_hashtable_get_slot_QUERY(&progress.hashtable, hash, &qp->transaction, true); + + internal_fatal(SIMPLE_HASHTABLE_SLOT_DATA(slot) != NULL && SIMPLE_HASHTABLE_SLOT_DATA(slot) != qp, + "Attempt to overwrite a progress slot, with another value"); + + simple_hashtable_set_slot_QUERY(&progress.hashtable, slot, hash, qp); + + qp->indexed = true; +} + +static inline void query_progress_remove_from_hashtable_unsafe(QUERY_PROGRESS *qp) { + assert(qp->indexed); + + SIMPLE_HASHTABLE_HASH hash = query_hash(&qp->transaction); + SIMPLE_HASHTABLE_SLOT_QUERY *slot = + simple_hashtable_get_slot_QUERY(&progress.hashtable, hash, &qp->transaction, true); + + if(SIMPLE_HASHTABLE_SLOT_DATA(slot) == qp) + simple_hashtable_del_slot_QUERY(&progress.hashtable, slot); + else + internal_fatal(SIMPLE_HASHTABLE_SLOT_DATA(slot) != NULL, + "Attempt to remove from the hashtable a progress slot with a different value"); + + qp->indexed = false; +} + +// ---------------------------------------------------------------------------- + +static QUERY_PROGRESS *query_progress_alloc(nd_uuid_t *transaction) { + QUERY_PROGRESS *qp; + qp = callocz(1, sizeof(*qp)); + uuid_copy(qp->transaction, *transaction); + qp->query = buffer_create(0, NULL); + qp->payload = buffer_create(0, NULL); + qp->client = buffer_create(0, NULL); + return qp; +} + +static void query_progress_free(QUERY_PROGRESS *qp) { + if(!qp) return; + + buffer_free(qp->query); + buffer_free(qp->payload); + buffer_free(qp->client); + freez(qp); +} + +static void query_progress_cleanup_to_reuse(QUERY_PROGRESS *qp, nd_uuid_t *transaction) { + assert(qp && qp->prev == NULL && qp->next == NULL); + assert(!transaction || !qp->indexed); + + buffer_flush(qp->query); + buffer_flush(qp->payload); + buffer_flush(qp->client); + qp->started_ut = qp->finished_ut = qp->duration_ut = 0; + qp->all = qp->done = qp->updates = 0; + qp->acl = 0; + qp->next = qp->prev = NULL; + qp->response_size = qp->sent_size = 0; + qp->response_code = 0; + + if(transaction) + 
uuid_copy(qp->transaction, *transaction); +} + +static inline void query_progress_update(QUERY_PROGRESS *qp, usec_t started_ut, HTTP_REQUEST_MODE mode, HTTP_ACL acl, const char *query, BUFFER *payload, const char *client) { + qp->mode = mode; + qp->acl = acl; + qp->started_ut = started_ut ? started_ut : now_realtime_usec(); + qp->finished_ut = 0; + qp->duration_ut = 0; + qp->response_size = 0; + qp->sent_size = 0; + qp->response_code = 0; + + if(query && *query && !buffer_strlen(qp->query)) + buffer_strcat(qp->query, query); + + if(payload && !buffer_strlen(qp->payload)) + buffer_copy(qp->payload, payload); + + if(client && *client && !buffer_strlen(qp->client)) + buffer_strcat(qp->client, client); +} + +// ---------------------------------------------------------------------------- + +static inline void query_progress_link_to_cache_unsafe(QUERY_PROGRESS *qp) { + assert(!qp->prev && !qp->next); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(progress.cache.list, qp, prev, next); + progress.cache.available++; +} + +static inline void query_progress_unlink_from_cache_unsafe(QUERY_PROGRESS *qp) { + assert(qp->prev); + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(progress.cache.list, qp, prev, next); + progress.cache.available--; +} + +// ---------------------------------------------------------------------------- +// Progress API + +void query_progress_start_or_update(nd_uuid_t *transaction, usec_t started_ut, HTTP_REQUEST_MODE mode, HTTP_ACL acl, const char *query, BUFFER *payload, const char *client) { + if(!transaction) + return; + + spinlock_lock(&progress.spinlock); + query_progress_init_unsafe(); + + QUERY_PROGRESS *qp = query_progress_find_in_hashtable_unsafe(transaction); + if(qp) { + // the transaction is already there + if(qp->prev) { + // reusing a finished transaction + query_progress_unlink_from_cache_unsafe(qp); + query_progress_cleanup_to_reuse(qp, NULL); + } + } + else if (progress.cache.available >= PROGRESS_CACHE_SIZE && progress.cache.list) { + // transaction is not found - get the first available, if any. 
+ qp = progress.cache.list; + query_progress_unlink_from_cache_unsafe(qp); + + query_progress_remove_from_hashtable_unsafe(qp); + query_progress_cleanup_to_reuse(qp, transaction); + } + else { + qp = query_progress_alloc(transaction); + } + + query_progress_update(qp, started_ut, mode, acl, query, payload, client); + + if(!qp->indexed) + query_progress_add_to_hashtable_unsafe(qp); + + spinlock_unlock(&progress.spinlock); +} + +void query_progress_set_finish_line(nd_uuid_t *transaction, size_t all) { + if(!transaction) + return; + + spinlock_lock(&progress.spinlock); + query_progress_init_unsafe(); + + QUERY_PROGRESS *qp = query_progress_find_in_hashtable_unsafe(transaction); + if(qp) { + qp->updates++; + + if(all > qp->all) + qp->all = all; + } + + spinlock_unlock(&progress.spinlock); +} + +void query_progress_done_step(nd_uuid_t *transaction, size_t done) { + if(!transaction) + return; + + spinlock_lock(&progress.spinlock); + query_progress_init_unsafe(); + + QUERY_PROGRESS *qp = query_progress_find_in_hashtable_unsafe(transaction); + if(qp) { + qp->updates++; + qp->done += done; + } + + spinlock_unlock(&progress.spinlock); +} + +void query_progress_finished(nd_uuid_t *transaction, usec_t finished_ut, short int response_code, usec_t duration_ut, size_t response_size, size_t sent_size) { + if(!transaction) + return; + + spinlock_lock(&progress.spinlock); + query_progress_init_unsafe(); + + // find this transaction to update it + { + QUERY_PROGRESS *qp = query_progress_find_in_hashtable_unsafe(transaction); + if(qp) { + qp->sent_size = sent_size; + qp->response_size = response_size; + qp->response_code = response_code; + qp->duration_ut = duration_ut; + qp->finished_ut = finished_ut ? finished_ut : now_realtime_usec(); + + if(qp->prev) + query_progress_unlink_from_cache_unsafe(qp); + + query_progress_link_to_cache_unsafe(qp); + } + } + + // find an item to free + { + QUERY_PROGRESS *qp_to_free = NULL; + if(progress.cache.available > PROGRESS_CACHE_SIZE && progress.cache.list) { + qp_to_free = progress.cache.list; + query_progress_unlink_from_cache_unsafe(qp_to_free); + query_progress_remove_from_hashtable_unsafe(qp_to_free); + } + + spinlock_unlock(&progress.spinlock); + + query_progress_free(qp_to_free); + } +} + +void query_progress_functions_update(nd_uuid_t *transaction, size_t done, size_t all) { + // functions send to the total 'done', not the increment + + if(!transaction) + return; + + spinlock_lock(&progress.spinlock); + query_progress_init_unsafe(); + + QUERY_PROGRESS *qp = query_progress_find_in_hashtable_unsafe(transaction); + + if(qp) { + if(all) + qp->all = all; + + if(done) + qp->done = done; + + qp->updates++; + } + + spinlock_unlock(&progress.spinlock); +} + +// ---------------------------------------------------------------------------- +// /api/v2/progress - to get the progress of a transaction + +int web_api_v2_report_progress(nd_uuid_t *transaction, BUFFER *wb) { + buffer_flush(wb); + buffer_json_initialize(wb, "\"", "\"", 0, true, BUFFER_JSON_OPTIONS_MINIFY); + + if(!transaction) { + buffer_json_member_add_uint64(wb, "status", 400); + buffer_json_member_add_string(wb, "message", "No transaction given"); + buffer_json_finalize(wb); + return 400; + } + + spinlock_lock(&progress.spinlock); + query_progress_init_unsafe(); + + QUERY_PROGRESS *qp = query_progress_find_in_hashtable_unsafe(transaction); + if(!qp) { + spinlock_unlock(&progress.spinlock); + buffer_json_member_add_uint64(wb, "status", HTTP_RESP_NOT_FOUND); + buffer_json_member_add_string(wb, "message", 
"Transaction not found"); + buffer_json_finalize(wb); + return HTTP_RESP_NOT_FOUND; + } + + buffer_json_member_add_uint64(wb, "status", HTTP_RESP_OK); + + buffer_json_member_add_uint64(wb, "started_ut", qp->started_ut); + if(qp->finished_ut) { + buffer_json_member_add_uint64(wb, "finished_ut", qp->finished_ut); + buffer_json_member_add_double(wb, "progress", 100.0); + buffer_json_member_add_uint64(wb, "age_ut", qp->finished_ut - qp->started_ut); + } + else { + usec_t now_ut = now_realtime_usec(); + buffer_json_member_add_uint64(wb, "now_ut", now_ut); + buffer_json_member_add_uint64(wb, "age_ut", now_ut - qp->started_ut); + + if (qp->all) + buffer_json_member_add_double(wb, "progress", (double) qp->done * 100.0 / (double) qp->all); + else + buffer_json_member_add_uint64(wb, "working", qp->done); + } + + buffer_json_finalize(wb); + + spinlock_unlock(&progress.spinlock); + + return 200; +} + +// ---------------------------------------------------------------------------- +// function to show the progress of all current queries +// and the recent few completed queries + +int progress_function_result(BUFFER *wb, const char *hostname) { + buffer_flush(wb); + wb->content_type = CT_APPLICATION_JSON; + buffer_json_initialize(wb, "\"", "\"", 0, true, BUFFER_JSON_OPTIONS_DEFAULT); + + buffer_json_member_add_string(wb, "hostname", hostname); + buffer_json_member_add_uint64(wb, "status", HTTP_RESP_OK); + buffer_json_member_add_string(wb, "type", "table"); + buffer_json_member_add_time_t(wb, "update_every", 1); + buffer_json_member_add_boolean(wb, "has_history", false); + buffer_json_member_add_string(wb, "help", RRDFUNCTIONS_PROGRESS_HELP); + buffer_json_member_add_array(wb, "data"); + + spinlock_lock(&progress.spinlock); + query_progress_init_unsafe(); + + usec_t now_ut = now_realtime_usec(); + usec_t max_duration_ut = 0; + size_t max_size = 0, max_sent = 0; + size_t archived = 0, running = 0; + SIMPLE_HASHTABLE_FOREACH_READ_ONLY(&progress.hashtable, sl, _QUERY) { + QUERY_PROGRESS *qp = SIMPLE_HASHTABLE_FOREACH_READ_ONLY_VALUE(sl); + if(unlikely(!qp)) continue; // not really needed, just for completeness + + if(qp->prev) + archived++; + else + running++; + + bool finished = qp->finished_ut ? true : false; + usec_t duration_ut = finished ? 
qp->duration_ut : now_ut - qp->started_ut; + if(duration_ut > max_duration_ut) + max_duration_ut = duration_ut; + + if(finished) { + if(qp->response_size > max_size) + max_size = qp->response_size; + + if(qp->sent_size > max_sent) + max_sent = qp->sent_size; + } + + buffer_json_add_array_item_array(wb); // row + + buffer_json_add_array_item_uuid_compact(wb, &qp->transaction); + buffer_json_add_array_item_uint64(wb, qp->started_ut); + buffer_json_add_array_item_string(wb, HTTP_REQUEST_MODE_2str(qp->mode)); + buffer_json_add_array_item_string(wb, buffer_tostring(qp->query)); + + if(!buffer_strlen(qp->client)) { + if(qp->acl & HTTP_ACL_ACLK) + buffer_json_add_array_item_string(wb, "ACLK"); + else if(qp->acl & HTTP_ACL_WEBRTC) + buffer_json_add_array_item_string(wb, "WEBRTC"); + else + buffer_json_add_array_item_string(wb, "unknown"); + } + else + buffer_json_add_array_item_string(wb, buffer_tostring(qp->client)); + + if(finished) { + buffer_json_add_array_item_string(wb, "finished"); + buffer_json_add_array_item_string(wb, "100.00 %"); + } + else { + char buf[50]; + + buffer_json_add_array_item_string(wb, "in-progress"); + + if (qp->all) + snprintfz(buf, sizeof(buf), "%0.2f %%", (double) qp->done * 100.0 / (double) qp->all); + else + snprintfz(buf, sizeof(buf), "%zu", qp->done); + + buffer_json_add_array_item_string(wb, buf); + } + + buffer_json_add_array_item_double(wb, (double)duration_ut / USEC_PER_MS); + + if(finished) { + buffer_json_add_array_item_uint64(wb, qp->response_code); + buffer_json_add_array_item_uint64(wb, qp->response_size); + buffer_json_add_array_item_uint64(wb, qp->sent_size); + } + else { + buffer_json_add_array_item_string(wb, NULL); + buffer_json_add_array_item_string(wb, NULL); + buffer_json_add_array_item_string(wb, NULL); + } + + buffer_json_add_array_item_object(wb); // row options + { + char *severity = "notice"; + if(finished) { + if(qp->response_code == HTTP_RESP_NOT_MODIFIED || + qp->response_code == HTTP_RESP_CLIENT_CLOSED_REQUEST || + qp->response_code == HTTP_RESP_CONFLICT) + severity = "debug"; + else if(qp->response_code >= 500 && qp->response_code <= 599) + severity = "error"; + else if(qp->response_code >= 400 && qp->response_code <= 499) + severity = "warning"; + else if(qp->response_code >= 300 && qp->response_code <= 399) + severity = "notice"; + else + severity = "normal"; + } + buffer_json_member_add_string(wb, "severity", severity); + } + buffer_json_object_close(wb); // row options + + buffer_json_array_close(wb); // row + } + + assert(archived == progress.cache.available); + + spinlock_unlock(&progress.spinlock); + + buffer_json_array_close(wb); // data + buffer_json_member_add_object(wb, "columns"); + { + size_t field_id = 0; + + // transaction + buffer_rrdf_table_add_field(wb, field_id++, "Transaction", "Transaction ID", + RRDF_FIELD_TYPE_STRING, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NONE, + 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + RRDF_FIELD_SUMMARY_COUNT, RRDF_FIELD_FILTER_NONE, + RRDF_FIELD_OPTS_VISIBLE | RRDF_FIELD_OPTS_UNIQUE_KEY, + NULL); + + // timestamp + buffer_rrdf_table_add_field(wb, field_id++, "Started", "Query Start Timestamp", + RRDF_FIELD_TYPE_TIMESTAMP, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_DATETIME_USEC, + 0, NULL, NAN, RRDF_FIELD_SORT_DESCENDING, NULL, + RRDF_FIELD_SUMMARY_MAX, RRDF_FIELD_FILTER_NONE, + RRDF_FIELD_OPTS_VISIBLE, NULL); + + // request method + buffer_rrdf_table_add_field(wb, field_id++, "Method", "Request Method", + RRDF_FIELD_TYPE_STRING, RRDF_FIELD_VISUAL_VALUE, 
RRDF_FIELD_TRANSFORM_NONE, + 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + RRDF_FIELD_SUMMARY_COUNT, RRDF_FIELD_FILTER_MULTISELECT, + RRDF_FIELD_OPTS_VISIBLE, NULL); + + // query + buffer_rrdf_table_add_field(wb, field_id++, "Query", "Query", + RRDF_FIELD_TYPE_STRING, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NONE, + 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + RRDF_FIELD_SUMMARY_COUNT, RRDF_FIELD_FILTER_NONE, + RRDF_FIELD_OPTS_VISIBLE | RRDF_FIELD_OPTS_FULL_WIDTH | RRDF_FIELD_OPTS_WRAP, NULL); + + // client + buffer_rrdf_table_add_field(wb, field_id++, "Client", "Client", + RRDF_FIELD_TYPE_STRING, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NONE, + 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + RRDF_FIELD_SUMMARY_COUNT, RRDF_FIELD_FILTER_MULTISELECT, + RRDF_FIELD_OPTS_VISIBLE, NULL); + + // status + buffer_rrdf_table_add_field(wb, field_id++, "Status", "Query Status", + RRDF_FIELD_TYPE_STRING, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NONE, + 0, NULL, NAN, RRDF_FIELD_SORT_ASCENDING, NULL, + RRDF_FIELD_SUMMARY_COUNT, RRDF_FIELD_FILTER_MULTISELECT, + RRDF_FIELD_OPTS_VISIBLE, NULL); + + // progress + buffer_rrdf_table_add_field(wb, field_id++, "Progress", "Query Progress", + RRDF_FIELD_TYPE_STRING, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NONE, + 0, NULL, NAN, RRDF_FIELD_SORT_DESCENDING, NULL, + RRDF_FIELD_SUMMARY_COUNT, RRDF_FIELD_FILTER_NONE, + RRDF_FIELD_OPTS_VISIBLE, NULL); + + // duration + buffer_rrdf_table_add_field(wb, field_id++, "Duration", "Query Duration", + RRDF_FIELD_TYPE_DURATION, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NUMBER, + 2, "ms", (double)max_duration_ut / USEC_PER_MS, RRDF_FIELD_SORT_DESCENDING, NULL, + RRDF_FIELD_SUMMARY_MAX, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_VISIBLE, NULL); + + // response code + buffer_rrdf_table_add_field(wb, field_id++, "Response", "Query Response Code", + RRDF_FIELD_TYPE_INTEGER, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NONE, + 0, NULL, NAN, RRDF_FIELD_SORT_DESCENDING, NULL, + RRDF_FIELD_SUMMARY_COUNT, RRDF_FIELD_FILTER_MULTISELECT, + RRDF_FIELD_OPTS_VISIBLE, NULL); + + // response size + buffer_rrdf_table_add_field(wb, field_id++, "Size", "Query Response Size", + RRDF_FIELD_TYPE_INTEGER, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NONE, + 0, "bytes", (double)max_size, RRDF_FIELD_SORT_DESCENDING, NULL, + RRDF_FIELD_SUMMARY_SUM, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_NONE, NULL); + + // sent size + buffer_rrdf_table_add_field(wb, field_id++, "Sent", "Query Response Final Size", + RRDF_FIELD_TYPE_INTEGER, RRDF_FIELD_VISUAL_VALUE, RRDF_FIELD_TRANSFORM_NONE, + 0, "bytes", (double)max_sent, RRDF_FIELD_SORT_DESCENDING, NULL, + RRDF_FIELD_SUMMARY_SUM, RRDF_FIELD_FILTER_RANGE, + RRDF_FIELD_OPTS_NONE, NULL); + + // row options + buffer_rrdf_table_add_field(wb, field_id++, "rowOptions", "rowOptions", + RRDF_FIELD_TYPE_NONE, RRDR_FIELD_VISUAL_ROW_OPTIONS, RRDF_FIELD_TRANSFORM_NONE, + 0, NULL, NAN, RRDF_FIELD_SORT_FIXED, NULL, + RRDF_FIELD_SUMMARY_COUNT, RRDF_FIELD_FILTER_NONE, + RRDF_FIELD_OPTS_DUMMY, NULL); + } + + buffer_json_object_close(wb); // columns + buffer_json_member_add_string(wb, "default_sort_column", "Started"); + + buffer_json_member_add_time_t(wb, "expires", (time_t)((now_ut / USEC_PER_SEC) + 1)); + buffer_json_finalize(wb); + + return 200; +} + + +// ---------------------------------------------------------------------------- + +int progress_unittest(void) { + size_t permanent = 100; + nd_uuid_t valid[permanent]; + + usec_t started = now_monotonic_usec(); + + for(size_t i = 0; i < permanent 
;i++) { + uuid_generate_random(valid[i]); + query_progress_start_or_update(&valid[i], 0, HTTP_REQUEST_MODE_GET, HTTP_ACL_ACLK, "permanent", NULL, "test"); + } + + for(size_t n = 0; n < 5000000 ;n++) { + nd_uuid_t t; + uuid_generate_random(t); + query_progress_start_or_update(&t, 0, HTTP_REQUEST_MODE_OPTIONS, HTTP_ACL_WEBRTC, "ephemeral", NULL, "test"); + query_progress_finished(&t, 0, 200, 1234, 123, 12); + + QUERY_PROGRESS *qp; + for(size_t i = 0; i < permanent ;i++) { + qp = query_progress_find_in_hashtable_unsafe(&valid[i]); + assert(qp); + (void)qp; + } + } + + usec_t ended = now_monotonic_usec(); + usec_t duration = ended - started; + + printf("progress hashtable resizes: %zu, size: %zu, used: %zu, deleted: %zu, searches: %zu, collisions: %zu, additions: %zu, deletions: %zu\n", + progress.hashtable.resizes, + progress.hashtable.size, progress.hashtable.used, progress.hashtable.deleted, + progress.hashtable.searches, progress.hashtable.collisions, progress.hashtable.additions, progress.hashtable.deletions); + + double d = (double)duration / USEC_PER_SEC; + printf("hashtable ops: %0.2f / sec\n", (double)progress.hashtable.searches / d); + + return 0; +} diff --git a/src/libnetdata/query_progress/progress.h b/src/libnetdata/query_progress/progress.h new file mode 100644 index 00000000..d45735dd --- /dev/null +++ b/src/libnetdata/query_progress/progress.h @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_QUERY_PROGRESS_H +#define NETDATA_QUERY_PROGRESS_H 1 + +#include "../libnetdata.h" + +void query_progress_start_or_update(nd_uuid_t *transaction, usec_t started_ut, HTTP_REQUEST_MODE mode, HTTP_ACL acl, const char *query, BUFFER *payload, const char *client); +void query_progress_done_step(nd_uuid_t *transaction, size_t done); +void query_progress_set_finish_line(nd_uuid_t *transaction, size_t all); +void query_progress_finished(nd_uuid_t *transaction, usec_t finished_ut, short int response_code, usec_t duration_ut, size_t response_size, size_t sent_size); +void query_progress_functions_update(nd_uuid_t *transaction, size_t done, size_t all); + +int web_api_v2_report_progress(nd_uuid_t *transaction, BUFFER *wb); + +#define RRDFUNCTIONS_PROGRESS_HELP "View the progress on the running and latest Netdata API Requests" +int progress_function_result(BUFFER *wb, const char *hostname); + +#endif // NETDATA_QUERY_PROGRESS_H diff --git a/src/libnetdata/required_dummies.h b/src/libnetdata/required_dummies.h new file mode 100644 index 00000000..3b23b87f --- /dev/null +++ b/src/libnetdata/required_dummies.h @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_LIB_DUMMIES_H +#define NETDATA_LIB_DUMMIES_H 1 + +// callback required by fatal() +void netdata_cleanup_and_exit(int ret, const char *action, const char *action_result, const char *action_data) +{ + (void)action; + (void)action_result; + (void)action_data; + + exit(ret); +} + +// callbacks required by popen() +void signals_block(void){} +void signals_unblock(void){} +void signals_reset(void){} + +void rrdset_thread_rda_free(void){} +void sender_thread_buffer_free(void){} +void query_target_free(void){} +void service_exits(void){} +void rrd_collector_finished(void){} + +// required by get_system_cpus() +char *netdata_configured_host_prefix = ""; + +#endif // NETDATA_LIB_DUMMIES_H diff --git a/src/libnetdata/simple_hashtable.h b/src/libnetdata/simple_hashtable.h new file mode 100644 index 00000000..13cdcd10 --- /dev/null +++ b/src/libnetdata/simple_hashtable.h @@ -0,0 +1,544 @@ +// 
SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_SIMPLE_HASHTABLE_H +#define NETDATA_SIMPLE_HASHTABLE_H + +typedef uint64_t SIMPLE_HASHTABLE_HASH; +#define SIMPLE_HASHTABLE_HASH_SECOND_HASH_SHIFTS 32 + +/* + * CONFIGURATION + * + * SIMPLE_HASHTABLE_NAME + * The name of the hashtable - all functions and defines will have this name appended + * Example: #define SIMPLE_HASHTABLE_NAME _FACET_KEY + * + * SIMPLE_HASHTABLE_VALUE_TYPE and SIMPLE_HASHTABLE_KEY_TYPE + * The data types of values and keys - optional - setting them will enable strict type checking by the compiler. + * If undefined, they both default to void. + * + * SIMPLE_HASHTABLE_SORT_FUNCTION + * A function name that accepts 2x values and compares them for sorting (returning -1, 0, 1). + * When set, the hashtable will maintain an always sorted array of the values in the hashtable. + * Do not use this for non-static hashtables. So, if your data is changing all the time, this can make the + * hashtable quite slower (it memmove()s an array of pointers to keep it sorted, on every single change). + * + * SIMPLE_HASHTABLE_VALUE2KEY_FUNCTION and SIMPLE_HASHTABLE_COMPARE_KEYS_FUNCTION + * The hashtable can either compare just hashes (the default), or hashes and keys (when these are set). + * Both need to be set for this feature to be enabled. + * + * - SIMPLE_HASHTABLE_VALUE2KEY_FUNCTION + * The name of a function accepting SIMPLE_HASHTABLE_VALUE_TYPE pointer. + * It should return a pointer to SIMPLE_HASHTABLE_KEY_TYPE. + * This function is called prior to SIMPLE_HASHTABLE_COMPARE_KEYS_FUNCTION to extract the key from a value. + * It is also called during hashtable resize, to rehash all values in the hashtable. + * + * - SIMPLE_HASHTABLE_COMPARE_KEYS_FUNCTION + * The name of a function accepting 2x SIMPLE_HASHTABLE_KEY_TYPE pointers. + * It should return true when the keys match. + * This function is only called when the hashes match, to verify that the keys also match. + * + * SIMPLE_HASHTABLE_SAMPLE_IMPLEMENTATION + * If defined, 3x functions will be injected for easily working with the hashtable. 
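+ *
+ * For example - a minimal configuration sketch; 'struct my_item' and the
+ * _MYTABLE name are illustrative, not part of the library:
+ *
+ *    #define SIMPLE_HASHTABLE_NAME _MYTABLE
+ *    #define SIMPLE_HASHTABLE_VALUE_TYPE struct my_item
+ *    #define SIMPLE_HASHTABLE_KEY_TYPE const char
+ *    #define SIMPLE_HASHTABLE_SAMPLE_IMPLEMENTATION 1
+ *    #include "simple_hashtable.h"
+ *
+ *    // this generates simple_hashtable_init_MYTABLE(), simple_hashtable_set_MYTABLE(),
+ *    // simple_hashtable_get_MYTABLE(), simple_hashtable_del_MYTABLE(), etc.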
+ *
+ */
+
+
+#ifndef SIMPLE_HASHTABLE_NAME
+#define SIMPLE_HASHTABLE_NAME
+#endif
+
+#ifndef SIMPLE_HASHTABLE_VALUE_TYPE
+#define SIMPLE_HASHTABLE_VALUE_TYPE void
+#endif
+
+#ifndef SIMPLE_HASHTABLE_KEY_TYPE
+#define SIMPLE_HASHTABLE_KEY_TYPE void
+#endif
+
+#ifndef SIMPLE_HASHTABLE_VALUE2KEY_FUNCTION
+#undef SIMPLE_HASHTABLE_COMPARE_KEYS_FUNCTION
+#endif
+
+#if defined(SIMPLE_HASHTABLE_VALUE2KEY_FUNCTION)
+static inline SIMPLE_HASHTABLE_KEY_TYPE *SIMPLE_HASHTABLE_VALUE2KEY_FUNCTION(SIMPLE_HASHTABLE_VALUE_TYPE *);
+#endif
+
+#if defined(SIMPLE_HASHTABLE_COMPARE_KEYS_FUNCTION)
+static inline bool SIMPLE_HASHTABLE_COMPARE_KEYS_FUNCTION(SIMPLE_HASHTABLE_KEY_TYPE *, SIMPLE_HASHTABLE_KEY_TYPE *);
+#endif
+
+// First layer of macro for token concatenation
+#define CONCAT_INTERNAL(a, b) a ## b
+// Second layer of macro, which ensures proper expansion
+#define CONCAT(a, b) CONCAT_INTERNAL(a, b)
+
+// define names for all structures and functions
+#define simple_hashtable_init_named CONCAT(simple_hashtable_init, SIMPLE_HASHTABLE_NAME)
+#define simple_hashtable_destroy_named CONCAT(simple_hashtable_destroy, SIMPLE_HASHTABLE_NAME)
+
+#define simple_hashtable_slot_named CONCAT(simple_hashtable_slot, SIMPLE_HASHTABLE_NAME)
+#define SIMPLE_HASHTABLE_SLOT_NAMED CONCAT(SIMPLE_HASHTABLE_SLOT, SIMPLE_HASHTABLE_NAME)
+#define simple_hashtable_named CONCAT(simple_hashtable, SIMPLE_HASHTABLE_NAME)
+#define SIMPLE_HASHTABLE_NAMED CONCAT(SIMPLE_HASHTABLE, SIMPLE_HASHTABLE_NAME)
+#define simple_hashtable_resize_named CONCAT(simple_hashtable_resize, SIMPLE_HASHTABLE_NAME)
+#define simple_hashtable_can_use_slot_named CONCAT(simple_hashtable_keys_match, SIMPLE_HASHTABLE_NAME)
+#define simple_hashtable_get_slot_named CONCAT(simple_hashtable_get_slot, SIMPLE_HASHTABLE_NAME)
+#define simple_hashtable_del_slot_named CONCAT(simple_hashtable_del_slot, SIMPLE_HASHTABLE_NAME)
+#define simple_hashtable_set_slot_named CONCAT(simple_hashtable_set_slot, SIMPLE_HASHTABLE_NAME)
+#define simple_hashtable_first_read_only_named CONCAT(simple_hashtable_first_read_only, SIMPLE_HASHTABLE_NAME)
+#define simple_hashtable_next_read_only_named CONCAT(simple_hashtable_next_read_only, SIMPLE_HASHTABLE_NAME)
+
+#define simple_hashtable_sorted_binary_search_named CONCAT(simple_hashtable_sorted_binary_search, SIMPLE_HASHTABLE_NAME)
+#define simple_hashtable_add_value_sorted_named CONCAT(simple_hashtable_add_value_sorted, SIMPLE_HASHTABLE_NAME)
+#define simple_hashtable_del_value_sorted_named CONCAT(simple_hashtable_del_value_sorted, SIMPLE_HASHTABLE_NAME)
+#define simple_hashtable_replace_value_sorted_named CONCAT(simple_hashtable_replace_value_sorted, SIMPLE_HASHTABLE_NAME)
+#define simple_hashtable_sorted_array_first_read_only_named CONCAT(simple_hashtable_sorted_array_first_read_only, SIMPLE_HASHTABLE_NAME)
+#define simple_hashtable_sorted_array_next_read_only_named CONCAT(simple_hashtable_sorted_array_next_read_only, SIMPLE_HASHTABLE_NAME)
+
+typedef struct simple_hashtable_slot_named {
+    SIMPLE_HASHTABLE_HASH hash;
+    SIMPLE_HASHTABLE_VALUE_TYPE *data;
+} SIMPLE_HASHTABLE_SLOT_NAMED;
+
+typedef struct simple_hashtable_named {
+    size_t resizes;
+    size_t searches;
+    size_t collisions;
+    size_t additions;
+    size_t deletions;
+    size_t deleted;
+    size_t used;
+    size_t size;
+    bool needs_cleanup;
+    SIMPLE_HASHTABLE_SLOT_NAMED *hashtable;
+
+#ifdef SIMPLE_HASHTABLE_SORT_FUNCTION
+    struct {
+        size_t used;
+        size_t size;
+        SIMPLE_HASHTABLE_VALUE_TYPE **array;
+    } sorted;
+#endif
+} SIMPLE_HASHTABLE_NAMED;
+
+#ifdef SIMPLE_HASHTABLE_SORT_FUNCTION
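+// SIMPLE_HASHTABLE_SORT_FUNCTION is a comparator over two value pointers,
+// like a qsort() callback. A sketch, assuming a hypothetical 'struct my_item'
+// with an integer member 'id':
+//
+//    static inline int my_item_cmp(struct my_item *a, struct my_item *b) {
+//        return (a->id < b->id) ? -1 : ((a->id > b->id) ? 1 : 0);
+//    }
+//    #define SIMPLE_HASHTABLE_SORT_FUNCTION my_item_cmp
+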
+static inline size_t simple_hashtable_sorted_binary_search_named(SIMPLE_HASHTABLE_NAMED *ht, SIMPLE_HASHTABLE_VALUE_TYPE *value) { + size_t left = 0, right = ht->sorted.used; + + while (left < right) { + size_t mid = left + (right - left) / 2; + if (SIMPLE_HASHTABLE_SORT_FUNCTION(ht->sorted.array[mid], value) < 0) + left = mid + 1; + else + right = mid; + } + + return left; +} + +static inline void simple_hashtable_add_value_sorted_named(SIMPLE_HASHTABLE_NAMED *ht, SIMPLE_HASHTABLE_VALUE_TYPE *value) { + size_t index = simple_hashtable_sorted_binary_search_named(ht, value); + + // Ensure there's enough space in the sorted array + if (ht->sorted.used >= ht->sorted.size) { + size_t size = ht->sorted.size ? ht->sorted.size * 2 : 64; + SIMPLE_HASHTABLE_VALUE_TYPE **array = mallocz(size * sizeof(SIMPLE_HASHTABLE_VALUE_TYPE *)); + if(ht->sorted.array) { + memcpy(array, ht->sorted.array, ht->sorted.size * sizeof(SIMPLE_HASHTABLE_VALUE_TYPE *)); + freez(ht->sorted.array); + } + ht->sorted.array = array; + ht->sorted.size = size; + } + + // Use memmove to shift elements and create space for the new element + memmove(&ht->sorted.array[index + 1], &ht->sorted.array[index], (ht->sorted.used - index) * sizeof(SIMPLE_HASHTABLE_VALUE_TYPE *)); + + ht->sorted.array[index] = value; + ht->sorted.used++; +} + +static inline void simple_hashtable_del_value_sorted_named(SIMPLE_HASHTABLE_NAMED *ht, SIMPLE_HASHTABLE_VALUE_TYPE *value) { + size_t index = simple_hashtable_sorted_binary_search_named(ht, value); + + // Check if the value exists at the found index + assert(index < ht->sorted.used && ht->sorted.array[index] == value); + + // Use memmove to shift elements and close the gap + memmove(&ht->sorted.array[index], &ht->sorted.array[index + 1], (ht->sorted.used - index - 1) * sizeof(SIMPLE_HASHTABLE_VALUE_TYPE *)); + ht->sorted.used--; +} + +static inline void simple_hashtable_replace_value_sorted_named(SIMPLE_HASHTABLE_NAMED *ht, SIMPLE_HASHTABLE_VALUE_TYPE *old_value, SIMPLE_HASHTABLE_VALUE_TYPE *new_value) { + if(new_value == old_value) + return; + + size_t old_value_index = simple_hashtable_sorted_binary_search_named(ht, old_value); + assert(old_value_index < ht->sorted.used && ht->sorted.array[old_value_index] == old_value); + + int r = SIMPLE_HASHTABLE_SORT_FUNCTION(old_value, new_value); + if(r == 0) { + // Same value, so use the same index + ht->sorted.array[old_value_index] = new_value; + return; + } + + size_t new_value_index = simple_hashtable_sorted_binary_search_named(ht, new_value); + if(old_value_index == new_value_index) { + // Not the same value, but still at the same index + ht->sorted.array[old_value_index] = new_value; + return; + } + else if (old_value_index < new_value_index) { + // The old value is before the new value + size_t shift_start = old_value_index + 1; + size_t shift_end = new_value_index - 1; + size_t shift_size = shift_end - old_value_index; + + memmove(&ht->sorted.array[old_value_index], &ht->sorted.array[shift_start], shift_size * sizeof(SIMPLE_HASHTABLE_VALUE_TYPE *)); + ht->sorted.array[shift_end] = new_value; + } + else { + // The old value is after the new value + size_t shift_start = new_value_index; + size_t shift_end = old_value_index; + size_t shift_size = shift_end - new_value_index; + + memmove(&ht->sorted.array[new_value_index + 1], &ht->sorted.array[shift_start], shift_size * sizeof(SIMPLE_HASHTABLE_VALUE_TYPE *)); + ht->sorted.array[new_value_index] = new_value; + } +} + +static inline SIMPLE_HASHTABLE_VALUE_TYPE 
**simple_hashtable_sorted_array_first_read_only_named(SIMPLE_HASHTABLE_NAMED *ht) { + if (ht->sorted.used > 0) { + return &ht->sorted.array[0]; + } + return NULL; +} + +static inline SIMPLE_HASHTABLE_VALUE_TYPE **simple_hashtable_sorted_array_next_read_only_named(SIMPLE_HASHTABLE_NAMED *ht, SIMPLE_HASHTABLE_VALUE_TYPE **last) { + if (!last) return NULL; + + // Calculate the current position in the sorted array + size_t currentIndex = last - ht->sorted.array; + + // Proceed to the next element if it exists + if (currentIndex + 1 < ht->sorted.used) { + return &ht->sorted.array[currentIndex + 1]; + } + + // If no more elements, return NULL + return NULL; +} + +#define SIMPLE_HASHTABLE_SORTED_FOREACH_READ_ONLY(ht, var, type, name) \ + for (type **(var) = simple_hashtable_sorted_array_first_read_only ## name(ht); \ + var; \ + (var) = simple_hashtable_sorted_array_next_read_only ## name(ht, var)) + +#define SIMPLE_HASHTABLE_SORTED_FOREACH_READ_ONLY_VALUE(var) (*(var)) + +#else +static inline void simple_hashtable_add_value_sorted_named(SIMPLE_HASHTABLE_NAMED *ht __maybe_unused, SIMPLE_HASHTABLE_VALUE_TYPE *value __maybe_unused) { ; } +static inline void simple_hashtable_del_value_sorted_named(SIMPLE_HASHTABLE_NAMED *ht __maybe_unused, SIMPLE_HASHTABLE_VALUE_TYPE *value __maybe_unused) { ; } +static inline void simple_hashtable_replace_value_sorted_named(SIMPLE_HASHTABLE_NAMED *ht __maybe_unused, SIMPLE_HASHTABLE_VALUE_TYPE *old_value __maybe_unused, SIMPLE_HASHTABLE_VALUE_TYPE *new_value __maybe_unused) { ; } +#endif + +static inline void simple_hashtable_init_named(SIMPLE_HASHTABLE_NAMED *ht, size_t size) { + memset(ht, 0, sizeof(*ht)); + ht->size = size; + ht->hashtable = callocz(ht->size, sizeof(*ht->hashtable)); +} + +static inline void simple_hashtable_destroy_named(SIMPLE_HASHTABLE_NAMED *ht) { +#ifdef SIMPLE_HASHTABLE_SORT_FUNCTION + freez(ht->sorted.array); +#endif + + freez(ht->hashtable); + memset(ht, 0, sizeof(*ht)); +} + +static inline void simple_hashtable_resize_named(SIMPLE_HASHTABLE_NAMED *ht); + +#define simple_hashtable_data_unset ((void *)NULL) +#define simple_hashtable_data_deleted ((void *)UINT64_MAX) +#define simple_hashtable_data_usernull ((void *)(UINT64_MAX - 1)) +#define simple_hashtable_is_slot_unset(sl) ((sl)->data == simple_hashtable_data_unset) +#define simple_hashtable_is_slot_deleted(sl) ((sl)->data == simple_hashtable_data_deleted) +#define simple_hashtable_is_slot_usernull(sl) ((sl)->data == simple_hashtable_data_usernull) +#define SIMPLE_HASHTABLE_SLOT_DATA(sl) ((simple_hashtable_is_slot_unset(sl) || simple_hashtable_is_slot_deleted(sl) || simple_hashtable_is_slot_usernull(sl)) ? NULL : (sl)->data) + +static inline bool simple_hashtable_can_use_slot_named( + SIMPLE_HASHTABLE_SLOT_NAMED *sl, SIMPLE_HASHTABLE_HASH hash, + SIMPLE_HASHTABLE_KEY_TYPE *key __maybe_unused) { + + if(simple_hashtable_is_slot_unset(sl)) + return true; + + if(simple_hashtable_is_slot_deleted(sl)) + return false; + + if(sl->hash == hash) { +#if defined(SIMPLE_HASHTABLE_COMPARE_KEYS_FUNCTION) && defined(SIMPLE_HASHTABLE_VALUE2KEY_FUNCTION) + return SIMPLE_HASHTABLE_COMPARE_KEYS_FUNCTION(SIMPLE_HASHTABLE_VALUE2KEY_FUNCTION(SIMPLE_HASHTABLE_SLOT_DATA(sl)), key); +#else + return true; +#endif + } + + return false; +} + +#define SIMPLE_HASHTABLE_NEEDS_RESIZE(ht) ((ht)->size <= ((ht)->used - (ht)->deleted) << 1 || (ht)->used >= (ht)->size) + +// IMPORTANT: the pointer returned by this call is valid up to the next call of this function (or the resize one). 
+// If you need to cache something, cache the hash, not the slot pointer. +static inline SIMPLE_HASHTABLE_SLOT_NAMED *simple_hashtable_get_slot_named( + SIMPLE_HASHTABLE_NAMED *ht, SIMPLE_HASHTABLE_HASH hash, + SIMPLE_HASHTABLE_KEY_TYPE *key, bool resize) { + + // This function finds the requested hash and key in the hashtable. + // It uses a second version of the hash in case of collisions, and then linear probing. + // It may resize the hashtable if it is more than 50% full. + + // Deleted items remain in the hashtable, but they are marked as DELETED. + // Reuse of DELETED slots happens only if the slot to be returned is UNSET. + // So, when looking up for an item, it tries to find it, assuming DELETED + // slots are occupied. If the item to be returned is UNSET, and it has + // encountered a DELETED slot, it returns the DELETED one instead of the UNSET. + + ht->searches++; + + size_t slot; + SIMPLE_HASHTABLE_SLOT_NAMED *sl; + SIMPLE_HASHTABLE_SLOT_NAMED *deleted; + + slot = hash % ht->size; + sl = &ht->hashtable[slot]; + deleted = simple_hashtable_is_slot_deleted(sl) ? sl : NULL; + if(likely(simple_hashtable_can_use_slot_named(sl, hash, key))) + return (simple_hashtable_is_slot_unset(sl) && deleted) ? deleted : sl; + + ht->collisions++; + + if(unlikely(resize && (ht->needs_cleanup || SIMPLE_HASHTABLE_NEEDS_RESIZE(ht)))) { + simple_hashtable_resize_named(ht); + deleted = NULL; // our deleted pointer is not valid anymore + + slot = hash % ht->size; + sl = &ht->hashtable[slot]; + if(likely(simple_hashtable_can_use_slot_named(sl, hash, key))) + return sl; + + ht->collisions++; + } + + slot = ((hash >> SIMPLE_HASHTABLE_HASH_SECOND_HASH_SHIFTS) + 1) % ht->size; + sl = &ht->hashtable[slot]; + deleted = (!deleted && simple_hashtable_is_slot_deleted(sl)) ? sl : deleted; + + // Linear probing until we find it + SIMPLE_HASHTABLE_SLOT_NAMED *sl_started = sl; + size_t collisions_started = ht->collisions; + while (!simple_hashtable_can_use_slot_named(sl, hash, key)) { + slot = (slot + 1) % ht->size; // Wrap around if necessary + sl = &ht->hashtable[slot]; + deleted = (!deleted && simple_hashtable_is_slot_deleted(sl)) ? sl : deleted; + ht->collisions++; + + if(sl == sl_started) { + if(deleted) { + // we looped through all items, and we didn't find a free slot, + // but we have found a deleted slot, so return it. + return deleted; + } + else if(resize) { + // the hashtable is full, without any deleted slots. + // we need to resize it now. + simple_hashtable_resize_named(ht); + return simple_hashtable_get_slot_named(ht, hash, key, false); + } + else { + // the hashtable is full, but resize is false. + // this should never happen. + assert(sl != sl_started); + } + } + } + + if((ht->collisions - collisions_started) > (ht->size / 2) && ht->deleted >= (ht->size / 3)) { + // we traversed through half of the hashtable to find a slot, + // but we have more than 1/3 deleted items + ht->needs_cleanup = true; + } + + return (simple_hashtable_is_slot_unset(sl) && deleted) ? 
deleted : sl; +} + +static inline bool simple_hashtable_del_slot_named(SIMPLE_HASHTABLE_NAMED *ht, SIMPLE_HASHTABLE_SLOT_NAMED *sl) { + if(simple_hashtable_is_slot_unset(sl) || simple_hashtable_is_slot_deleted(sl)) + return false; + + ht->deletions++; + ht->deleted++; + + simple_hashtable_del_value_sorted_named(ht, SIMPLE_HASHTABLE_SLOT_DATA(sl)); + + sl->data = simple_hashtable_data_deleted; + return true; +} + +static inline void simple_hashtable_set_slot_named( + SIMPLE_HASHTABLE_NAMED *ht, SIMPLE_HASHTABLE_SLOT_NAMED *sl, + SIMPLE_HASHTABLE_HASH hash, SIMPLE_HASHTABLE_VALUE_TYPE *data) { + + if(data == NULL) + data = simple_hashtable_data_usernull; + + if(unlikely(data == simple_hashtable_data_unset || data == simple_hashtable_data_deleted)) { + simple_hashtable_del_slot_named(ht, sl); + return; + } + + if(likely(simple_hashtable_is_slot_unset(sl))) { + simple_hashtable_add_value_sorted_named(ht, data); + ht->used++; + } + + else if(unlikely(simple_hashtable_is_slot_deleted(sl))) { + ht->deleted--; + } + + else + simple_hashtable_replace_value_sorted_named(ht, SIMPLE_HASHTABLE_SLOT_DATA(sl), data); + + sl->hash = hash; + sl->data = data; + ht->additions++; +} + +// IMPORTANT +// this call invalidates all SIMPLE_HASHTABLE_SLOT_NAMED pointers +static inline void simple_hashtable_resize_named(SIMPLE_HASHTABLE_NAMED *ht) { + SIMPLE_HASHTABLE_SLOT_NAMED *old = ht->hashtable; + size_t old_size = ht->size; + + size_t new_size = ht->size; + + if(SIMPLE_HASHTABLE_NEEDS_RESIZE(ht)) + new_size = (ht->size << 1) - ((ht->size > 16) ? 1 : 0); + + ht->resizes++; + ht->size = new_size; + ht->hashtable = callocz(new_size, sizeof(*ht->hashtable)); + size_t used = 0; + for(size_t i = 0 ; i < old_size ; i++) { + SIMPLE_HASHTABLE_SLOT_NAMED *slot = &old[i]; + if(simple_hashtable_is_slot_unset(slot) || simple_hashtable_is_slot_deleted(slot)) + continue; + + SIMPLE_HASHTABLE_KEY_TYPE *key = NULL; + +#if defined(SIMPLE_HASHTABLE_COMPARE_KEYS_FUNCTION) && defined(SIMPLE_HASHTABLE_VALUE2KEY_FUNCTION) + SIMPLE_HASHTABLE_VALUE_TYPE *value = SIMPLE_HASHTABLE_SLOT_DATA(slot); + key = SIMPLE_HASHTABLE_VALUE2KEY_FUNCTION(value); +#endif + + SIMPLE_HASHTABLE_SLOT_NAMED *slot2 = simple_hashtable_get_slot_named(ht, slot->hash, key, false); + *slot2 = *slot; + used++; + } + + assert(used == ht->used - ht->deleted); + + ht->used = used; + ht->deleted = 0; + ht->needs_cleanup = false; + + freez(old); +} + +// ---------------------------------------------------------------------------- +// hashtable traversal, in read-only mode +// the hashtable should not be modified while the traversal is taking place + +static inline SIMPLE_HASHTABLE_SLOT_NAMED *simple_hashtable_first_read_only_named(SIMPLE_HASHTABLE_NAMED *ht) { + for(size_t i = 0; i < ht->size ;i++) { + SIMPLE_HASHTABLE_SLOT_NAMED *sl = &ht->hashtable[i]; + if(!simple_hashtable_is_slot_unset(sl) && !simple_hashtable_is_slot_deleted(sl)) + return sl; + } + + return NULL; +} + +static inline SIMPLE_HASHTABLE_SLOT_NAMED *simple_hashtable_next_read_only_named(SIMPLE_HASHTABLE_NAMED *ht, SIMPLE_HASHTABLE_SLOT_NAMED *last) { + if (!last) return NULL; + + // Calculate the current position in the array + size_t index = last - ht->hashtable; + + // Iterate over the hashtable starting from the next element + for (size_t i = index + 1; i < ht->size; i++) { + SIMPLE_HASHTABLE_SLOT_NAMED *sl = &ht->hashtable[i]; + if (!simple_hashtable_is_slot_unset(sl) && !simple_hashtable_is_slot_deleted(sl)) { + return sl; + } + } + + // If no more data slots are found, return NULL + return 
NULL; +} + +#define SIMPLE_HASHTABLE_FOREACH_READ_ONLY(ht, var, name) \ + for(struct simple_hashtable_slot ## name *(var) = simple_hashtable_first_read_only ## name(ht); \ + var; \ + (var) = simple_hashtable_next_read_only ## name(ht, var)) + +#define SIMPLE_HASHTABLE_FOREACH_READ_ONLY_VALUE(var) SIMPLE_HASHTABLE_SLOT_DATA(var) + +// ---------------------------------------------------------------------------- +// high level implementation + +#ifdef SIMPLE_HASHTABLE_SAMPLE_IMPLEMENTATION + +#ifndef XXH_INLINE_ALL +#define XXH_INLINE_ALL +#endif +#include "xxhash.h" + +#define simple_hashtable_set_named CONCAT(simple_hashtable_set, SIMPLE_HASHTABLE_NAME) +#define simple_hashtable_get_named CONCAT(simple_hashtable_get, SIMPLE_HASHTABLE_NAME) +#define simple_hashtable_del_named CONCAT(simple_hashtable_del, SIMPLE_HASHTABLE_NAME) + +static inline SIMPLE_HASHTABLE_VALUE_TYPE *simple_hashtable_set_named(SIMPLE_HASHTABLE_NAMED *ht, SIMPLE_HASHTABLE_KEY_TYPE *key, size_t key_len, SIMPLE_HASHTABLE_VALUE_TYPE *data) { + XXH64_hash_t hash = XXH3_64bits((void *)key, key_len); + SIMPLE_HASHTABLE_SLOT_NAMED *sl = simple_hashtable_get_slot_named(ht, hash, key, true); + simple_hashtable_set_slot_named(ht, sl, hash, data); + return SIMPLE_HASHTABLE_SLOT_DATA(sl); +} + +static inline SIMPLE_HASHTABLE_VALUE_TYPE *simple_hashtable_get_named(SIMPLE_HASHTABLE_NAMED *ht, SIMPLE_HASHTABLE_KEY_TYPE *key, size_t key_len, SIMPLE_HASHTABLE_VALUE_TYPE *data) { + XXH64_hash_t hash = XXH3_64bits((void *)key, key_len); + SIMPLE_HASHTABLE_SLOT_NAMED *sl = simple_hashtable_get_slot_named(ht, hash, key, true); + return SIMPLE_HASHTABLE_SLOT_DATA(sl); +} + +static inline bool simple_hashtable_del_named(SIMPLE_HASHTABLE_NAMED *ht, SIMPLE_HASHTABLE_KEY_TYPE *key, size_t key_len, SIMPLE_HASHTABLE_VALUE_TYPE *data) { + XXH64_hash_t hash = XXH3_64bits((void *)key, key_len); + SIMPLE_HASHTABLE_SLOT_NAMED *sl = simple_hashtable_get_slot_named(ht, hash, key, true); + return simple_hashtable_del_slot_named(ht, sl); +} + +#endif // SIMPLE_HASHTABLE_SAMPLE_IMPLEMENTATION + +// ---------------------------------------------------------------------------- +// Clear the preprocessor defines of simple_hashtable.h +// allowing simple_hashtable.h to be included multiple times +// with different configuration each time. + +#include "simple_hashtable_undef.h" + +#endif //NETDATA_SIMPLE_HASHTABLE_H diff --git a/src/libnetdata/simple_hashtable_undef.h b/src/libnetdata/simple_hashtable_undef.h new file mode 100644 index 00000000..3fe5a708 --- /dev/null +++ b/src/libnetdata/simple_hashtable_undef.h @@ -0,0 +1,35 @@ + +// this file clears the preprocessor defines of simple_hashtable.h +// allowing simple_hashtable.h to be included multiple times +// with different configuration each time. 
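+//
+// For example (a sketch - the _A and _B names are hypothetical):
+//
+//    #define SIMPLE_HASHTABLE_NAME _A
+//    #include "simple_hashtable.h"   // generates the _A flavor, then includes this undef header
+//
+//    #define SIMPLE_HASHTABLE_NAME _B
+//    #include "simple_hashtable.h"   // generates the _B flavor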
+
+#undef SIMPLE_HASHTABLE_HASH_SECOND_HASH_SHIFTS
+
+#undef simple_hashtable_init_named
+#undef simple_hashtable_destroy_named
+#undef simple_hashtable_slot_named
+#undef SIMPLE_HASHTABLE_SLOT_NAMED
+#undef simple_hashtable_named
+#undef SIMPLE_HASHTABLE_NAMED
+#undef simple_hashtable_resize_named
+#undef simple_hashtable_can_use_slot_named
+#undef simple_hashtable_get_slot_named
+#undef simple_hashtable_del_slot_named
+#undef simple_hashtable_set_slot_named
+#undef simple_hashtable_first_read_only_named
+#undef simple_hashtable_next_read_only_named
+#undef simple_hashtable_sorted_binary_search_named
+#undef simple_hashtable_add_value_sorted_named
+#undef simple_hashtable_del_value_sorted_named
+#undef simple_hashtable_replace_value_sorted_named
+#undef simple_hashtable_sorted_array_first_read_only_named
+#undef simple_hashtable_sorted_array_next_read_only_named
+
+#undef SIMPLE_HASHTABLE_SAMPLE_IMPLEMENTATION
+#undef SIMPLE_HASHTABLE_SORT_FUNCTION
+#undef SIMPLE_HASHTABLE_VALUE_TYPE
+#undef SIMPLE_HASHTABLE_KEY_TYPE
+#undef SIMPLE_HASHTABLE_VALUE2KEY_FUNCTION
+#undef SIMPLE_HASHTABLE_COMPARE_KEYS_FUNCTION
+#undef SIMPLE_HASHTABLE_NAME
+#undef NETDATA_SIMPLE_HASHTABLE_H
diff --git a/src/libnetdata/simple_pattern/README.md b/src/libnetdata/simple_pattern/README.md
new file mode 100644
index 00000000..cf8a0f64
--- /dev/null
+++ b/src/libnetdata/simple_pattern/README.md
@@ -0,0 +1,47 @@
+<!--
+title: "Simple patterns"
+description: "Netdata supports simple patterns, which are less cryptic versions of regular expressions. Use familiar notation for powerful results."
+custom_edit_url: https://github.com/netdata/netdata/edit/master/src/libnetdata/simple_pattern/README.md
+sidebar_label: "Simple patterns"
+learn_status: "Published"
+learn_topic_type: "Tasks"
+learn_rel_path: "Developers/libnetdata"
+-->
+
+# Simple patterns
+
+Unix prefers regular expressions, but they are too hard and too cryptic
+to use, write and understand.
+
+So, Netdata supports **simple patterns**.
+
+Simple patterns are a space-separated list of words that can have `*`
+as a wildcard. Each word may use any number of `*`. Simple patterns
+allow **negative** matches by prefixing a word with `!`.
+
+So, `pattern = !*bad* *` will match anything, except all those that
+contain the word `bad`.
+
+Simple patterns are quite powerful: `pattern = *foobar* !foo* !*bar *`
+matches everything containing `foobar`, except strings that start
+with `foo` or end with `bar`.
+
+You can use the Netdata command line to check simple patterns,
+like this:
+
+```sh
+# netdata -W simple-pattern '*foobar* !foo* !*bar *' 'hello world'
+RESULT: MATCHED - pattern '*foobar* !foo* !*bar *' matches 'hello world'
+
+# netdata -W simple-pattern '*foobar* !foo* !*bar *' 'hello world bar'
+RESULT: NOT MATCHED - pattern '*foobar* !foo* !*bar *' does not match 'hello world bar'
+
+# netdata -W simple-pattern '*foobar* !foo* !*bar *' 'hello world foobar'
+RESULT: MATCHED - pattern '*foobar* !foo* !*bar *' matches 'hello world foobar'
+```
+
+Netdata stops processing at the first positive or negative match
+(left to right). If a string is matched by neither a positive nor a
+negative pattern, it is denied at the end.
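+
+The same checks are available to C code through `libnetdata`. A minimal
+sketch, using the API declared in `simple_pattern.h`:
+
+```c
+#include "libnetdata/libnetdata.h"
+
+void example(void) {
+    // parse the pattern once, match many strings against it, free it at the end
+    SIMPLE_PATTERN *p = simple_pattern_create(
+        "*foobar* !foo* !*bar *", NULL, SIMPLE_PATTERN_EXACT, true);
+
+    bool matched = simple_pattern_matches(p, "hello world foobar"); // true
+    (void)matched;
+
+    simple_pattern_free(p);
+}
+```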
+ + diff --git a/src/libnetdata/simple_pattern/simple_pattern.c b/src/libnetdata/simple_pattern/simple_pattern.c new file mode 100644 index 00000000..7a7f41b1 --- /dev/null +++ b/src/libnetdata/simple_pattern/simple_pattern.c @@ -0,0 +1,406 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +struct simple_pattern { + const char *match; + uint32_t len; + + SIMPLE_PREFIX_MODE mode; + bool negative; + bool case_sensitive; + + struct simple_pattern *child; + struct simple_pattern *next; +}; + +static struct simple_pattern *parse_pattern(char *str, SIMPLE_PREFIX_MODE default_mode, size_t count) { + if(unlikely(count >= 1000)) + return NULL; + + // fprintf(stderr, "PARSING PATTERN: '%s'\n", str); + + SIMPLE_PREFIX_MODE mode; + struct simple_pattern *child = NULL; + + char *s = str, *c = str; + + // skip asterisks in front + while(*c == '*') c++; + + // find the next asterisk + while(*c && *c != '*') c++; + + // do we have an asterisk in the middle? + if(*c == '*' && c[1] != '\0') { + // yes, we have + child = parse_pattern(c, default_mode, count + 1); + c[1] = '\0'; + } + + // check what this one matches + + size_t len = strlen(s); + if(len >= 2 && *s == '*' && s[len - 1] == '*') { + s[len - 1] = '\0'; + s++; + mode = SIMPLE_PATTERN_SUBSTRING; + } + else if(len >= 1 && *s == '*') { + s++; + mode = SIMPLE_PATTERN_SUFFIX; + } + else if(len >= 1 && s[len - 1] == '*') { + s[len - 1] = '\0'; + mode = SIMPLE_PATTERN_PREFIX; + } + else + mode = default_mode; + + // allocate the structure + struct simple_pattern *m = callocz(1, sizeof(struct simple_pattern)); + if(*s) { + m->match = strdupz(s); + m->len = strlen(m->match); + m->mode = mode; + } + else { + m->mode = SIMPLE_PATTERN_SUBSTRING; + } + + m->child = child; + + return m; +} + +SIMPLE_PATTERN *simple_pattern_create(const char *list, const char *separators, SIMPLE_PREFIX_MODE default_mode, bool case_sensitive) { + struct simple_pattern *root = NULL, *last = NULL; + + if(unlikely(!list || !*list)) return root; + + char isseparator[256] = { + [' '] = 1 // space + , ['\t'] = 1 // tab + , ['\r'] = 1 // carriage return + , ['\n'] = 1 // new line + , ['\f'] = 1 // form feed + , ['\v'] = 1 // vertical tab + }; + + if (unlikely(separators && *separators)) { + memset(&isseparator[0], 0, sizeof(isseparator)); + while(*separators) isseparator[(unsigned char)*separators++] = 1; + } + + char *buf = mallocz(strlen(list) + 1); + const char *s = list; + + while(s && *s) { + buf[0] = '\0'; + char *c = buf; + + bool negative = false; + + // skip all spaces + while(isseparator[(unsigned char)*s]) + s++; + + if(*s == '!') { + negative = true; + s++; + } + + // empty string + if(unlikely(!*s)) + break; + + // find the next space + char escape = 0; + while(*s) { + if(*s == '\\' && !escape) { + escape = 1; + s++; + } + else { + if (isseparator[(unsigned char)*s] && !escape) { + s++; + break; + } + + *c++ = *s++; + escape = 0; + } + } + + // terminate our string + *c = '\0'; + + // if we matched the empty string, skip it + if(unlikely(!*buf)) + continue; + + // fprintf(stderr, "FOUND PATTERN: '%s'\n", buf); + struct simple_pattern *m = parse_pattern(buf, default_mode, 0); + m->negative = negative; + m->case_sensitive = case_sensitive; + + if(default_mode == SIMPLE_PATTERN_SUBSTRING) { + m->mode = SIMPLE_PATTERN_SUBSTRING; + + struct simple_pattern *tm; + for(tm = m; tm->child ; tm = tm->child) ; + tm->mode = SIMPLE_PATTERN_SUBSTRING; + } + + // link it at the end + if(unlikely(!root)) + root = last = m; + else { + last->next = m; + last = 
m; + } + } + + freez(buf); + return (SIMPLE_PATTERN *)root; +} + +static inline char *add_wildcarded(const char *matched, size_t matched_size, char *wildcarded, size_t *wildcarded_size) { + //if(matched_size) { + // char buf[matched_size + 1]; + // strncpyz(buf, matched, matched_size); + // fprintf(stderr, "ADD WILDCARDED '%s' of length %zu\n", buf, matched_size); + //} + + if(unlikely(wildcarded && *wildcarded_size && matched && *matched && matched_size)) { + size_t wss = *wildcarded_size - 1; + size_t len = (matched_size < wss)?matched_size:wss; + if(likely(len)) { + strncpyz(wildcarded, matched, len); + + *wildcarded_size -= len; + return &wildcarded[len]; + } + } + + return wildcarded; +} + +static inline int sp_strcmp(const char *s1, const char *s2, bool case_sensitive) { + if(case_sensitive) + return strcmp(s1, s2); + + return strcasecmp(s1, s2); +} + +static inline int sp_strncmp(const char *s1, const char *s2, size_t n, bool case_sensitive) { + if(case_sensitive) + return strncmp(s1, s2, n); + + return strncasecmp(s1, s2, n); +} + +static inline char *sp_strstr(const char *haystack, const char *needle, bool case_sensitive) { + if(case_sensitive) + return strstr(haystack, needle); + + return strcasestr(haystack, needle); +} + +static inline bool match_pattern(struct simple_pattern *m, const char *str, size_t len, char *wildcarded, size_t *wildcarded_size) { + char *s; + + bool loop = true; + while(loop && m->len <= len) { + loop = false; + + switch(m->mode) { + default: + case SIMPLE_PATTERN_EXACT: + if(unlikely(sp_strcmp(str, m->match, m->case_sensitive) == 0)) { + if(!m->child) return true; + return false; + } + break; + + case SIMPLE_PATTERN_SUBSTRING: + if(!m->len) return true; + if((s = sp_strstr(str, m->match, m->case_sensitive))) { + wildcarded = add_wildcarded(str, s - str, wildcarded, wildcarded_size); + if(!m->child) { + add_wildcarded(&s[m->len], len - (&s[m->len] - str), wildcarded, wildcarded_size); + return true; + } + + // instead of recursion + { + len = len - (s - str) - m->len; + str = &s[m->len]; + m = m->child; + loop = true; + // return match_pattern(m->child, &s[m->len], len - (s - str) - m->len, wildcarded, wildcarded_size); + } + } + break; + + case SIMPLE_PATTERN_PREFIX: + if(unlikely(sp_strncmp(str, m->match, m->len, m->case_sensitive) == 0)) { + if(!m->child) { + add_wildcarded(&str[m->len], len - m->len, wildcarded, wildcarded_size); + return true; + } + // instead of recursion + { + len = len - m->len; + str = &str[m->len]; + m = m->child; + loop = true; + // return match_pattern(m->child, &str[m->len], len - m->len, wildcarded, wildcarded_size); + } + } + break; + + case SIMPLE_PATTERN_SUFFIX: + if(unlikely(sp_strcmp(&str[len - m->len], m->match, m->case_sensitive) == 0)) { + add_wildcarded(str, len - m->len, wildcarded, wildcarded_size); + if(!m->child) return true; + return false; + } + break; + } + } + + return false; +} + +static inline SIMPLE_PATTERN_RESULT simple_pattern_matches_extract_with_length(SIMPLE_PATTERN *list, const char *str, size_t len, char *wildcarded, size_t wildcarded_size) { + struct simple_pattern *m, *root = (struct simple_pattern *)list; + + for(m = root; m ; m = m->next) { + char *ws = wildcarded; + size_t wss = wildcarded_size; + if(unlikely(ws)) *ws = '\0'; + + if (match_pattern(m, str, len, ws, &wss)) { + if (m->negative) return SP_MATCHED_NEGATIVE; + return SP_MATCHED_POSITIVE; + } + } + + return SP_NOT_MATCHED; +} + +SIMPLE_PATTERN_RESULT simple_pattern_matches_buffer_extract(SIMPLE_PATTERN *list, BUFFER *str, char 
*wildcarded, size_t wildcarded_size) {
+    if(!list || !str || !buffer_strlen(str)) return SP_NOT_MATCHED;
+    return simple_pattern_matches_extract_with_length(list, buffer_tostring(str), buffer_strlen(str), wildcarded, wildcarded_size);
+}
+
+SIMPLE_PATTERN_RESULT simple_pattern_matches_string_extract(SIMPLE_PATTERN *list, STRING *str, char *wildcarded, size_t wildcarded_size) {
+    if(!list || !str) return SP_NOT_MATCHED;
+    return simple_pattern_matches_extract_with_length(list, string2str(str), string_strlen(str), wildcarded, wildcarded_size);
+}
+
+SIMPLE_PATTERN_RESULT simple_pattern_matches_extract(SIMPLE_PATTERN *list, const char *str, char *wildcarded, size_t wildcarded_size) {
+    if(!list || !str || !*str) return SP_NOT_MATCHED;
+    return simple_pattern_matches_extract_with_length(list, str, strlen(str), wildcarded, wildcarded_size);
+}
+
+SIMPLE_PATTERN_RESULT simple_pattern_matches_length_extract(SIMPLE_PATTERN *list, const char *str, size_t len, char *wildcarded, size_t wildcarded_size) {
+    if(!list || !str || !*str || !len) return SP_NOT_MATCHED;
+    return simple_pattern_matches_extract_with_length(list, str, len, wildcarded, wildcarded_size);
+}
+
+static inline void free_pattern(struct simple_pattern *m) {
+    if(!m) return;
+
+    free_pattern(m->child);
+    free_pattern(m->next);
+    freez((void *)m->match);
+    freez(m);
+}
+
+void simple_pattern_free(SIMPLE_PATTERN *list) {
+    if(!list) return;
+
+    free_pattern(((struct simple_pattern *)list));
+}
+
+/* Debugging patterns
+
+   This code should be dead - it is useful for debugging but should not be called by production code.
+   Feel free to comment it out, but please leave it in the file.
+*/
+extern void simple_pattern_dump(uint64_t debug_type, SIMPLE_PATTERN *p)
+{
+    struct simple_pattern *root = (struct simple_pattern *)p;
+    if(root==NULL) {
+        netdata_log_debug(debug_type,"dump_pattern(NULL)");
+        return;
+    }
+    netdata_log_debug(debug_type,"dump_pattern(%p) child=%p next=%p mode=%u match=%s", root, root->child, root->next, root->mode,
+                      root->match);
+    if(root->child!=NULL)
+        simple_pattern_dump(debug_type, (SIMPLE_PATTERN*)root->child);
+    if(root->next!=NULL)
+        simple_pattern_dump(debug_type, (SIMPLE_PATTERN*)root->next);
+}
+
+/* Heuristic: decide if the pattern could match a DNS name.
+
+   Although this functionality is used directly by socket.c:connection_allowed() it must be in this file
+   because of the SIMPLE_PATTERN/simple_pattern structure hiding.
+   Based on RFC952 / RFC1123. We need to decide if the pattern may match a DNS name, or not. For the negative
+   cases we need to be sure that it can only match an ipv4 or ipv6 address:
+     * IPv6 addresses contain ':', which are illegal characters in DNS.
+     * IPv4 addresses cannot contain alphabetic characters.
+     * DNS TLDs must be alphanumeric to distinguish from IPv4.
+   Some patterns (e.g. "*a*") could match multiple cases (i.e. DNS or IPv6).
+   Some patterns will be awkward (e.g. "192.168.*") as they look like they are intended to match IPv4-only
+   but could match DNS (i.e. "192.168.com" is a valid name).
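+
+   For example (illustrative, following the rules above): "*.example.com" may be a
+   DNS name (it contains alphabetic characters), "10.1.2.*" is still considered a
+   potential name because of the wildcard, while "fe80::1" can only be an IPv6
+   address because of the ':'.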
+*/
+static void scan_is_potential_name(struct simple_pattern *p, int *alpha, int *colon, int *wildcards)
+{
+    while (p) {
+        if (p->match) {
+            if(p->mode == SIMPLE_PATTERN_EXACT && !strcmp("localhost", p->match)) {
+                p = p->child;
+                continue;
+            }
+            char const *scan = p->match;
+            while (*scan != 0) {
+                if ((*scan >= 'a' && *scan <= 'z') || (*scan >= 'A' && *scan <= 'Z'))
+                    *alpha = 1;
+                if (*scan == ':')
+                    *colon = 1;
+                scan++;
+            }
+            if (p->mode != SIMPLE_PATTERN_EXACT)
+                *wildcards = 1;
+        }
+        // always advance to the child - a node without a match (e.g. a lone '*')
+        // must not stall this loop forever
+        p = p->child;
+    }
+}
+
+extern int simple_pattern_is_potential_name(SIMPLE_PATTERN *p)
+{
+    int alpha=0, colon=0, wildcards=0;
+    struct simple_pattern *root = (struct simple_pattern*)p;
+    while (root != NULL) {
+        if (root->match != NULL) {
+            scan_is_potential_name(root, &alpha, &colon, &wildcards);
+        }
+        if (root->mode != SIMPLE_PATTERN_EXACT)
+            wildcards = 1;
+        root = root->next;
+    }
+    return (alpha || wildcards) && !colon;
+}
+
+char *simple_pattern_iterate(SIMPLE_PATTERN **p)
+{
+    struct simple_pattern *root = (struct simple_pattern *) *p;
+    struct simple_pattern **Proot = (struct simple_pattern **)p;
+
+    (*Proot) = (*Proot)->next;
+    return (char *) root->match;
+}
diff --git a/src/libnetdata/simple_pattern/simple_pattern.h b/src/libnetdata/simple_pattern/simple_pattern.h
new file mode 100644
index 00000000..1af0f87b
--- /dev/null
+++ b/src/libnetdata/simple_pattern/simple_pattern.h
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef NETDATA_SIMPLE_PATTERN_H
+#define NETDATA_SIMPLE_PATTERN_H
+
+#include "../libnetdata.h"
+
+typedef enum __attribute__ ((__packed__)) {
+    SIMPLE_PATTERN_EXACT,
+    SIMPLE_PATTERN_PREFIX,
+    SIMPLE_PATTERN_SUFFIX,
+    SIMPLE_PATTERN_SUBSTRING
+} SIMPLE_PREFIX_MODE;
+
+typedef enum __attribute__ ((__packed__)) {
+    SP_NOT_MATCHED,
+    SP_MATCHED_NEGATIVE,
+    SP_MATCHED_POSITIVE,
+} SIMPLE_PATTERN_RESULT;
+
+struct simple_pattern;
+typedef struct simple_pattern SIMPLE_PATTERN;
+
+// create a simple_pattern from the string given
+// default_mode is used in cases where EXACT matches, without an asterisk,
+// should be considered PREFIX matches.
+SIMPLE_PATTERN *simple_pattern_create(const char *list, const char *separators, SIMPLE_PREFIX_MODE default_mode, bool case_sensitive);
+
+struct netdata_string;
+
+// test if the string str matches the pattern and fill 'wildcarded' with the parts matched by '*'
+SIMPLE_PATTERN_RESULT simple_pattern_matches_extract(SIMPLE_PATTERN *list, const char *str, char *wildcarded, size_t wildcarded_size);
+SIMPLE_PATTERN_RESULT simple_pattern_matches_string_extract(SIMPLE_PATTERN *list, struct netdata_string *str, char *wildcarded, size_t wildcarded_size);
+SIMPLE_PATTERN_RESULT simple_pattern_matches_buffer_extract(SIMPLE_PATTERN *list, BUFFER *str, char *wildcarded, size_t wildcarded_size);
+SIMPLE_PATTERN_RESULT simple_pattern_matches_length_extract(SIMPLE_PATTERN *list, const char *str, size_t len, char *wildcarded, size_t wildcarded_size);
+
+// test if the string str matches the pattern
+#define simple_pattern_matches(list, str) (simple_pattern_matches_extract(list, str, NULL, 0) == SP_MATCHED_POSITIVE)
+#define simple_pattern_matches_string(list, str) (simple_pattern_matches_string_extract(list, str, NULL, 0) == SP_MATCHED_POSITIVE)
+#define simple_pattern_matches_buffer(list, str) (simple_pattern_matches_buffer_extract(list, str, NULL, 0) == SP_MATCHED_POSITIVE)
+
+// free a simple_pattern that was created with simple_pattern_create()
+// list can be NULL, in which case this does nothing.
+void simple_pattern_free(SIMPLE_PATTERN *list); + +void simple_pattern_dump(uint64_t debug_type, SIMPLE_PATTERN *p) ; +int simple_pattern_is_potential_name(SIMPLE_PATTERN *p) ; +char *simple_pattern_iterate(SIMPLE_PATTERN **p); + +#define SIMPLE_PATTERN_DEFAULT_WEB_SEPARATORS ",|\t\r\n\f\v" + +#define is_valid_sp(x) ((x) && *(x) && !((x)[0] == '*' && (x)[1] == '\0')) + +#define string_to_simple_pattern(str) (is_valid_sp(str) ? simple_pattern_create(str, SIMPLE_PATTERN_DEFAULT_WEB_SEPARATORS, SIMPLE_PATTERN_EXACT, true) : NULL) +#define string_to_simple_pattern_nocase(str) (is_valid_sp(str) ? simple_pattern_create(str, SIMPLE_PATTERN_DEFAULT_WEB_SEPARATORS, SIMPLE_PATTERN_EXACT, false) : NULL) + +#endif //NETDATA_SIMPLE_PATTERN_H diff --git a/src/libnetdata/socket/README.md b/src/libnetdata/socket/README.md new file mode 100644 index 00000000..b81cbb8d --- /dev/null +++ b/src/libnetdata/socket/README.md @@ -0,0 +1,8 @@ +<!-- +Title: "Socket" +custom_edit_url: https://github.com/netdata/netdata/edit/master/src/libnetdata/socket/README.md +sidebar_label: "Socket" +learn_status: "Published" +learn_topic_type: "References" +learn_rel_path: "Developers/libnetdata" +--> diff --git a/src/libnetdata/socket/security.c b/src/libnetdata/socket/security.c new file mode 100644 index 00000000..502998b7 --- /dev/null +++ b/src/libnetdata/socket/security.c @@ -0,0 +1,754 @@ +#include "../libnetdata.h" + +#ifdef ENABLE_HTTPS + +SSL_CTX *netdata_ssl_exporting_ctx =NULL; +SSL_CTX *netdata_ssl_streaming_sender_ctx =NULL; +SSL_CTX *netdata_ssl_web_server_ctx =NULL; +const char *netdata_ssl_security_key =NULL; +const char *netdata_ssl_security_cert =NULL; +const char *tls_version=NULL; +const char *tls_ciphers=NULL; +bool netdata_ssl_validate_certificate = true; +bool netdata_ssl_validate_certificate_sender = true; + +static SOCKET_PEERS netdata_ssl_peers(NETDATA_SSL *ssl) { + int sock_fd; + + if(unlikely(!ssl->conn)) + sock_fd = -1; + else + sock_fd = SSL_get_rfd(ssl->conn); + + return socket_peers(sock_fd); +} + +static void netdata_ssl_log_error_queue(const char *call, NETDATA_SSL *ssl, unsigned long err) { + nd_log_limit_static_thread_var(erl, 1, 0); + + if(err == SSL_ERROR_NONE) + err = ERR_get_error(); + + if(err == SSL_ERROR_NONE) + return; + + do { + char *code; + + switch (err) { + case SSL_ERROR_SSL: + code = "SSL_ERROR_SSL"; + ssl->state = NETDATA_SSL_STATE_FAILED; + break; + + case SSL_ERROR_WANT_READ: + code = "SSL_ERROR_WANT_READ"; + break; + + case SSL_ERROR_WANT_WRITE: + code = "SSL_ERROR_WANT_WRITE"; + break; + + case SSL_ERROR_WANT_X509_LOOKUP: + code = "SSL_ERROR_WANT_X509_LOOKUP"; + break; + + case SSL_ERROR_SYSCALL: + code = "SSL_ERROR_SYSCALL"; + ssl->state = NETDATA_SSL_STATE_FAILED; + break; + + case SSL_ERROR_ZERO_RETURN: + code = "SSL_ERROR_ZERO_RETURN"; + break; + + case SSL_ERROR_WANT_CONNECT: + code = "SSL_ERROR_WANT_CONNECT"; + break; + + case SSL_ERROR_WANT_ACCEPT: + code = "SSL_ERROR_WANT_ACCEPT"; + break; + +#ifdef SSL_ERROR_WANT_ASYNC + case SSL_ERROR_WANT_ASYNC: + code = "SSL_ERROR_WANT_ASYNC"; + break; +#endif + +#ifdef SSL_ERROR_WANT_ASYNC_JOB + case SSL_ERROR_WANT_ASYNC_JOB: + code = "SSL_ERROR_WANT_ASYNC_JOB"; + break; +#endif + +#ifdef SSL_ERROR_WANT_CLIENT_HELLO_CB + case SSL_ERROR_WANT_CLIENT_HELLO_CB: + code = "SSL_ERROR_WANT_CLIENT_HELLO_CB"; + break; +#endif + +#ifdef SSL_ERROR_WANT_RETRY_VERIFY + case SSL_ERROR_WANT_RETRY_VERIFY: + code = "SSL_ERROR_WANT_RETRY_VERIFY"; + break; +#endif + + default: + code = "SSL_ERROR_UNKNOWN"; + break; + } + + char str[1024 + 1]; + 
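+            // render the OpenSSL error code as human-readable text into 'str';
+            // the buffer is one byte larger than the length passed below, and the
+            // explicit terminator that follows guards against truncation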
ERR_error_string_n(err, str, 1024); + str[1024] = '\0'; + SOCKET_PEERS peers = netdata_ssl_peers(ssl); + nd_log_limit(&erl, NDLS_DAEMON, NDLP_ERR, + "SSL: %s() on socket local [[%s]:%d] <-> remote [[%s]:%d], returned error %lu (%s): %s", + call, peers.local.ip, peers.local.port, peers.peer.ip, peers.peer.port, err, code, str); + + } while((err = ERR_get_error())); +} + +bool netdata_ssl_open_ext(NETDATA_SSL *ssl, SSL_CTX *ctx, int fd, const unsigned char *alpn_protos, unsigned int alpn_protos_len) { + errno = 0; + ssl->ssl_errno = 0; + + if(ssl->conn) { + if(!ctx || SSL_get_SSL_CTX(ssl->conn) != ctx) { + SSL_free(ssl->conn); + ssl->conn = NULL; + } + else if (SSL_clear(ssl->conn) == 0) { + netdata_ssl_log_error_queue("SSL_clear", ssl, SSL_ERROR_NONE); + SSL_free(ssl->conn); + ssl->conn = NULL; + } + } + + if(!ssl->conn) { + if(!ctx) { + internal_error(true, "SSL: not CTX given"); + ssl->state = NETDATA_SSL_STATE_FAILED; + return false; + } + + ssl->conn = SSL_new(ctx); + if (!ssl->conn) { + netdata_ssl_log_error_queue("SSL_new", ssl, SSL_ERROR_NONE); + ssl->state = NETDATA_SSL_STATE_FAILED; + return false; + } + if (alpn_protos && alpn_protos_len > 0) + SSL_set_alpn_protos(ssl->conn, alpn_protos, alpn_protos_len); + } + + if(SSL_set_fd(ssl->conn, fd) != 1) { + netdata_ssl_log_error_queue("SSL_set_fd", ssl, SSL_ERROR_NONE); + ssl->state = NETDATA_SSL_STATE_FAILED; + return false; + } + + ssl->state = NETDATA_SSL_STATE_INIT; + + ERR_clear_error(); + + return true; +} + +bool netdata_ssl_open(NETDATA_SSL *ssl, SSL_CTX *ctx, int fd) { + return netdata_ssl_open_ext(ssl, ctx, fd, NULL, 0); +} + +void netdata_ssl_close(NETDATA_SSL *ssl) { + errno = 0; + ssl->ssl_errno = 0; + + if(ssl->conn) { + if(SSL_connection(ssl)) { + int ret = SSL_shutdown(ssl->conn); + if(ret == 0) + SSL_shutdown(ssl->conn); + } + + SSL_free(ssl->conn); + + ERR_clear_error(); + } + + *ssl = NETDATA_SSL_UNSET_CONNECTION; +} + +static inline bool is_handshake_complete(NETDATA_SSL *ssl, const char *op) { + nd_log_limit_static_thread_var(erl, 1, 0); + + if(unlikely(!ssl->conn)) { + internal_error(true, "SSL: trying to %s on a NULL connection", op); + return false; + } + + switch(ssl->state) { + case NETDATA_SSL_STATE_NOT_SSL: { + SOCKET_PEERS peers = netdata_ssl_peers(ssl); + nd_log_limit(&erl, NDLS_DAEMON, NDLP_WARNING, + "SSL: on socket local [[%s]:%d] <-> remote [[%s]:%d], attempt to %s on non-SSL connection", + peers.local.ip, peers.local.port, peers.peer.ip, peers.peer.port, op); + return false; + } + + case NETDATA_SSL_STATE_INIT: { + SOCKET_PEERS peers = netdata_ssl_peers(ssl); + nd_log_limit(&erl, NDLS_DAEMON, NDLP_WARNING, + "SSL: on socket local [[%s]:%d] <-> remote [[%s]:%d], attempt to %s on an incomplete connection", + peers.local.ip, peers.local.port, peers.peer.ip, peers.peer.port, op); + return false; + } + + case NETDATA_SSL_STATE_FAILED: { + SOCKET_PEERS peers = netdata_ssl_peers(ssl); + nd_log_limit(&erl, NDLS_DAEMON, NDLP_WARNING, + "SSL: on socket local [[%s]:%d] <-> remote [[%s]:%d], attempt to %s on a failed connection", + peers.local.ip, peers.local.port, peers.peer.ip, peers.peer.port, op); + return false; + } + + case NETDATA_SSL_STATE_COMPLETE: { + return true; + } + } + + return false; +} + +/* + * netdata_ssl_read() should return the same as read(): + * + * Positive value: The read() function succeeded and read some bytes. The exact number of bytes read is returned. 
+ * + * Zero: For files and sockets, a return value of zero signifies end-of-file (EOF), meaning no more data is available + * for reading. For sockets, this usually means the other side has closed the connection. + * + * -1: An error occurred. The specific error can be found by examining the errno variable. + * EAGAIN or EWOULDBLOCK: The file descriptor is in non-blocking mode, and the read operation would block. + * (These are often the same value, but can be different on some systems.) + */ + +ssize_t netdata_ssl_pending(NETDATA_SSL *ssl) { + return SSL_pending(ssl->conn); +} + +bool netdata_ssl_has_pending(NETDATA_SSL *ssl) { + // this call was added on OpenSSL 1.1.0 + // however, it is more accurate than SSL_pending() + // unfortunately it does not exists in libressl. + // return SSL_has_pending(ssl->conn); + + return SSL_pending(ssl->conn) > 0; +} + +ssize_t netdata_ssl_read(NETDATA_SSL *ssl, void *buf, size_t num) { + errno = 0; + ssl->ssl_errno = 0; + + if(unlikely(!is_handshake_complete(ssl, "read"))) + return -1; + + int bytes = SSL_read(ssl->conn, buf, (int)num); + + if(unlikely(bytes <= 0)) { + int err = SSL_get_error(ssl->conn, bytes); + if (err == SSL_ERROR_ZERO_RETURN) { + ssl->ssl_errno = err; + return 0; + } + + if (err == SSL_ERROR_WANT_READ || err == SSL_ERROR_WANT_WRITE) { + ssl->ssl_errno = err; + errno = EWOULDBLOCK; + } + else + netdata_ssl_log_error_queue("SSL_read", ssl, err); + + bytes = -1; // according to read() or recv() + } + + return bytes; +} + +/* + * netdata_ssl_write() should return the same as write(): + * + * Positive value: The write() function succeeded and wrote some bytes. The exact number of bytes written is returned. + * + * Zero: It's technically possible for write() to return zero, indicating that zero bytes were written. However, for a + * socket, this generally does not happen unless the size of the data to be written is zero. + * + * -1: An error occurred. The specific error can be found by examining the errno variable. + * EAGAIN or EWOULDBLOCK: The file descriptor is in non-blocking mode, and the write operation would block. + * (These are often the same value, but can be different on some systems.) 
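+ *
+ * So a caller can treat the result exactly as it would treat write()'s - a sketch:
+ *
+ *    ssize_t rc = netdata_ssl_write(ssl, buf, len);
+ *    if(rc < 0 && errno == EWOULDBLOCK)
+ *        ; // non-blocking socket: poll() for writability and retry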
+ */ + +ssize_t netdata_ssl_write(NETDATA_SSL *ssl, const void *buf, size_t num) { + errno = 0; + ssl->ssl_errno = 0; + + if(unlikely(!is_handshake_complete(ssl, "write"))) + return -1; + + int bytes = SSL_write(ssl->conn, (uint8_t *)buf, (int)num); + + if(unlikely(bytes <= 0)) { + int err = SSL_get_error(ssl->conn, bytes); + if (err == SSL_ERROR_WANT_READ || err == SSL_ERROR_WANT_WRITE) { + ssl->ssl_errno = err; + errno = EWOULDBLOCK; + } + else + netdata_ssl_log_error_queue("SSL_write", ssl, err); + + bytes = -1; // according to write() or send() + } + + return bytes; +} + +static inline bool is_handshake_initialized(NETDATA_SSL *ssl, const char *op) { + nd_log_limit_static_thread_var(erl, 1, 0); + + if(unlikely(!ssl->conn)) { + internal_error(true, "SSL: trying to %s on a NULL connection", op); + return false; + } + + switch(ssl->state) { + case NETDATA_SSL_STATE_NOT_SSL: { + SOCKET_PEERS peers = netdata_ssl_peers(ssl); + nd_log_limit(&erl, NDLS_DAEMON, NDLP_WARNING, + "SSL: on socket local [[%s]:%d] <-> remote [[%s]:%d], attempt to %s on non-SSL connection", + peers.local.ip, peers.local.port, peers.peer.ip, peers.peer.port, op); + return false; + } + + case NETDATA_SSL_STATE_INIT: { + return true; + } + + case NETDATA_SSL_STATE_FAILED: { + SOCKET_PEERS peers = netdata_ssl_peers(ssl); + nd_log_limit(&erl, NDLS_DAEMON, NDLP_WARNING, + "SSL: on socket local [[%s]:%d] <-> remote [[%s]:%d], attempt to %s on a failed connection", + peers.local.ip, peers.local.port, peers.peer.ip, peers.peer.port, op); + return false; + } + + case NETDATA_SSL_STATE_COMPLETE: { + SOCKET_PEERS peers = netdata_ssl_peers(ssl); + nd_log_limit(&erl, NDLS_DAEMON, NDLP_WARNING, + "SSL: on socket local [[%s]:%d] <-> remote [[%s]:%d], attempt to %s on an complete connection", + peers.local.ip, peers.local.port, peers.peer.ip, peers.peer.port, op); + return false; + } + } + + return false; +} + +#define WANT_READ_WRITE_TIMEOUT_MS 10 + +static inline bool want_read_write_should_retry(NETDATA_SSL *ssl, int err) { + int ssl_errno = SSL_get_error(ssl->conn, err); + if(ssl_errno == SSL_ERROR_WANT_READ || ssl_errno == SSL_ERROR_WANT_WRITE) { + struct pollfd pfds[1] = { [0] = { + .fd = SSL_get_rfd(ssl->conn), + .events = (short)(((ssl_errno == SSL_ERROR_WANT_READ ) ? POLLIN : 0) | + ((ssl_errno == SSL_ERROR_WANT_WRITE) ? 
+                               POLLOUT : 0)),
+        }};
+
+        if(poll(pfds, 1, WANT_READ_WRITE_TIMEOUT_MS) <= 0)
+            return false; // timeout (0) or error (<0)
+
+        return true; // we have activity, so we should retry
+    }
+
+    return false; // not a retryable error
+}
+
+bool netdata_ssl_connect(NETDATA_SSL *ssl) {
+    errno = 0;
+    ssl->ssl_errno = 0;
+
+    if(unlikely(!is_handshake_initialized(ssl, "connect")))
+        return false;
+
+    SSL_set_connect_state(ssl->conn);
+
+    int err;
+    while ((err = SSL_connect(ssl->conn)) != 1) {
+        if(!want_read_write_should_retry(ssl, err))
+            break;
+    }
+
+    if (err != 1) {
+        err = SSL_get_error(ssl->conn, err);
+        netdata_ssl_log_error_queue("SSL_connect", ssl, err);
+        ssl->state = NETDATA_SSL_STATE_FAILED;
+        return false;
+    }
+
+    ssl->state = NETDATA_SSL_STATE_COMPLETE;
+    return true;
+}
+
+bool netdata_ssl_accept(NETDATA_SSL *ssl) {
+    errno = 0;
+    ssl->ssl_errno = 0;
+
+    if(unlikely(!is_handshake_initialized(ssl, "accept")))
+        return false;
+
+    SSL_set_accept_state(ssl->conn);
+
+    int err;
+    while ((err = SSL_accept(ssl->conn)) != 1) {
+        if(!want_read_write_should_retry(ssl, err))
+            break;
+    }
+
+    if (err != 1) {
+        err = SSL_get_error(ssl->conn, err);
+        netdata_ssl_log_error_queue("SSL_accept", ssl, err);
+        ssl->state = NETDATA_SSL_STATE_FAILED;
+        return false;
+    }
+
+    ssl->state = NETDATA_SSL_STATE_COMPLETE;
+    return true;
+}
+
+/**
+ * Info Callback
+ *
+ * Function used as a callback by the OpenSSL library.
+ *
+ * @param ssl a pointer to the SSL structure of the client
+ * @param where the variable with the flags set.
+ * @param ret the error code or alert code supplied by OpenSSL.
+ */
+static void netdata_ssl_info_callback(const SSL *ssl, int where, int ret __maybe_unused) {
+    (void)ssl;
+    if (where & SSL_CB_ALERT) {
+        netdata_log_debug(D_WEB_CLIENT, "SSL INFO CALLBACK %s %s", SSL_alert_type_string(ret), SSL_alert_desc_string_long(ret));
+    }
+}
+
+/**
+ * OpenSSL Library
+ *
+ * Initializes the OpenSSL library for Netdata.
+ */
+void netdata_ssl_initialize_openssl() {
+
+#if OPENSSL_VERSION_NUMBER < OPENSSL_VERSION_110
+# if (SSLEAY_VERSION_NUMBER >= OPENSSL_VERSION_097)
+    OPENSSL_config(NULL);
+# endif
+
+    SSL_load_error_strings();
+
+    SSL_library_init();
+
+#else
+
+    if (OPENSSL_init_ssl(OPENSSL_INIT_LOAD_CONFIG, NULL) != 1) {
+        netdata_log_error("SSL library cannot be initialized.");
+    }
+
+#endif
+}
+
+#if OPENSSL_VERSION_NUMBER >= OPENSSL_VERSION_110
+/**
+ * TLS version
+ *
+ * Returns the TLS version number matching the user's configuration.
+ *
+ * @param lversion is the user input.
+ *
+ * @return it returns the version number.
+ */
+static int netdata_ssl_select_tls_version(const char *lversion) {
+    if (!strcmp(lversion, "1") || !strcmp(lversion, "1.0"))
+        return TLS1_VERSION;
+    else if (!strcmp(lversion, "1.1"))
+        return TLS1_1_VERSION;
+    else if (!strcmp(lversion, "1.2"))
+        return TLS1_2_VERSION;
+#if defined(TLS1_3_VERSION)
+    else if (!strcmp(lversion, "1.3"))
+        return TLS1_3_VERSION;
+#endif
+
+#if defined(TLS_MAX_VERSION)
+    return TLS_MAX_VERSION;
+#else
+    return TLS1_2_VERSION;
+#endif
+}
+#endif
+
+/**
+ * Initialize OpenSSL Client
+ *
+ * Creates the client SSL context, allowing TLS 1.0 up to the best version the library supports.
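+ *
+ * A minimal usage sketch (assuming an already connected socket fd; the
+ * error handling shown is illustrative, not prescriptive):
+ *
+ *     SSL_CTX *ctx = netdata_ssl_create_client_ctx(0);
+ *     NETDATA_SSL ssl = NETDATA_SSL_UNSET_CONNECTION;
+ *     if(ctx && netdata_ssl_open(&ssl, ctx, fd) && netdata_ssl_connect(&ssl)) {
+ *         // handshake complete - use netdata_ssl_read()/netdata_ssl_write()
+ *     }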
+ *
+ * @return It returns the context on success or NULL otherwise
+ */
+SSL_CTX * netdata_ssl_create_client_ctx(unsigned long mode) {
+    SSL_CTX *ctx;
+#if OPENSSL_VERSION_NUMBER < OPENSSL_VERSION_110
+    ctx = SSL_CTX_new(SSLv23_client_method());
+#else
+    ctx = SSL_CTX_new(TLS_client_method());
+#endif
+    if(ctx) {
+#if OPENSSL_VERSION_NUMBER < OPENSSL_VERSION_110
+        SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2|SSL_OP_NO_SSLv3|SSL_OP_NO_COMPRESSION);
+#else
+        SSL_CTX_set_min_proto_version(ctx, TLS1_VERSION);
+# if defined(TLS_MAX_VERSION)
+        SSL_CTX_set_max_proto_version(ctx, TLS_MAX_VERSION);
+# elif defined(TLS1_3_VERSION)
+        SSL_CTX_set_max_proto_version(ctx, TLS1_3_VERSION);
+# elif defined(TLS1_2_VERSION)
+        SSL_CTX_set_max_proto_version(ctx, TLS1_2_VERSION);
+# endif
+#endif
+    }
+
+    if(mode)
+        SSL_CTX_set_mode(ctx, mode);
+
+    return ctx;
+}
+
+/**
+ * Initialize OpenSSL server
+ *
+ * Creates the server SSL context and loads the server certificate.
+ *
+ * @return It returns the context on success or NULL otherwise
+ */
+static SSL_CTX * netdata_ssl_create_server_ctx(unsigned long mode) {
+    SSL_CTX *ctx;
+    char lerror[512];
+    static int netdata_id_context = 1;
+
+    // TODO: confirm whether the return values of the remaining OpenSSL calls also need to be checked
+#if OPENSSL_VERSION_NUMBER < OPENSSL_VERSION_110
+    ctx = SSL_CTX_new(SSLv23_server_method());
+    if (!ctx) {
+        netdata_log_error("Cannot create a new SSL context, netdata won't encrypt communication");
+        return NULL;
+    }
+
+    SSL_CTX_use_certificate_file(ctx, netdata_ssl_security_cert, SSL_FILETYPE_PEM);
+#else
+    ctx = SSL_CTX_new(TLS_server_method());
+    if (!ctx) {
+        netdata_log_error("Cannot create a new SSL context, netdata won't encrypt communication");
+        return NULL;
+    }
+
+    SSL_CTX_use_certificate_chain_file(ctx, netdata_ssl_security_cert);
+#endif
+
+#if OPENSSL_VERSION_NUMBER < OPENSSL_VERSION_110
+    SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2|SSL_OP_NO_SSLv3|SSL_OP_NO_COMPRESSION);
+#else
+    SSL_CTX_set_min_proto_version(ctx, TLS1_VERSION);
+    SSL_CTX_set_max_proto_version(ctx, netdata_ssl_select_tls_version(tls_version));
+
+    if(tls_ciphers && strcmp(tls_ciphers, "none") != 0) {
+        if (!SSL_CTX_set_cipher_list(ctx, tls_ciphers)) {
+            netdata_log_error("SSL error: cannot set the cipher list");
+        }
+    }
+#endif
+
+    SSL_CTX_use_PrivateKey_file(ctx, netdata_ssl_security_key, SSL_FILETYPE_PEM);
+
+    if (!SSL_CTX_check_private_key(ctx)) {
+        ERR_error_string_n(ERR_get_error(), lerror, sizeof(lerror));
+        netdata_log_error("SSL cannot check the private key: %s", lerror);
+        SSL_CTX_free(ctx);
+        return NULL;
+    }
+
+    SSL_CTX_set_session_id_context(ctx, (void*)&netdata_id_context, (unsigned int)sizeof(netdata_id_context));
+    SSL_CTX_set_info_callback(ctx, netdata_ssl_info_callback);
+
+#if (OPENSSL_VERSION_NUMBER < OPENSSL_VERSION_095)
+    SSL_CTX_set_verify_depth(ctx, 1);
+#endif
+    netdata_log_debug(D_WEB_CLIENT, "SSL GLOBAL CONTEXT STARTED\n");
+
+    SSL_CTX_set_mode(ctx, mode);
+
+    return ctx;
+}
+
+/**
+ * Start SSL
+ *
+ * Initializes the requested SSL context.
+ *
+ * @param selector selects the context to initialize; valid values are:
+ *     NETDATA_SSL_WEB_SERVER_CTX       - the web server context
+ *     NETDATA_SSL_STREAMING_SENDER_CTX - the streaming sender context
+ *     NETDATA_SSL_EXPORTING_CTX        - the exporting context (e.g. OpenTSDB)
+ */
+void netdata_ssl_initialize_ctx(int selector) {
+    static SPINLOCK sp = NETDATA_SPINLOCK_INITIALIZER;
+    spinlock_lock(&sp);
+
+    switch (selector) {
+        case NETDATA_SSL_WEB_SERVER_CTX: {
+            if(!netdata_ssl_web_server_ctx) {
+                struct stat statbuf;
+                if (stat(netdata_ssl_security_key, &statbuf) || stat(netdata_ssl_security_cert, &statbuf))
+                    netdata_log_info("To use encryption it is necessary to set \"ssl certificate\" and \"ssl key\" in [web]!");
+                else {
+                    netdata_ssl_web_server_ctx = netdata_ssl_create_server_ctx(
+                        SSL_MODE_ENABLE_PARTIAL_WRITE |
+                        SSL_MODE_ACCEPT_MOVING_WRITE_BUFFER |
+                        // SSL_MODE_AUTO_RETRY |
+                        0);
+
+                    if(netdata_ssl_web_server_ctx && !netdata_ssl_validate_certificate)
+                        SSL_CTX_set_verify(netdata_ssl_web_server_ctx, SSL_VERIFY_NONE, NULL);
+                }
+            }
+            break;
+        }
+
+        case NETDATA_SSL_STREAMING_SENDER_CTX: {
+            if(!netdata_ssl_streaming_sender_ctx) {
+                // These modes are necessary for streaming, which sometimes runs on non-blocking sockets.
+                // SSL_CTX_set_mode() returns the new bitmask of modes; the documentation does not describe any errors.
+                netdata_ssl_streaming_sender_ctx = netdata_ssl_create_client_ctx(
+                    SSL_MODE_ENABLE_PARTIAL_WRITE |
+                    SSL_MODE_ACCEPT_MOVING_WRITE_BUFFER |
+                    // SSL_MODE_AUTO_RETRY |
+                    0
+                );
+
+                if(netdata_ssl_streaming_sender_ctx && !netdata_ssl_validate_certificate_sender)
+                    SSL_CTX_set_verify(netdata_ssl_streaming_sender_ctx, SSL_VERIFY_NONE, NULL);
+            }
+            break;
+        }
+
+        case NETDATA_SSL_EXPORTING_CTX: {
+            if(!netdata_ssl_exporting_ctx) {
+                netdata_ssl_exporting_ctx = netdata_ssl_create_client_ctx(0);
+
+                if(netdata_ssl_exporting_ctx && !netdata_ssl_validate_certificate)
+                    SSL_CTX_set_verify(netdata_ssl_exporting_ctx, SSL_VERIFY_NONE, NULL);
+            }
+            break;
+        }
+    }
+
+    spinlock_unlock(&sp);
+}
+
+/**
+ * Clean OpenSSL
+ *
+ * Cleans up all the SSL contexts allocated by netdata.
+ */
+void netdata_ssl_cleanup()
+{
+    if (netdata_ssl_web_server_ctx) {
+        SSL_CTX_free(netdata_ssl_web_server_ctx);
+        netdata_ssl_web_server_ctx = NULL;
+    }
+
+    if (netdata_ssl_streaming_sender_ctx) {
+        SSL_CTX_free(netdata_ssl_streaming_sender_ctx);
+        netdata_ssl_streaming_sender_ctx = NULL;
+    }
+
+    if (netdata_ssl_exporting_ctx) {
+        SSL_CTX_free(netdata_ssl_exporting_ctx);
+        netdata_ssl_exporting_ctx = NULL;
+    }
+
+#if OPENSSL_VERSION_NUMBER < OPENSSL_VERSION_110
+    ERR_free_strings();
+#endif
+}
+
+/**
+ * Test Certificate
+ *
+ * Checks the certificate of the Netdata parent.
+ *
+ * @param ssl is the connection structure
+ *
+ * @return It returns 0 on success and -1 otherwise
+ */
+int security_test_certificate(SSL *ssl) {
+    X509* cert = SSL_get_peer_certificate(ssl);
+    int ret;
+    long status;
+    if (!cert) {
+        return -1;
+    }
+
+    status = SSL_get_verify_result(ssl);
+    if((X509_V_OK != status))
+    {
+        char error[512];
+        ERR_error_string_n(ERR_get_error(), error, sizeof(error));
+        netdata_log_error("SSL RFC4158 check: we have an invalid certificate; verification returned %ld with message %s", status, error);
+        ret = -1;
+    } else {
+        ret = 0;
+    }
+
+    return ret;
+}
+
+/**
+ * Location for context
+ *
+ * When the user gives us a CA certificate file or a directory with the available
+ * certificates, this function loads them into the context so the Netdata parent
+ * certificate can be validated.
+ *
+ * @param ctx the context where the path will be set.
+ * @param file the file with the Netdata parent certificate.
+ * @param path the directory where the certificates are stored.
+ *
+ * @return It returns 0 on success and -1 otherwise.
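+ *
+ * Illustrative call (the paths shown are hypothetical examples, not defaults):
+ *
+ *     if(ssl_security_location_for_context(ctx, "/etc/ssl/parent.pem", "/etc/ssl/certs") == -1)
+ *         ; // neither the custom locations nor the default OpenSSL paths could be loaded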
+ */ +int ssl_security_location_for_context(SSL_CTX *ctx, char *file, char *path) { + int load_custom = 1, load_default = 1; + if (file || path) { + if(!SSL_CTX_load_verify_locations(ctx, file, path)) { + netdata_log_info("Netdata can not verify custom CAfile or CApath for parent's SSL certificate, so it will use the default OpenSSL configuration to validate certificates!"); + load_custom = 0; + } + } + + if(!SSL_CTX_set_default_verify_paths(ctx)) { + netdata_log_info("Can not verify default OpenSSL configuration to validate certificates!"); + load_default = 0; + } + + if (load_custom == 0 && load_default == 0) + return -1; + + return 0; +} +#endif diff --git a/src/libnetdata/socket/security.h b/src/libnetdata/socket/security.h new file mode 100644 index 00000000..283d81db --- /dev/null +++ b/src/libnetdata/socket/security.h @@ -0,0 +1,77 @@ +#ifndef NETDATA_SECURITY_H +# define NETDATA_SECURITY_H + +typedef enum __attribute__((packed)) { + NETDATA_SSL_STATE_NOT_SSL = 1, // This connection is not SSL + NETDATA_SSL_STATE_INIT, // SSL handshake is initialized + NETDATA_SSL_STATE_FAILED, // SSL handshake failed + NETDATA_SSL_STATE_COMPLETE, // SSL handshake successful +} NETDATA_SSL_STATE; + +#define NETDATA_SSL_WEB_SERVER_CTX 0 +#define NETDATA_SSL_STREAMING_SENDER_CTX 1 +#define NETDATA_SSL_EXPORTING_CTX 2 + +# ifdef ENABLE_HTTPS + +#define OPENSSL_VERSION_095 0x00905100L +#define OPENSSL_VERSION_097 0x0907000L +#define OPENSSL_VERSION_110 0x10100000L +#define OPENSSL_VERSION_111 0x10101000L +#define OPENSSL_VERSION_300 0x30000000L + +# include <openssl/ssl.h> +# include <openssl/err.h> +# include <openssl/evp.h> +# include <openssl/pem.h> +# if (SSLEAY_VERSION_NUMBER >= OPENSSL_VERSION_097) && (OPENSSL_VERSION_NUMBER < OPENSSL_VERSION_110) +# include <openssl/conf.h> +# endif + +#if OPENSSL_VERSION_NUMBER >= OPENSSL_VERSION_300 +#include <openssl/core_names.h> +#include <openssl/decoder.h> +#endif + +typedef struct netdata_ssl { + SSL *conn; // SSL connection + NETDATA_SSL_STATE state; // The state for SSL connection + unsigned long ssl_errno; // The SSL errno of the last SSL call +} NETDATA_SSL; + +#define NETDATA_SSL_UNSET_CONNECTION (NETDATA_SSL){ .conn = NULL, .state = NETDATA_SSL_STATE_NOT_SSL, .ssl_errno = 0 } + +#define SSL_connection(ssl) ((ssl)->conn && (ssl)->state != NETDATA_SSL_STATE_NOT_SSL) + +extern SSL_CTX *netdata_ssl_exporting_ctx; +extern SSL_CTX *netdata_ssl_streaming_sender_ctx; +extern SSL_CTX *netdata_ssl_web_server_ctx; +extern const char *netdata_ssl_security_key; +extern const char *netdata_ssl_security_cert; +extern const char *tls_version; +extern const char *tls_ciphers; +extern bool netdata_ssl_validate_certificate; +extern bool netdata_ssl_validate_certificate_sender; +int ssl_security_location_for_context(SSL_CTX *ctx,char *file,char *path); + +void netdata_ssl_initialize_openssl(); +void netdata_ssl_cleanup(); +void netdata_ssl_initialize_ctx(int selector); +int security_test_certificate(SSL *ssl); +SSL_CTX * netdata_ssl_create_client_ctx(unsigned long mode); + +bool netdata_ssl_connect(NETDATA_SSL *ssl); +bool netdata_ssl_accept(NETDATA_SSL *ssl); + +bool netdata_ssl_open(NETDATA_SSL *ssl, SSL_CTX *ctx, int fd); +bool netdata_ssl_open_ext(NETDATA_SSL *ssl, SSL_CTX *ctx, int fd, const unsigned char *alpn_protos, unsigned int alpn_protos_len); +void netdata_ssl_close(NETDATA_SSL *ssl); + +ssize_t netdata_ssl_read(NETDATA_SSL *ssl, void *buf, size_t num); +ssize_t netdata_ssl_write(NETDATA_SSL *ssl, const void *buf, size_t num); + +ssize_t 
netdata_ssl_pending(NETDATA_SSL *ssl); +bool netdata_ssl_has_pending(NETDATA_SSL *ssl); + +# endif //ENABLE_HTTPS +#endif //NETDATA_SECURITY_H diff --git a/src/libnetdata/socket/socket.c b/src/libnetdata/socket/socket.c new file mode 100644 index 00000000..0ba24b74 --- /dev/null +++ b/src/libnetdata/socket/socket.c @@ -0,0 +1,2179 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE // for POLLRDHUP +#endif + +#ifndef __BSD_VISIBLE +#define __BSD_VISIBLE // for POLLRDHUP +#endif + +#include "../libnetdata.h" + +bool ip_to_hostname(const char *ip, char *dst, size_t dst_len) { + if(!dst || !dst_len) + return false; + + struct sockaddr_in sa; + struct sockaddr_in6 sa6; + struct sockaddr *sa_ptr; + int sa_len; + + // Try to convert the IP address to sockaddr_in (IPv4) + if (inet_pton(AF_INET, ip, &(sa.sin_addr)) == 1) { + sa.sin_family = AF_INET; + sa_ptr = (struct sockaddr *)&sa; + sa_len = sizeof(sa); + } + // Try to convert the IP address to sockaddr_in6 (IPv6) + else if (inet_pton(AF_INET6, ip, &(sa6.sin6_addr)) == 1) { + sa6.sin6_family = AF_INET6; + sa_ptr = (struct sockaddr *)&sa6; + sa_len = sizeof(sa6); + } + + else { + dst[0] = '\0'; + return false; + } + + // Perform the reverse lookup + int res = getnameinfo(sa_ptr, sa_len, dst, dst_len, NULL, 0, NI_NAMEREQD); + if(res != 0) + return false; + + return true; +} + +SOCKET_PEERS socket_peers(int sock_fd) { + SOCKET_PEERS peers; + + if(sock_fd < 0) { + strncpyz(peers.peer.ip, "not connected", sizeof(peers.peer.ip) - 1); + peers.peer.port = 0; + + strncpyz(peers.local.ip, "not connected", sizeof(peers.local.ip) - 1); + peers.local.port = 0; + + return peers; + } + + struct sockaddr_storage addr; + socklen_t addr_len = sizeof(addr); + + // Get peer info + if (getpeername(sock_fd, (struct sockaddr *)&addr, &addr_len) == 0) { + if (addr.ss_family == AF_INET) { // IPv4 + struct sockaddr_in *s = (struct sockaddr_in *)&addr; + inet_ntop(AF_INET, &s->sin_addr, peers.peer.ip, sizeof(peers.peer.ip)); + peers.peer.port = ntohs(s->sin_port); + } + else { // IPv6 + struct sockaddr_in6 *s = (struct sockaddr_in6 *)&addr; + inet_ntop(AF_INET6, &s->sin6_addr, peers.peer.ip, sizeof(peers.peer.ip)); + peers.peer.port = ntohs(s->sin6_port); + } + } + else { + strncpyz(peers.peer.ip, "unknown", sizeof(peers.peer.ip) - 1); + peers.peer.port = 0; + } + + // Get local info + addr_len = sizeof(addr); + if (getsockname(sock_fd, (struct sockaddr *)&addr, &addr_len) == 0) { + if (addr.ss_family == AF_INET) { // IPv4 + struct sockaddr_in *s = (struct sockaddr_in *) &addr; + inet_ntop(AF_INET, &s->sin_addr, peers.local.ip, sizeof(peers.local.ip)); + peers.local.port = ntohs(s->sin_port); + } else { // IPv6 + struct sockaddr_in6 *s = (struct sockaddr_in6 *) &addr; + inet_ntop(AF_INET6, &s->sin6_addr, peers.local.ip, sizeof(peers.local.ip)); + peers.local.port = ntohs(s->sin6_port); + } + } + else { + strncpyz(peers.local.ip, "unknown", sizeof(peers.local.ip) - 1); + peers.local.port = 0; + } + + return peers; +} + + +// -------------------------------------------------------------------------------------------------------------------- +// various library calls + +#ifdef __gnu_linux__ +#define LARGE_SOCK_SIZE 33554431 // don't ask why - I found it at brubeck source - I guess it is just a large number +#else +#define LARGE_SOCK_SIZE 4096 +#endif + +bool fd_is_socket(int fd) { + int type; + socklen_t len = sizeof(type); + if (getsockopt(fd, SOL_SOCKET, SO_TYPE, &type, &len) == -1) + return false; + + return true; +} + +bool 
sock_has_output_error(int fd) { + if(fd < 0) { + //internal_error(true, "invalid socket %d", fd); + return false; + } + +// if(!fd_is_socket(fd)) { +// //internal_error(true, "fd %d is not a socket", fd); +// return false; +// } + + short int errors = POLLERR | POLLHUP | POLLNVAL; + +#ifdef POLLRDHUP + errors |= POLLRDHUP; +#endif + + struct pollfd pfd = { + .fd = fd, + .events = POLLOUT | errors, + .revents = 0, + }; + + if(poll(&pfd, 1, 0) == -1) { + //internal_error(true, "poll() failed"); + return false; + } + + return ((pfd.revents & errors) || !(pfd.revents & POLLOUT)); +} + +int sock_setnonblock(int fd) { + int flags; + + flags = fcntl(fd, F_GETFL); + flags |= O_NONBLOCK; + + int ret = fcntl(fd, F_SETFL, flags); + if(ret < 0) + nd_log(NDLS_DAEMON, NDLP_ERR, + "Failed to set O_NONBLOCK on socket %d", + fd); + + return ret; +} + +int sock_delnonblock(int fd) { + int flags; + + flags = fcntl(fd, F_GETFL); + flags &= ~O_NONBLOCK; + + int ret = fcntl(fd, F_SETFL, flags); + if(ret < 0) + nd_log(NDLS_DAEMON, NDLP_ERR, + "Failed to remove O_NONBLOCK on socket %d", + fd); + + return ret; +} + +int sock_setreuse(int fd, int reuse) { + int ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse)); + + if(ret == -1) + nd_log(NDLS_DAEMON, NDLP_ERR, + "Failed to set SO_REUSEADDR on socket %d", + fd); + + return ret; +} + +void sock_setcloexec(int fd) +{ + UNUSED(fd); +#ifndef SOCK_CLOEXEC + int flags = fcntl(fd, F_GETFD); + if (flags != -1) + (void) fcntl(fd, F_SETFD, flags | FD_CLOEXEC); +#endif +} + +int sock_setreuse_port(int fd __maybe_unused, int reuse __maybe_unused) { + int ret; + +#ifdef SO_REUSEPORT + ret = setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &reuse, sizeof(reuse)); + if(ret == -1 && errno != ENOPROTOOPT) + nd_log(NDLS_DAEMON, NDLP_ERR, + "failed to set SO_REUSEPORT on socket %d", + fd); +#else + ret = -1; +#endif + + return ret; +} + +int sock_enlarge_in(int fd) { + int ret, bs = LARGE_SOCK_SIZE; + + ret = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &bs, sizeof(bs)); + + if(ret == -1) + nd_log(NDLS_DAEMON, NDLP_ERR, + "Failed to set SO_RCVBUF on socket %d", + fd); + + return ret; +} + +int sock_enlarge_out(int fd) { + int ret, bs = LARGE_SOCK_SIZE; + ret = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &bs, sizeof(bs)); + + if(ret == -1) + nd_log(NDLS_DAEMON, NDLP_ERR, + "Failed to set SO_SNDBUF on socket %d", + fd); + + return ret; +} + + +// -------------------------------------------------------------------------------------------------------------------- + +char *strdup_client_description(int family, const char *protocol, const char *ip, uint16_t port) { + char buffer[100 + 1]; + + switch(family) { + case AF_INET: + snprintfz(buffer, sizeof(buffer) - 1, "%s:%s:%d", protocol, ip, port); + break; + + case AF_INET6: + default: + snprintfz(buffer, sizeof(buffer) - 1, "%s:[%s]:%d", protocol, ip, port); + break; + + case AF_UNIX: + snprintfz(buffer, sizeof(buffer) - 1, "%s:%s", protocol, ip); + break; + } + + return strdupz(buffer); +} + +// -------------------------------------------------------------------------------------------------------------------- +// listening sockets + +int create_listen_socket_unix(const char *path, int listen_backlog) { + int sock; + + sock = socket(AF_UNIX, SOCK_STREAM | DEFAULT_SOCKET_FLAGS, 0); + if(sock < 0) { + nd_log(NDLS_DAEMON, NDLP_ERR, + "LISTENER: UNIX socket() on path '%s' failed.", + path); + + return -1; + } + + sock_setnonblock(sock); + sock_setcloexec(sock); + sock_enlarge_in(sock); + + struct sockaddr_un name; + memset(&name, 0, 
sizeof(struct sockaddr_un)); + name.sun_family = AF_UNIX; + strncpy(name.sun_path, path, sizeof(name.sun_path)-1); + + errno = 0; + if (unlink(path) == -1 && errno != ENOENT) + nd_log(NDLS_DAEMON, NDLP_ERR, + "LISTENER: failed to remove existing (probably obsolete or left-over) file on UNIX socket path '%s'.", + path); + + if(bind (sock, (struct sockaddr *) &name, sizeof (name)) < 0) { + close(sock); + nd_log(NDLS_DAEMON, NDLP_ERR, + "LISTENER: UNIX bind() on path '%s' failed.", + path); + + return -1; + } + + // we have to chmod this to 0777 so that the client will be able + // to read from and write to this socket. + if(chmod(path, 0777) == -1) + nd_log(NDLS_DAEMON, NDLP_ERR, + "LISTENER: failed to chmod() socket file '%s'.", + path); + + if(listen(sock, listen_backlog) < 0) { + close(sock); + nd_log(NDLS_DAEMON, NDLP_ERR, + "LISTENER: UNIX listen() on path '%s' failed.", + path); + + return -1; + } + + return sock; +} + +int create_listen_socket4(int socktype, const char *ip, uint16_t port, int listen_backlog) { + int sock; + + sock = socket(AF_INET, socktype | DEFAULT_SOCKET_FLAGS, 0); + if(sock < 0) { + nd_log(NDLS_DAEMON, NDLP_ERR, + "LISTENER: IPv4 socket() on ip '%s' port %d, socktype %d failed.", + ip, port, socktype); + + return -1; + } + sock_setreuse(sock, 1); + sock_setreuse_port(sock, 0); + sock_setnonblock(sock); + sock_setcloexec(sock); + sock_enlarge_in(sock); + + struct sockaddr_in name; + memset(&name, 0, sizeof(struct sockaddr_in)); + name.sin_family = AF_INET; + name.sin_port = htons (port); + + int ret = inet_pton(AF_INET, ip, (void *)&name.sin_addr.s_addr); + if(ret != 1) { + nd_log(NDLS_DAEMON, NDLP_ERR, + "LISTENER: Failed to convert IP '%s' to a valid IPv4 address.", + ip); + + close(sock); + return -1; + } + + if(bind (sock, (struct sockaddr *) &name, sizeof (name)) < 0) { + close(sock); + nd_log(NDLS_DAEMON, NDLP_ERR, + "LISTENER: IPv4 bind() on ip '%s' port %d, socktype %d failed.", + ip, port, socktype); + + return -1; + } + + if(socktype == SOCK_STREAM && listen(sock, listen_backlog) < 0) { + close(sock); + nd_log(NDLS_DAEMON, NDLP_ERR, + "LISTENER: IPv4 listen() on ip '%s' port %d, socktype %d failed.", + ip, port, socktype); + + return -1; + } + + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "LISTENER: Listening on IPv4 ip '%s' port %d, socktype %d", + ip, port, socktype); + + return sock; +} + +int create_listen_socket6(int socktype, uint32_t scope_id, const char *ip, int port, int listen_backlog) { + int sock; + int ipv6only = 1; + + sock = socket(AF_INET6, socktype | DEFAULT_SOCKET_FLAGS, 0); + if (sock < 0) { + nd_log(NDLS_DAEMON, NDLP_ERR, + "LISTENER: IPv6 socket() on ip '%s' port %d, socktype %d, failed.", + ip, port, socktype); + + return -1; + } + sock_setreuse(sock, 1); + sock_setreuse_port(sock, 0); + sock_setnonblock(sock); + sock_setcloexec(sock); + sock_enlarge_in(sock); + + /* IPv6 only */ + if(setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, (void*)&ipv6only, sizeof(ipv6only)) != 0) + nd_log(NDLS_DAEMON, NDLP_ERR, + "LISTENER: Cannot set IPV6_V6ONLY on ip '%s' port %d, socktype %d.", + ip, port, socktype); + + struct sockaddr_in6 name; + memset(&name, 0, sizeof(struct sockaddr_in6)); + name.sin6_family = AF_INET6; + name.sin6_port = htons ((uint16_t) port); + name.sin6_scope_id = scope_id; + + int ret = inet_pton(AF_INET6, ip, (void *)&name.sin6_addr.s6_addr); + if(ret != 1) { + nd_log(NDLS_DAEMON, NDLP_ERR, + "LISTENER: Failed to convert IP '%s' to a valid IPv6 address.", + ip); + + close(sock); + return -1; + } + + name.sin6_scope_id = scope_id; + + if 
(bind (sock, (struct sockaddr *) &name, sizeof (name)) < 0) {
+        close(sock);
+        nd_log(NDLS_DAEMON, NDLP_ERR,
+               "LISTENER: IPv6 bind() on ip '%s' port %d, socktype %d failed.",
+               ip, port, socktype);
+
+        return -1;
+    }
+
+    if (socktype == SOCK_STREAM && listen(sock, listen_backlog) < 0) {
+        close(sock);
+        nd_log(NDLS_DAEMON, NDLP_ERR,
+               "LISTENER: IPv6 listen() on ip '%s' port %d, socktype %d failed.",
+               ip, port, socktype);
+
+        return -1;
+    }
+
+    nd_log(NDLS_DAEMON, NDLP_DEBUG,
+           "LISTENER: Listening on IPv6 ip '%s' port %d, socktype %d",
+           ip, port, socktype);
+
+    return sock;
+}
+
+static inline int listen_sockets_add(LISTEN_SOCKETS *sockets, int fd, int family, int socktype, const char *protocol, const char *ip, uint16_t port, int acl_flags) {
+    if(sockets->opened >= MAX_LISTEN_FDS) {
+        nd_log(NDLS_DAEMON, NDLP_ERR,
+               "LISTENER: Too many listening sockets. Failed to add listening socket at ip '%s' port %d, protocol %s, socktype %d",
+               ip, port, protocol, socktype);
+
+        close(fd);
+        return -1;
+    }
+
+    sockets->fds[sockets->opened] = fd;
+    sockets->fds_types[sockets->opened] = socktype;
+    sockets->fds_families[sockets->opened] = family;
+    sockets->fds_names[sockets->opened] = strdup_client_description(family, protocol, ip, port);
+    sockets->fds_acl_flags[sockets->opened] = acl_flags;
+
+    sockets->opened++;
+    return 0;
+}
+
+int listen_sockets_check_is_member(LISTEN_SOCKETS *sockets, int fd) {
+    size_t i;
+    for(i = 0; i < sockets->opened ;i++)
+        if(sockets->fds[i] == fd) return 1;
+
+    return 0;
+}
+
+static inline void listen_sockets_init(LISTEN_SOCKETS *sockets) {
+    size_t i;
+    for(i = 0; i < MAX_LISTEN_FDS ;i++) {
+        sockets->fds[i] = -1;
+        sockets->fds_names[i] = NULL;
+        sockets->fds_types[i] = -1;
+    }
+
+    sockets->opened = 0;
+    sockets->failed = 0;
+}
+
+void listen_sockets_close(LISTEN_SOCKETS *sockets) {
+    size_t i;
+    for(i = 0; i < sockets->opened ;i++) {
+        close(sockets->fds[i]);
+        sockets->fds[i] = -1;
+
+        freez(sockets->fds_names[i]);
+        sockets->fds_names[i] = NULL;
+
+        sockets->fds_types[i] = -1;
+    }
+
+    sockets->opened = 0;
+    sockets->failed = 0;
+}
+
+/*
+ * SSL ACL
+ *
+ * Searches the ACL string for an SSL directive and applies it when present.
+ *
+ * @param acl is the acl given by the user.
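+ *
+ * Example: for the token 'streaming^SSL=force' this returns HTTP_ACL_SSL_FORCE
+ * and truncates the string at the '^', leaving 'streaming' for the keyword match.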
+ */
+HTTP_ACL socket_ssl_acl(char *acl) {
+    char *ssl = strchr(acl,'^');
+    if(ssl) {
+        // Due to its format, the SSL directive is always the last token;
+        // we terminate the string here to avoid problems parsing the ACLs.
+        *ssl = '\0';
+#ifdef ENABLE_HTTPS
+        ssl++;
+        if (!strncmp("SSL=",ssl,4)) {
+            ssl += 4;
+            if (!strcmp(ssl,"optional")) {
+                return HTTP_ACL_SSL_OPTIONAL;
+            }
+            else if (!strcmp(ssl,"force")) {
+                return HTTP_ACL_SSL_FORCE;
+            }
+        }
+#endif
+    }
+
+    return HTTP_ACL_NONE;
+}
+
+HTTP_ACL read_acl(char *st) {
+    HTTP_ACL ret = socket_ssl_acl(st);
+
+    if (!strcmp(st,"dashboard")) ret |= HTTP_ACL_DASHBOARD;
+    if (!strcmp(st,"registry")) ret |= HTTP_ACL_REGISTRY;
+    if (!strcmp(st,"badges")) ret |= HTTP_ACL_BADGES;
+    if (!strcmp(st,"management")) ret |= HTTP_ACL_MANAGEMENT;
+    if (!strcmp(st,"streaming")) ret |= HTTP_ACL_STREAMING;
+    if (!strcmp(st,"netdata.conf")) ret |= HTTP_ACL_NETDATACONF;
+
+    return ret;
+}
+
+static inline int bind_to_this(LISTEN_SOCKETS *sockets, const char *definition, uint16_t default_port, int listen_backlog) {
+    int added = 0;
+    HTTP_ACL acl_flags = HTTP_ACL_NONE;
+
+    struct addrinfo hints;
+    struct addrinfo *result = NULL, *rp = NULL;
+
+    char buffer[strlen(definition) + 1];
+    strcpy(buffer, definition);
+
+    char buffer2[10 + 1];
+    snprintfz(buffer2, 10, "%d", default_port);
+
+    char *ip = buffer, *port = buffer2, *interface = "", *portconfig;
+
+    int protocol = IPPROTO_TCP, socktype = SOCK_STREAM;
+    const char *protocol_str = "tcp";
+
+    if(strncmp(ip, "tcp:", 4) == 0) {
+        ip += 4;
+        protocol = IPPROTO_TCP;
+        socktype = SOCK_STREAM;
+        protocol_str = "tcp";
+        acl_flags |= HTTP_ACL_API;
+    }
+    else if(strncmp(ip, "udp:", 4) == 0) {
+        ip += 4;
+        protocol = IPPROTO_UDP;
+        socktype = SOCK_DGRAM;
+        protocol_str = "udp";
+        acl_flags |= HTTP_ACL_API_UDP;
+    }
+    else if(strncmp(ip, "unix:", 5) == 0) {
+        char *path = ip + 5;
+        socktype = SOCK_STREAM;
+        protocol_str = "unix";
+        int fd = create_listen_socket_unix(path, listen_backlog);
+        if (fd == -1) {
+            nd_log(NDLS_DAEMON, NDLP_ERR,
+                   "LISTENER: Cannot create unix socket '%s'",
+                   path);
+
+            sockets->failed++;
+        } else {
+            acl_flags = HTTP_ACL_API_UNIX | HTTP_ACL_DASHBOARD | HTTP_ACL_REGISTRY | HTTP_ACL_BADGES |
+                        HTTP_ACL_MANAGEMENT | HTTP_ACL_NETDATACONF | HTTP_ACL_STREAMING | HTTP_ACL_SSL_DEFAULT;
+            listen_sockets_add(sockets, fd, AF_UNIX, socktype, protocol_str, path, 0, acl_flags);
+            added++;
+        }
+        return added;
+    }
+
+    char *e = ip;
+    if(*e == '[') {
+        e = ++ip;
+        while(*e && *e != ']') e++;
+        if(*e == ']') {
+            *e = '\0';
+            e++;
+        }
+    }
+    else {
+        while(*e && *e != ':' && *e != '%' && *e != '=') e++;
+    }
+
+    if(*e == '%') {
+        *e = '\0';
+        e++;
+        interface = e;
+        while(*e && *e != ':' && *e != '=') e++;
+    }
+
+    if(*e == ':') {
+        port = e + 1;
+        *e = '\0';
+        e++;
+        while(*e && *e != '=') e++;
+    }
+
+    if(*e == '=') {
+        *e = '\0';
+        e++;
+        portconfig = e;
+        while (*e != '\0') {
+            if (*e == '|') {
+                *e = '\0';
+                acl_flags |= read_acl(portconfig);
+                e++;
+                portconfig = e;
+                continue;
+            }
+            e++;
+        }
+        acl_flags |= read_acl(portconfig);
+    } else {
+        acl_flags |= HTTP_ACL_DASHBOARD | HTTP_ACL_REGISTRY | HTTP_ACL_BADGES | HTTP_ACL_MANAGEMENT | HTTP_ACL_NETDATACONF | HTTP_ACL_STREAMING | HTTP_ACL_SSL_DEFAULT;
+    }
+
+    // If the user does not set an SSL option in 'bind to' but provides the
+    // certificates, we must still be able to redirect, so assume the default
+    // SSL option here.
+    if(!(acl_flags & HTTP_ACL_SSL_OPTIONAL) && !(acl_flags & HTTP_ACL_SSL_FORCE)) {
+        acl_flags |= HTTP_ACL_SSL_DEFAULT;
+    }
+
+    uint32_t scope_id = 0;
+    if(*interface) {
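+        // resolve the interface name given after '%' in the 'bind to' definition
+        // (e.g. '[fe80::1]%eth0:19999') to its kernel index; if_nametoindex()
+        // returns 0 when no such interface exists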
+        scope_id = if_nametoindex(interface);
+        if(!scope_id)
+            nd_log(NDLS_DAEMON, NDLP_ERR,
+                   "LISTENER: Cannot find a network interface named '%s'. "
+                   "Continuing without binding to a specific network interface.",
+                   interface);
+    }
+
+    if(!*ip || *ip == '*' || !strcmp(ip, "any") || !strcmp(ip, "all"))
+        ip = NULL;
+
+    if(!*port)
+        port = buffer2;
+
+    memset(&hints, 0, sizeof(hints));
+    hints.ai_family = AF_UNSPEC;    /* Allow IPv4 or IPv6 */
+    hints.ai_socktype = socktype;
+    hints.ai_flags = AI_PASSIVE;    /* For wildcard IP address */
+    hints.ai_protocol = protocol;
+    hints.ai_canonname = NULL;
+    hints.ai_addr = NULL;
+    hints.ai_next = NULL;
+
+    int r = getaddrinfo(ip, port, &hints, &result);
+    if (r != 0) {
+        nd_log(NDLS_DAEMON, NDLP_ERR,
+               "LISTENER: getaddrinfo('%s', '%s'): %s",
+               ip, port, gai_strerror(r));
+
+        return -1;
+    }
+
+    for (rp = result; rp != NULL; rp = rp->ai_next) {
+        int fd = -1;
+        int family;
+
+        char rip[INET_ADDRSTRLEN + INET6_ADDRSTRLEN] = "INVALID";
+        uint16_t rport = default_port;
+
+        family = rp->ai_addr->sa_family;
+        switch (family) {
+            case AF_INET: {
+                struct sockaddr_in *sin = (struct sockaddr_in *) rp->ai_addr;
+                inet_ntop(AF_INET, &sin->sin_addr, rip, INET_ADDRSTRLEN);
+                rport = ntohs(sin->sin_port);
+                fd = create_listen_socket4(socktype, rip, rport, listen_backlog);
+                break;
+            }
+
+            case AF_INET6: {
+                struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) rp->ai_addr;
+                inet_ntop(AF_INET6, &sin6->sin6_addr, rip, INET6_ADDRSTRLEN);
+                rport = ntohs(sin6->sin6_port);
+                fd = create_listen_socket6(socktype, scope_id, rip, rport, listen_backlog);
+                break;
+            }
+
+            default:
+                nd_log(NDLS_DAEMON, NDLP_DEBUG,
+                       "LISTENER: Unknown socket family %d",
+                       family);
+
+                break;
+        }
+
+        if (fd == -1) {
+            nd_log(NDLS_DAEMON, NDLP_ERR,
+                   "LISTENER: Cannot bind to ip '%s', port %d",
+                   rip, rport);
+
+            sockets->failed++;
+        }
+        else {
+            listen_sockets_add(sockets, fd, family, socktype, protocol_str, rip, rport, acl_flags);
+            added++;
+        }
+    }
+
+    freeaddrinfo(result);
+
+    return added;
+}
+
+int listen_sockets_setup(LISTEN_SOCKETS *sockets) {
+    listen_sockets_init(sockets);
+
+    sockets->backlog = (int) appconfig_get_number(sockets->config, sockets->config_section, "listen backlog", sockets->backlog);
+
+    long long int old_port = sockets->default_port;
+    long long int new_port = appconfig_get_number(sockets->config, sockets->config_section, "default port", sockets->default_port);
+    if(new_port < 1 || new_port > 65535) {
+        nd_log(NDLS_DAEMON, NDLP_ERR,
+               "LISTENER: Invalid listen port %lld given. Defaulting to %lld.",
+               new_port, old_port);
+
+        sockets->default_port = (uint16_t) appconfig_set_number(sockets->config, sockets->config_section, "default port", old_port);
+    }
+    else sockets->default_port = (uint16_t)new_port;
+
+    char *s = appconfig_get(sockets->config, sockets->config_section, "bind to", sockets->default_bind_to);
+    while(*s) {
+        char *e = s;
+
+        // skip separators, moving both s(tart) and e(nd)
+        while(isspace((uint8_t)*e) || *e == ',') s = ++e;
+
+        // move e(nd) to the first separator
+        while(*e && !isspace((uint8_t)*e) && *e != ',') e++;
+
+        // is there anything?
+ if(!*s || s == e) break; + + char buf[e - s + 1]; + strncpyz(buf, s, e - s); + bind_to_this(sockets, buf, sockets->default_port, sockets->backlog); + + s = e; + } + + if(sockets->failed) { + size_t i; + for(i = 0; i < sockets->opened ;i++) + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "LISTENER: Listen socket %s opened successfully.", + sockets->fds_names[i]); + } + + return (int)sockets->opened; +} + + +// -------------------------------------------------------------------------------------------------------------------- +// connect to another host/port + +// connect_to_this_unix() +// path the path of the unix socket +// timeout the timeout for establishing a connection + +static inline int connect_to_unix(const char *path, struct timeval *timeout) { + int fd = socket(AF_UNIX, SOCK_STREAM | DEFAULT_SOCKET_FLAGS, 0); + if(fd == -1) { + nd_log(NDLS_DAEMON, NDLP_ERR, + "Failed to create UNIX socket() for '%s'", + path); + + return -1; + } + + if(timeout) { + if(setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, (char *) timeout, sizeof(struct timeval)) < 0) + nd_log(NDLS_DAEMON, NDLP_ERR, + "Failed to set timeout on UNIX socket '%s'", + path); + } + + sock_setcloexec(fd); + + struct sockaddr_un addr; + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + strncpy(addr.sun_path, path, sizeof(addr.sun_path)-1); + + if (connect(fd, (struct sockaddr*)&addr, sizeof(addr)) == -1) { + nd_log(NDLS_DAEMON, NDLP_ERR, + "Cannot connect to UNIX socket on path '%s'.", + path); + + close(fd); + return -1; + } + + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "Connected to UNIX socket on path '%s'.", + path); + + return fd; +} + +// connect_to_this_ip46() +// protocol IPPROTO_TCP, IPPROTO_UDP +// socktype SOCK_STREAM, SOCK_DGRAM +// host the destination hostname or IP address (IPv4 or IPv6) to connect to +// if it resolves to many IPs, all are tried (IPv4 and IPv6) +// scope_id the if_index id of the interface to use for connecting (0 = any) +// (used only under IPv6) +// service the service name or port to connect to +// timeout the timeout for establishing a connection + +int connect_to_this_ip46(int protocol, int socktype, const char *host, uint32_t scope_id, const char *service, struct timeval *timeout) { + struct addrinfo hints; + struct addrinfo *ai_head = NULL, *ai = NULL; + + memset(&hints, 0, sizeof(hints)); + hints.ai_family = PF_UNSPEC; /* Allow IPv4 or IPv6 */ + hints.ai_socktype = socktype; + hints.ai_protocol = protocol; + + int ai_err = getaddrinfo(host, service, &hints, &ai_head); + if (ai_err != 0) { + + nd_log(NDLS_DAEMON, NDLP_ERR, + "Cannot resolve host '%s', port '%s': %s", + host, service, gai_strerror(ai_err)); + + return -1; + } + + char hostBfr[NI_MAXHOST + 1]; + char servBfr[NI_MAXSERV + 1]; + + ND_LOG_STACK lgs[] = { + ND_LOG_FIELD_TXT(NDF_DST_IP, hostBfr), + ND_LOG_FIELD_TXT(NDF_DST_PORT, servBfr), + ND_LOG_FIELD_END(), + }; + ND_LOG_STACK_PUSH(lgs); + + int fd = -1; + for (ai = ai_head; ai != NULL && fd == -1; ai = ai->ai_next) { + if(nd_thread_signaled_to_cancel()) break; + + if (ai->ai_family == PF_INET6) { + struct sockaddr_in6 *pSadrIn6 = (struct sockaddr_in6 *) ai->ai_addr; + if(pSadrIn6->sin6_scope_id == 0) { + pSadrIn6->sin6_scope_id = scope_id; + } + } + + getnameinfo(ai->ai_addr, + ai->ai_addrlen, + hostBfr, + sizeof(hostBfr), + servBfr, + sizeof(servBfr), + NI_NUMERICHOST | NI_NUMERICSERV); + + switch (ai->ai_addr->sa_family) { + case PF_INET: { + struct sockaddr_in *pSadrIn = (struct sockaddr_in *)ai->ai_addr; + (void)pSadrIn; + break; + } + + case PF_INET6: { + struct sockaddr_in6 
*pSadrIn6 = (struct sockaddr_in6 *) ai->ai_addr;
+                (void)pSadrIn6;
+                break;
+            }
+
+            default: {
+                // Unknown protocol family
+                continue;
+            }
+        }
+
+        fd = socket(ai->ai_family, ai->ai_socktype | DEFAULT_SOCKET_FLAGS, ai->ai_protocol);
+        if(fd != -1) {
+            if(timeout) {
+                if(setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, (char *) timeout, sizeof(struct timeval)) < 0)
+                    nd_log(NDLS_DAEMON, NDLP_ERR,
+                           "Failed to set timeout on the socket to ip '%s' port '%s'",
+                           hostBfr, servBfr);
+            }
+            sock_setcloexec(fd);
+
+            errno = 0;
+            if(connect(fd, ai->ai_addr, ai->ai_addrlen) < 0) {
+                if(errno == EALREADY || errno == EINPROGRESS) {
+                    nd_log(NDLS_DAEMON, NDLP_DEBUG,
+                           "Waiting for connection to ip %s port %s to be established",
+                           hostBfr, servBfr);
+
+                    // Convert 'struct timeval' to milliseconds for poll();
+                    // a NULL timeout maps to 0, which makes the wait below block indefinitely.
+                    int timeout_ms = timeout ? (int)(timeout->tv_sec * 1000 + timeout->tv_usec / 1000) : 0;
+
+                    switch(wait_on_socket_or_cancel_with_timeout(
+#ifdef ENABLE_HTTPS
+                        NULL,
+#endif
+                        fd, timeout_ms, POLLOUT, NULL)) {
+                        case 0: // proceed
+                            nd_log(NDLS_DAEMON, NDLP_DEBUG,
+                                   "connect() to ip %s port %s completed successfully",
+                                   hostBfr, servBfr);
+                            break;
+
+                        case -1: // thread cancelled
+                            nd_log(NDLS_DAEMON, NDLP_ERR,
+                                   "Thread is cancelled while connecting to '%s', port '%s'.",
+                                   hostBfr, servBfr);
+
+                            close(fd);
+                            fd = -1;
+                            break;
+
+                        case 1: // timeout
+                            nd_log(NDLS_DAEMON, NDLP_ERR,
+                                   "Timed out while connecting to '%s', port '%s'.",
+                                   hostBfr, servBfr);
+
+                            close(fd);
+                            fd = -1;
+                            break;
+
+                        default:
+                        case 2: // error
+                            nd_log(NDLS_DAEMON, NDLP_ERR,
+                                   "Failed to connect to '%s', port '%s'.",
+                                   hostBfr, servBfr);
+
+                            close(fd);
+                            fd = -1;
+                            break;
+                    }
+                }
+                else {
+                    nd_log(NDLS_DAEMON, NDLP_ERR,
+                           "Failed to connect to '%s', port '%s'",
+                           hostBfr, servBfr);
+
+                    close(fd);
+                    fd = -1;
+                }
+            }
+        }
+        else
+            nd_log(NDLS_DAEMON, NDLP_ERR,
+                   "Failed to socket() to '%s', port '%s'",
+                   hostBfr, servBfr);
+    }
+
+    freeaddrinfo(ai_head);
+
+    return fd;
+}
+
+// connect_to_this()
+//
+// definition format:
+//
+//    [PROTOCOL:]IP[%INTERFACE][:PORT]
+//
+// PROTOCOL  = tcp or udp
+// IP        = IPv4 or IPv6 IP or hostname, optionally enclosed in [] (required for IPv6)
+// INTERFACE = for IPv6 only, the network interface to use
+// PORT      = port number or service name
+
+int connect_to_this(const char *definition, int default_port, struct timeval *timeout) {
+    char buffer[strlen(definition) + 1];
+    strcpy(buffer, definition);
+
+    char default_service[10 + 1];
+    snprintfz(default_service, 10, "%d", default_port);
+
+    char *host = buffer, *service = default_service, *interface = "";
+    int protocol = IPPROTO_TCP, socktype = SOCK_STREAM;
+    uint32_t scope_id = 0;
+
+    if(strncmp(host, "tcp:", 4) == 0) {
+        host += 4;
+        protocol = IPPROTO_TCP;
+        socktype = SOCK_STREAM;
+    }
+    else if(strncmp(host, "udp:", 4) == 0) {
+        host += 4;
+        protocol = IPPROTO_UDP;
+        socktype = SOCK_DGRAM;
+    }
+    else if(strncmp(host, "unix:", 5) == 0) {
+        char *path = host + 5;
+        return connect_to_unix(path, timeout);
+    }
+    else if(*host == '/') {
+        char *path = host;
+        return connect_to_unix(path, timeout);
+    }
+
+    char *e = host;
+    if(*e == '[') {
+        e = ++host;
+        while(*e && *e != ']') e++;
+        if(*e == ']') {
+            *e = '\0';
+            e++;
+        }
+    }
+    else {
+        while(*e && *e != ':' && *e != '%') e++;
+    }
+
+    if(*e == '%') {
+        *e = '\0';
+        e++;
+        interface = e;
+        while(*e && *e != ':') e++;
+    }
+
+    if(*e == ':') {
+        *e = '\0';
+        e++;
+        service = e;
+    }
+
+    if(!*host) {
+        nd_log(NDLS_DAEMON, NDLP_ERR,
+               "Definition '%s' does not specify a host.",
+               definition);
+
+        return -1;
+    }
+
+    if(*interface) {
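+        // resolve the interface name from the '%' part of the definition
+        // (e.g. for 'tcp:[fe80::1]%eth0:19999' we arrive here with
+        // host='fe80::1', interface='eth0', service='19999')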
+        scope_id = if_nametoindex(interface);
+        if(!scope_id)
+            nd_log(NDLS_DAEMON, NDLP_ERR,
+                   "Cannot find a network interface named '%s'. Continuing without binding to a specific network interface.",
+                   interface);
+    }
+
+    if(!*service)
+        service = default_service;
+
+
+    return connect_to_this_ip46(protocol, socktype, host, scope_id, service, timeout);
+}
+
+void foreach_entry_in_connection_string(const char *destination, bool (*callback)(char *entry, void *data), void *data) {
+    const char *s = destination;
+    while(*s) {
+        const char *e = s;
+
+        // skip separators, moving both s(tart) and e(nd)
+        while(isspace((uint8_t)*e) || *e == ',') s = ++e;
+
+        // move e(nd) to the first separator
+        while(*e && !isspace((uint8_t)*e) && *e != ',') e++;
+
+        // is there anything?
+        if(!*s || s == e) break;
+
+        char buf[e - s + 1];
+        strncpyz(buf, s, e - s);
+
+        if(callback(buf, data)) break;
+
+        s = e;
+    }
+}
+
+struct connect_to_one_of_data {
+    int default_port;
+    struct timeval *timeout;
+    size_t *reconnects_counter;
+    char *connected_to;
+    size_t connected_to_size;
+    int sock;
+};
+
+static bool connect_to_one_of_callback(char *entry, void *data) {
+    struct connect_to_one_of_data *t = data;
+
+    if(t->reconnects_counter)
+        (*t->reconnects_counter)++; // increment the counter itself, not the pointer to it
+
+    t->sock = connect_to_this(entry, t->default_port, t->timeout);
+    if(t->sock != -1) {
+        if(t->connected_to && t->connected_to_size) {
+            strncpyz(t->connected_to, entry, t->connected_to_size);
+            t->connected_to[t->connected_to_size - 1] = '\0';
+        }
+
+        return true;
+    }
+
+    return false;
+}
+
+int connect_to_one_of(const char *destination, int default_port, struct timeval *timeout, size_t *reconnects_counter, char *connected_to, size_t connected_to_size) {
+    struct connect_to_one_of_data t = {
+        .default_port = default_port,
+        .timeout = timeout,
+        .reconnects_counter = reconnects_counter,
+        .connected_to = connected_to,
+        .connected_to_size = connected_to_size,
+        .sock = -1,
+    };
+
+    foreach_entry_in_connection_string(destination, connect_to_one_of_callback, &t);
+
+    return t.sock;
+}
+
+static bool connect_to_one_of_urls_callback(char *entry, void *data) {
+    char *s = strchr(entry, '/');
+    if(s) *s = '\0';
+
+    return connect_to_one_of_callback(entry, data);
+}
+
+int connect_to_one_of_urls(const char *destination, int default_port, struct timeval *timeout, size_t *reconnects_counter, char *connected_to, size_t connected_to_size) {
+    struct connect_to_one_of_data t = {
+        .default_port = default_port,
+        .timeout = timeout,
+        .reconnects_counter = reconnects_counter,
+        .connected_to = connected_to,
+        .connected_to_size = connected_to_size,
+        .sock = -1,
+    };
+
+    foreach_entry_in_connection_string(destination, connect_to_one_of_urls_callback, &t);
+
+    return t.sock;
+}
+
+
+// --------------------------------------------------------------------------------------------------------------------
+// helpers to send/receive data in one call, in blocking mode, with a timeout
+
+// returns: -1 = thread cancelled, 0 = proceed to read/write, 1 = time exceeded, 2 = error on fd
+// timeout parameter can be zero to wait forever
+inline int wait_on_socket_or_cancel_with_timeout(
+#ifdef ENABLE_HTTPS
+    NETDATA_SSL *ssl,
+#endif
+    int fd, int timeout_ms, short int poll_events, short int *revents) {
+    struct pollfd pfd = {
+        .fd = fd,
+        .events = poll_events,
+        .revents = 0,
+    };
+
+    bool forever = (timeout_ms == 0);
+
+    while (timeout_ms > 0 || forever) {
+        if(nd_thread_signaled_to_cancel()) {
+            errno = ECANCELED;
+            return -1;
+        }
+
+#ifdef ENABLE_HTTPS
+        if(poll_events == POLLIN && ssl &&
SSL_connection(ssl) && netdata_ssl_has_pending(ssl)) + return 0; +#endif + + const int wait_ms = (timeout_ms >= ND_CHECK_CANCELLABILITY_WHILE_WAITING_EVERY_MS || forever) ? + ND_CHECK_CANCELLABILITY_WHILE_WAITING_EVERY_MS : timeout_ms; + + errno = 0; + + // check every wait_ms + const int ret = poll(&pfd, 1, wait_ms); + + if(revents) + *revents = pfd.revents; + + if(ret == -1) { + // poll failed + + if(errno == EINTR || errno == EAGAIN) + continue; + + return 2; + } + + if(ret == 0) { + // timeout + if(!forever) + timeout_ms -= wait_ms; + continue; + } + + if(pfd.revents & poll_events) + return 0; + + // all other errors + return 2; + } + + errno = ETIMEDOUT; + return 1; +} + +ssize_t recv_timeout( +#ifdef ENABLE_HTTPS + NETDATA_SSL *ssl, +#endif + int sockfd, void *buf, size_t len, int flags, int timeout) { + + switch(wait_on_socket_or_cancel_with_timeout( +#ifdef ENABLE_HTTPS + ssl, +#endif + sockfd, timeout * 1000, POLLIN, NULL)) { + case 0: // data are waiting + break; + + case 1: // timeout + return 0; + + default: + case -1: // thread cancelled + case 2: // error on socket + return -1; + } + +#ifdef ENABLE_HTTPS + if (SSL_connection(ssl)) { + return netdata_ssl_read(ssl, buf, len); + } +#endif + + return recv(sockfd, buf, len, flags); +} + +ssize_t send_timeout( +#ifdef ENABLE_HTTPS + NETDATA_SSL *ssl, +#endif + int sockfd, void *buf, size_t len, int flags, int timeout) { + + switch(wait_on_socket_or_cancel_with_timeout( +#ifdef ENABLE_HTTPS + ssl, +#endif + sockfd, timeout * 1000, POLLOUT, NULL)) { + case 0: // data are waiting + break; + + case 1: // timeout + return 0; + + default: + case -1: // thread cancelled + case 2: // error on socket + return -1; + } + +#ifdef ENABLE_HTTPS + if(ssl->conn) { + if (SSL_connection(ssl)) { + return netdata_ssl_write(ssl, buf, len); + } + else { + nd_log(NDLS_DAEMON, NDLP_ERR, + "cannot write to SSL connection - connection is not ready."); + + return -1; + } + } +#endif + return send(sockfd, buf, len, flags); +} + + +// -------------------------------------------------------------------------------------------------------------------- +// accept4() replacement for systems that do not have one + +#ifndef HAVE_ACCEPT4 +int accept4(int sock, struct sockaddr *addr, socklen_t *addrlen, int flags) { + int fd = accept(sock, addr, addrlen); + int newflags = 0; + + if (fd < 0) return fd; + +#ifdef SOCK_CLOEXEC +#ifdef O_CLOEXEC + if (flags & SOCK_CLOEXEC) { + newflags |= O_CLOEXEC; + flags &= ~SOCK_CLOEXEC; + } +#endif +#endif + + if (flags) { + close(fd); + errno = EINVAL; + return -1; + } + + if (fcntl(fd, F_SETFL, newflags) < 0) { + int saved_errno = errno; + close(fd); + errno = saved_errno; + return -1; + } + + return fd; +} +#endif + +/* + * --------------------------------------------------------------------------------------------------------------------- + * connection_allowed() - if there is an access list then check the connection matches a pattern. + * Numeric patterns are checked against the IP address first, only if they + * do not match is the hostname resolved (reverse-DNS) and checked. If the + * hostname matches then we perform forward DNS resolution to check the IP + * is really associated with the DNS record. This call is repeatable: the + * web server may check more refined matches against the connection. Will + * update the client_host if uninitialized - ensure the hostsize is the number + * of *writable* bytes (i.e. be aware of the strdup used to compact the pollinfo). 
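+ * Example (hypothetical addresses): with access_list '10.* my.parent.host',
+ * a client at 10.1.2.3 matches the numeric pattern immediately; a client at
+ * 192.0.2.9 is reverse-resolved, and is accepted only if the resolved name
+ * matches the pattern AND forward DNS maps that name back to 192.0.2.9.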
+ */ +int connection_allowed(int fd, char *client_ip, char *client_host, size_t hostsize, SIMPLE_PATTERN *access_list, + const char *patname, int allow_dns) +{ + if (!access_list) + return 1; + if (simple_pattern_matches(access_list, client_ip)) + return 1; + // If the hostname is unresolved (and needed) then attempt the DNS lookups. + //if (client_host[0]==0 && simple_pattern_is_potential_name(access_list)) + if (client_host[0]==0 && allow_dns) + { + struct sockaddr_storage sadr; + socklen_t addrlen = sizeof(sadr); + int err = getpeername(fd, (struct sockaddr*)&sadr, &addrlen); + if (err != 0 || + (err = getnameinfo((struct sockaddr *)&sadr, addrlen, client_host, (socklen_t)hostsize, + NULL, 0, NI_NAMEREQD)) != 0) { + + nd_log(NDLS_DAEMON, NDLP_ERR, + "Incoming %s on '%s' does not match a numeric pattern, and host could not be resolved (err=%s)", + patname, client_ip, gai_strerror(err)); + + if (hostsize >= 8) + strcpy(client_host,"UNKNOWN"); + return 0; + } + struct addrinfo *addr_infos = NULL; + if (getaddrinfo(client_host, NULL, NULL, &addr_infos) !=0 ) { + nd_log(NDLS_DAEMON, NDLP_ERR, + "LISTENER: cannot validate hostname '%s' from '%s' by resolving it", + client_host, client_ip); + + if (hostsize >= 8) + strcpy(client_host,"UNKNOWN"); + return 0; + } + struct addrinfo *scan = addr_infos; + int validated = 0; + while (scan) { + char address[INET6_ADDRSTRLEN]; + address[0] = 0; + switch (scan->ai_addr->sa_family) { + case AF_INET: + inet_ntop(AF_INET, &((struct sockaddr_in*)(scan->ai_addr))->sin_addr, address, INET6_ADDRSTRLEN); + break; + case AF_INET6: + inet_ntop(AF_INET6, &((struct sockaddr_in6*)(scan->ai_addr))->sin6_addr, address, INET6_ADDRSTRLEN); + break; + } + if (!strcmp(client_ip, address)) { + validated = 1; + break; + } + scan = scan->ai_next; + } + if (!validated) { + nd_log(NDLS_DAEMON, NDLP_ERR, + "LISTENER: Cannot validate '%s' as ip of '%s', not listed in DNS", + client_ip, client_host); + + if (hostsize >= 8) + strcpy(client_host,"UNKNOWN"); + } + if (addr_infos!=NULL) + freeaddrinfo(addr_infos); + } + if (!simple_pattern_matches(access_list, client_host)) + return 0; + + return 1; +} + +// -------------------------------------------------------------------------------------------------------------------- +// accept_socket() - accept a socket and store client IP and port +int accept_socket(int fd, int flags, char *client_ip, size_t ipsize, char *client_port, size_t portsize, + char *client_host, size_t hostsize, SIMPLE_PATTERN *access_list, int allow_dns) { + struct sockaddr_storage sadr; + socklen_t addrlen = sizeof(sadr); + + int nfd = accept4(fd, (struct sockaddr *)&sadr, &addrlen, flags | DEFAULT_SOCKET_FLAGS); + if (likely(nfd >= 0)) { + if (getnameinfo((struct sockaddr *)&sadr, addrlen, client_ip, (socklen_t)ipsize, + client_port, (socklen_t)portsize, NI_NUMERICHOST | NI_NUMERICSERV) != 0) { + + nd_log(NDLS_DAEMON, NDLP_ERR, + "LISTENER: cannot getnameinfo() on received client connection."); + + strncpyz(client_ip, "UNKNOWN", ipsize); + strncpyz(client_port, "UNKNOWN", portsize); + } + if (!strcmp(client_ip, "127.0.0.1") || !strcmp(client_ip, "::1")) { + strncpyz(client_ip, "localhost", ipsize); + } + sock_setcloexec(nfd); + +#ifdef __FreeBSD__ + if(((struct sockaddr *)&sadr)->sa_family == AF_LOCAL) + strncpyz(client_ip, "localhost", ipsize); +#endif + + client_ip[ipsize - 1] = '\0'; + client_port[portsize - 1] = '\0'; + + switch (((struct sockaddr *)&sadr)->sa_family) { + case AF_UNIX: + // netdata_log_debug(D_LISTENER, "New UNIX domain web client from %s on 
socket %d.", client_ip, fd); + // set the port - certain versions of libc return garbage on unix sockets + strncpyz(client_port, "UNIX", portsize); + break; + + case AF_INET: + // netdata_log_debug(D_LISTENER, "New IPv4 web client from %s port %s on socket %d.", client_ip, client_port, fd); + break; + + case AF_INET6: + if (strncmp(client_ip, "::ffff:", 7) == 0) { + memmove(client_ip, &client_ip[7], strlen(&client_ip[7]) + 1); + // netdata_log_debug(D_LISTENER, "New IPv4 web client from %s port %s on socket %d.", client_ip, client_port, fd); + } + // else + // netdata_log_debug(D_LISTENER, "New IPv6 web client from %s port %s on socket %d.", client_ip, client_port, fd); + break; + + default: + // netdata_log_debug(D_LISTENER, "New UNKNOWN web client from %s port %s on socket %d.", client_ip, client_port, fd); + break; + } + if (!connection_allowed(nfd, client_ip, client_host, hostsize, access_list, "connection", allow_dns)) { + errno = 0; + nd_log(NDLS_DAEMON, NDLP_WARNING, + "Permission denied for client '%s', port '%s'", + client_ip, client_port); + + close(nfd); + nfd = -1; + errno = EPERM; + } + } +#ifdef HAVE_ACCEPT4 + else if (errno == ENOSYS) + nd_log(NDLS_DAEMON, NDLP_ERR, + "Netdata has been compiled with the assumption that the system has the accept4() call, but it is not here. " + "Recompile netdata like this: ./configure --disable-accept4 ..."); +#endif + + return nfd; +} + + +// -------------------------------------------------------------------------------------------------------------------- +// poll() based listener +// this should be the fastest possible listener for up to 100 sockets +// above 100, an epoll() interface is needed on Linux + +#define POLL_FDS_INCREASE_STEP 10 + +inline POLLINFO *poll_add_fd(POLLJOB *p + , int fd + , int socktype + , HTTP_ACL port_acl + , uint32_t flags + , const char *client_ip + , const char *client_port + , const char *client_host + , void *(*add_callback)(POLLINFO * /*pi*/, short int * /*events*/, void * /*data*/) + , void (*del_callback)(POLLINFO * /*pi*/) + , int (*rcv_callback)(POLLINFO * /*pi*/, short int * /*events*/) + , int (*snd_callback)(POLLINFO * /*pi*/, short int * /*events*/) + , void *data +) { + if(unlikely(fd < 0)) return NULL; + + //if(p->limit && p->used >= p->limit) { + // nd_log(NDLS_DAEMON, NDLP_WARNING, "Max sockets limit reached (%zu sockets), dropping connection", p->used); + // close(fd); + // return NULL; + //} + + if(unlikely(!p->first_free)) { + size_t new_slots = p->slots + POLL_FDS_INCREASE_STEP; + + p->fds = reallocz(p->fds, sizeof(struct pollfd) * new_slots); + p->inf = reallocz(p->inf, sizeof(POLLINFO) * new_slots); + + // reset all the newly added slots + ssize_t i; + for(i = new_slots - 1; i >= (ssize_t)p->slots ; i--) { + p->fds[i].fd = -1; + p->fds[i].events = 0; + p->fds[i].revents = 0; + + p->inf[i].p = p; + p->inf[i].slot = (size_t)i; + p->inf[i].flags = 0; + p->inf[i].socktype = -1; + p->inf[i].port_acl = -1; + + p->inf[i].client_ip = NULL; + p->inf[i].client_port = NULL; + p->inf[i].client_host = NULL; + p->inf[i].del_callback = p->del_callback; + p->inf[i].rcv_callback = p->rcv_callback; + p->inf[i].snd_callback = p->snd_callback; + p->inf[i].data = NULL; + + // link them so that the first free will be earlier in the array + // (we loop decrementing i) + p->inf[i].next = p->first_free; + p->first_free = &p->inf[i]; + } + + p->slots = new_slots; + } + + POLLINFO *pi = p->first_free; + p->first_free = p->first_free->next; + + struct pollfd *pf = &p->fds[pi->slot]; + pf->fd = fd; + pf->events = 
POLLIN;
+    pf->revents = 0;
+
+    pi->fd = fd;
+    pi->p = p;
+    pi->socktype = socktype;
+    pi->port_acl = port_acl;
+    pi->flags = flags;
+    pi->next = NULL;
+    pi->client_ip = strdupz(client_ip);
+    pi->client_port = strdupz(client_port);
+    pi->client_host = strdupz(client_host);
+
+    pi->del_callback = del_callback;
+    pi->rcv_callback = rcv_callback;
+    pi->snd_callback = snd_callback;
+
+    pi->connected_t = now_boottime_sec();
+    pi->last_received_t = 0;
+    pi->last_sent_t = 0;
+    pi->recv_count = 0;
+    pi->send_count = 0;
+
+    p->used++;
+    if(unlikely(pi->slot > p->max))
+        p->max = pi->slot;
+
+    if(pi->flags & POLLINFO_FLAG_CLIENT_SOCKET) {
+        pi->data = add_callback(pi, &pf->events, data);
+    }
+
+    if(pi->flags & POLLINFO_FLAG_SERVER_SOCKET) {
+        p->min = pi->slot;
+    }
+
+    return pi;
+}
+
+inline void poll_close_fd(POLLINFO *pi) {
+    POLLJOB *p = pi->p;
+
+    struct pollfd *pf = &p->fds[pi->slot];
+
+    if(unlikely(pf->fd == -1)) return;
+
+    if(pi->flags & POLLINFO_FLAG_CLIENT_SOCKET) {
+        pi->del_callback(pi);
+
+        if(likely(!(pi->flags & POLLINFO_FLAG_DONT_CLOSE))) {
+            if(close(pf->fd) == -1)
+                nd_log(NDLS_DAEMON, NDLP_ERR,
+                       "Failed to close() poll_events() socket %d",
+                       pf->fd);
+        }
+    }
+
+    pf->fd = -1;
+    pf->events = 0;
+    pf->revents = 0;
+
+    pi->fd = -1;
+    pi->socktype = -1;
+    pi->flags = 0;
+    pi->data = NULL;
+
+    pi->del_callback = NULL;
+    pi->rcv_callback = NULL;
+    pi->snd_callback = NULL;
+
+    freez(pi->client_ip);
+    pi->client_ip = NULL;
+
+    freez(pi->client_port);
+    pi->client_port = NULL;
+
+    freez(pi->client_host);
+    pi->client_host = NULL;
+
+    pi->next = p->first_free;
+    p->first_free = pi;
+
+    p->used--;
+    if(unlikely(p->max == pi->slot)) {
+        p->max = p->min;
+        ssize_t i;
+        for(i = (ssize_t)pi->slot; i > (ssize_t)p->min ;i--) {
+            if (unlikely(p->fds[i].fd != -1)) {
+                p->max = (size_t)i;
+                break;
+            }
+        }
+    }
+}
+
+void *poll_default_add_callback(POLLINFO *pi, short int *events, void *data) {
+    (void)pi;
+    (void)events;
+    (void)data;
+
+    return NULL;
+}
+
+void poll_default_del_callback(POLLINFO *pi) {
+    if(pi->data)
+        nd_log(NDLS_DAEMON, NDLP_ERR,
+               "POLLFD: internal error: poll_default_del_callback() called with a data pointer - possible memory leak");
+}
+
+int poll_default_rcv_callback(POLLINFO *pi, short int *events) {
+    *events |= POLLIN;
+
+    char buffer[1024 + 1];
+
+    ssize_t rc;
+    do {
+        rc = recv(pi->fd, buffer, 1024, MSG_DONTWAIT);
+        if (rc < 0) {
+            // read failed
+            if (errno != EWOULDBLOCK && errno != EAGAIN) {
+                nd_log(NDLS_DAEMON, NDLP_ERR,
+                       "POLLFD: poll_default_rcv_callback(): recv() failed with %zd.",
+                       rc);
+
+                return -1;
+            }
+        } else if (rc) {
+            // data received
+            nd_log(NDLS_DAEMON, NDLP_WARNING,
+                   "POLLFD: internal error: poll_default_rcv_callback() is discarding %zd bytes received on socket %d",
+                   rc, pi->fd);
+        }
+    } while (rc > 0); // rc == 0 means the peer closed the connection - stop draining
+
+    return 0;
+}
+
+int poll_default_snd_callback(POLLINFO *pi, short int *events) {
+    *events &= ~POLLOUT;
+
+    nd_log(NDLS_DAEMON, NDLP_WARNING,
+           "POLLFD: internal error: poll_default_snd_callback(): nothing to send on socket %d",
+           pi->fd);
+
+    return 0;
+}
+
+void poll_default_tmr_callback(void *timer_data) {
+    (void)timer_data;
+}
+
+static void poll_events_cleanup(void *pptr) {
+    POLLJOB *p = CLEANUP_FUNCTION_GET_PTR(pptr);
+    if(!p) return;
+
+    for(size_t i = 0 ; i <= p->max ; i++) {
+        POLLINFO *pi = &p->inf[i];
+        poll_close_fd(pi);
+    }
+
+    freez(p->fds);
+    freez(p->inf);
+}
+
+static int poll_process_error(POLLINFO *pi, struct pollfd *pf, short int revents) {
+    ND_LOG_STACK lgs[] = {
+        ND_LOG_FIELD_TXT(NDF_SRC_IP,
pi->client_ip),
+        ND_LOG_FIELD_TXT(NDF_SRC_PORT, pi->client_port),
+        ND_LOG_FIELD_END(),
+    };
+    ND_LOG_STACK_PUSH(lgs);
+
+    nd_log(NDLS_DAEMON, NDLP_DEBUG,
+           "POLLFD: LISTENER: received %s %s %s on socket at slot %zu (fd %d) client '%s' port '%s' expecting %s %s %s, having %s %s %s"
+           , revents & POLLERR  ? "POLLERR" : ""
+           , revents & POLLHUP  ? "POLLHUP" : ""
+           , revents & POLLNVAL ? "POLLNVAL" : ""
+           , pi->slot
+           , pi->fd
+           , pi->client_ip ? pi->client_ip : "<undefined-ip>"
+           , pi->client_port ? pi->client_port : "<undefined-port>"
+           , pf->events & POLLIN ? "POLLIN" : "", pf->events & POLLOUT ? "POLLOUT" : "", pf->events & POLLPRI ? "POLLPRI" : ""
+           , revents & POLLIN ? "POLLIN" : "", revents & POLLOUT ? "POLLOUT" : "", revents & POLLPRI ? "POLLPRI" : ""
+    );
+
+    pf->events = 0;
+    poll_close_fd(pi);
+    return 1;
+}
+
+static inline int poll_process_send(POLLJOB *p, POLLINFO *pi, struct pollfd *pf, time_t now) {
+    pi->last_sent_t = now;
+    pi->send_count++;
+
+    pf->events = 0;
+
+    // remember the slot, in case we need to close it later
+    // the callback may manipulate the socket list and our pf and pi pointers may be invalid after that call
+    size_t slot = pi->slot;
+
+    if (unlikely(pi->snd_callback(pi, &pf->events) == -1))
+        poll_close_fd(&p->inf[slot]);
+
+    // IMPORTANT:
+    // pf and pi may be invalid below this point, they may have been reallocated.
+
+    return 1;
+}
+
+static inline int poll_process_tcp_read(POLLJOB *p, POLLINFO *pi, struct pollfd *pf, time_t now) {
+    pi->last_received_t = now;
+    pi->recv_count++;
+
+    pf->events = 0;
+
+    // remember the slot, in case we need to close it later
+    // the callback may manipulate the socket list and our pf and pi pointers may be invalid after that call
+    size_t slot = pi->slot;
+
+    if (pi->rcv_callback(pi, &pf->events) == -1)
+        poll_close_fd(&p->inf[slot]);
+
+    // IMPORTANT:
+    // pf and pi may be invalid below this point, they may have been reallocated.
+
+    return 1;
+}
+
+static inline int poll_process_udp_read(POLLINFO *pi, struct pollfd *pf, time_t now __maybe_unused) {
+    pi->last_received_t = now;
+    pi->recv_count++;
+
+    // TODO: access_list is not applied to UDP
+    // but checking the access list on every UDP packet will destroy
+    // performance, especially for statsd.
+
+    pf->events = 0;
+    if(pi->rcv_callback(pi, &pf->events) == -1)
+        return 0;
+
+    // IMPORTANT:
+    // pf and pi may be invalid below this point, they may have been reallocated.
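+    // a non-zero return counts this event as "processed"; poll_events()
+    // only accepts new TCP connections on iterations that processed nothing,
+    // so pending reads and writes are served first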
+ + return 1; +} + +static int poll_process_new_tcp_connection(POLLJOB *p, POLLINFO *pi, struct pollfd *pf, time_t now) { + pi->last_received_t = now; + pi->recv_count++; + + char client_ip[INET6_ADDRSTRLEN] = ""; + char client_port[NI_MAXSERV] = ""; + char client_host[NI_MAXHOST] = ""; + +#ifdef SOCK_NONBLOCK + int flags = SOCK_NONBLOCK; +#else + int flags = 0; +#endif + + int nfd = accept_socket( + pf->fd, flags, + client_ip, INET6_ADDRSTRLEN, client_port,NI_MAXSERV, client_host, NI_MAXHOST, + p->access_list, p->allow_dns + ); + +#ifndef SOCK_NONBLOCK + if (nfd > 0) { + int flags = fcntl(nfd, F_GETFL); + (void)fcntl(nfd, F_SETFL, flags| O_NONBLOCK); + } +#endif + + if (unlikely(nfd < 0)) { + // accept failed + + if(unlikely(errno == EMFILE)) { + nd_log_limit_static_global_var(erl, 10, 1000); + nd_log_limit(&erl, NDLS_DAEMON, NDLP_ERR, + "POLLFD: LISTENER: too many open files - used by this thread %zu, max for this thread %zu", + p->used, p->limit); + } + else if(unlikely(errno != EWOULDBLOCK && errno != EAGAIN)) + nd_log(NDLS_DAEMON, NDLP_ERR, + "POLLFD: LISTENER: accept() failed."); + + } + else { + // accept ok + + poll_add_fd(p + , nfd + , SOCK_STREAM + , pi->port_acl + , POLLINFO_FLAG_CLIENT_SOCKET + , client_ip + , client_port + , client_host + , p->add_callback + , p->del_callback + , p->rcv_callback + , p->snd_callback + , NULL + ); + + // IMPORTANT: + // pf and pi may be invalid below this point, they may have been reallocated. + + return 1; + } + + return 0; +} + +void poll_events(LISTEN_SOCKETS *sockets + , void *(*add_callback)(POLLINFO * /*pi*/, short int * /*events*/, void * /*data*/) + , void (*del_callback)(POLLINFO * /*pi*/) + , int (*rcv_callback)(POLLINFO * /*pi*/, short int * /*events*/) + , int (*snd_callback)(POLLINFO * /*pi*/, short int * /*events*/) + , void (*tmr_callback)(void * /*timer_data*/) + , bool (*check_to_stop_callback)(void) + , SIMPLE_PATTERN *access_list + , int allow_dns + , void *data + , time_t tcp_request_timeout_seconds + , time_t tcp_idle_timeout_seconds + , time_t timer_milliseconds + , void *timer_data + , size_t max_tcp_sockets +) { + if(!sockets || !sockets->opened) { + nd_log(NDLS_DAEMON, NDLP_ERR, + "POLLFD: internal error: no listening sockets are opened"); + return; + } + + if(timer_milliseconds <= 0) timer_milliseconds = 0; + + int retval; + + POLLJOB p = { + .slots = 0, + .used = 0, + .max = 0, + .limit = max_tcp_sockets, + .fds = NULL, + .inf = NULL, + .first_free = NULL, + + .complete_request_timeout = tcp_request_timeout_seconds, + .idle_timeout = tcp_idle_timeout_seconds, + .checks_every = (tcp_idle_timeout_seconds / 3) + 1, + + .access_list = access_list, + .allow_dns = allow_dns, + + .timer_milliseconds = timer_milliseconds, + .timer_data = timer_data, + + .add_callback = add_callback?add_callback:poll_default_add_callback, + .del_callback = del_callback?del_callback:poll_default_del_callback, + .rcv_callback = rcv_callback?rcv_callback:poll_default_rcv_callback, + .snd_callback = snd_callback?snd_callback:poll_default_snd_callback, + .tmr_callback = tmr_callback?tmr_callback:poll_default_tmr_callback + }; + + size_t i; + for(i = 0; i < sockets->opened ;i++) { + + POLLINFO *pi = poll_add_fd(&p + , sockets->fds[i] + , sockets->fds_types[i] + , sockets->fds_acl_flags[i] + , POLLINFO_FLAG_SERVER_SOCKET + , (sockets->fds_names[i])?sockets->fds_names[i]:"UNKNOWN" + , "" + , "" + , p.add_callback + , p.del_callback + , p.rcv_callback + , p.snd_callback + , NULL + ); + + pi->data = data; + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "POLLFD: 
LISTENER: listening on '%s'", + (sockets->fds_names[i])?sockets->fds_names[i]:"UNKNOWN"); + } + + int listen_sockets_active = 1; + + time_t last_check = now_boottime_sec(); + + usec_t timer_usec = timer_milliseconds * USEC_PER_MS; + usec_t now_usec = 0, next_timer_usec = 0, last_timer_usec = 0; + (void)last_timer_usec; + + if(unlikely(timer_usec)) { + now_usec = now_boottime_usec(); + next_timer_usec = now_usec - (now_usec % timer_usec) + timer_usec; + } + + CLEANUP_FUNCTION_REGISTER(poll_events_cleanup) cleanup_ptr = &p; + + while(!check_to_stop_callback() && !nd_thread_signaled_to_cancel()) { + if(unlikely(timer_usec)) { + now_usec = now_boottime_usec(); + + if(unlikely(timer_usec && now_usec >= next_timer_usec)) { + last_timer_usec = now_usec; + p.tmr_callback(p.timer_data); + now_usec = now_boottime_usec(); + next_timer_usec = now_usec - (now_usec % timer_usec) + timer_usec; + } + } + + // enable or disable the TCP listening sockets, based on the current number of sockets used and the limit set + if((listen_sockets_active && (p.limit && p.used >= p.limit)) || (!listen_sockets_active && (!p.limit || p.used < p.limit))) { + listen_sockets_active = !listen_sockets_active; + + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "%s listening sockets (used TCP sockets %zu, max allowed for this worker %zu)", + (listen_sockets_active)?"ENABLING":"DISABLING", p.used, p.limit); + + for (i = 0; i <= p.max; i++) { + if(p.inf[i].flags & POLLINFO_FLAG_SERVER_SOCKET && p.inf[i].socktype == SOCK_STREAM) { + p.fds[i].events = (short int) ((listen_sockets_active) ? POLLIN : 0); + } + } + } + + retval = poll(p.fds, p.max + 1, ND_CHECK_CANCELLABILITY_WHILE_WAITING_EVERY_MS); + time_t now = now_boottime_sec(); + + if(unlikely(retval == -1)) { + nd_log(NDLS_DAEMON, NDLP_ERR, + "POLLFD: LISTENER: poll() failed while waiting on %zu sockets.", + p.max + 1); + + break; + } + else if(unlikely(!retval)) { + // timeout + ; + } + else { + POLLINFO *pi; + struct pollfd *pf; + size_t idx, processed = 0; + short int revents; + + // keep fast lookup arrays per function + // to avoid looping through the entire list every time + size_t sends[p.max + 1], sends_max = 0; + size_t reads[p.max + 1], reads_max = 0; + size_t conns[p.max + 1], conns_max = 0; + size_t udprd[p.max + 1], udprd_max = 0; + + for (i = 0; i <= p.max; i++) { + pi = &p.inf[i]; + pf = &p.fds[i]; + revents = pf->revents; + + if(unlikely(revents == 0 || pf->fd == -1)) + continue; + + if (unlikely(revents & (POLLERR|POLLHUP|POLLNVAL))) { + // something is wrong to one of our sockets + + pf->revents = 0; + processed += poll_process_error(pi, pf, revents); + } + else if (likely(revents & POLLOUT)) { + // a client is ready to receive data + + sends[sends_max++] = i; + } + else if (likely(revents & (POLLIN|POLLPRI))) { + if (pi->flags & POLLINFO_FLAG_CLIENT_SOCKET) { + // a client sent data to us + + reads[reads_max++] = i; + } + else if (pi->flags & POLLINFO_FLAG_SERVER_SOCKET) { + // something is coming to our server sockets + + if(pi->socktype == SOCK_DGRAM) { + // UDP receive, directly on our listening socket + + udprd[udprd_max++] = i; + } + else if(pi->socktype == SOCK_STREAM) { + // new TCP connection + + conns[conns_max++] = i; + } + else + nd_log(NDLS_DAEMON, NDLP_ERR, + "POLLFD: LISTENER: server slot %zu (fd %d) connection from %s port %s using unhandled socket type %d." + , i + , pi->fd + , pi->client_ip ? pi->client_ip : "<undefined-ip>" + , pi->client_port ? 
pi->client_port : "<undefined-port>" + , pi->socktype + ); + } + else + nd_log(NDLS_DAEMON, NDLP_ERR, + "POLLFD: LISTENER: client slot %zu (fd %d) data from %s port %s using flags %08X is neither client nor server." + , i + , pi->fd + , pi->client_ip ? pi->client_ip : "<undefined-ip>" + , pi->client_port ? pi->client_port : "<undefined-port>" + , pi->flags + ); + } + else + nd_log(NDLS_DAEMON, NDLP_ERR, + "POLLFD: LISTENER: socket slot %zu (fd %d) client %s port %s unhandled event id %d." + , i + , pi->fd + , pi->client_ip ? pi->client_ip : "<undefined-ip>" + , pi->client_port ? pi->client_port : "<undefined-port>" + , revents + ); + } + + // process sends + for (idx = 0; idx < sends_max; idx++) { + i = sends[idx]; + pi = &p.inf[i]; + pf = &p.fds[i]; + pf->revents = 0; + processed += poll_process_send(&p, pi, pf, now); + } + + // process UDP reads + for (idx = 0; idx < udprd_max; idx++) { + i = udprd[idx]; + pi = &p.inf[i]; + pf = &p.fds[i]; + pf->revents = 0; + processed += poll_process_udp_read(pi, pf, now); + } + + // process TCP reads + for (idx = 0; idx < reads_max; idx++) { + i = reads[idx]; + pi = &p.inf[i]; + pf = &p.fds[i]; + pf->revents = 0; + processed += poll_process_tcp_read(&p, pi, pf, now); + } + + if(!processed && (!p.limit || p.used < p.limit)) { + // nothing processed above (rcv, snd) and we have room for another TCP connection + // so, accept one TCP connection + for (idx = 0; idx < conns_max; idx++) { + i = conns[idx]; + pi = &p.inf[i]; + pf = &p.fds[i]; + pf->revents = 0; + if (poll_process_new_tcp_connection(&p, pi, pf, now)) + break; + } + } + } + + if(unlikely(p.checks_every > 0 && now - last_check > p.checks_every)) { + last_check = now; + + // cleanup old sockets + for(i = 0; i <= p.max; i++) { + POLLINFO *pi = &p.inf[i]; + + if(likely(pi->flags & POLLINFO_FLAG_CLIENT_SOCKET)) { + if (unlikely(pi->send_count == 0 && p.complete_request_timeout > 0 && (now - pi->connected_t) >= p.complete_request_timeout)) { + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "POLLFD: LISTENER: client slot %zu (fd %d) from %s port %s has not sent a complete request in %zu seconds - closing it. " + , i + , pi->fd + , pi->client_ip ? pi->client_ip : "<undefined-ip>" + , pi->client_port ? pi->client_port : "<undefined-port>" + , (size_t) p.complete_request_timeout + ); + poll_close_fd(pi); + } + else if(unlikely(pi->recv_count && p.idle_timeout > 0 && now - ((pi->last_received_t > pi->last_sent_t) ? pi->last_received_t : pi->last_sent_t) >= p.idle_timeout )) { + nd_log(NDLS_DAEMON, NDLP_DEBUG, + "POLLFD: LISTENER: client slot %zu (fd %d) from %s port %s is idle for more than %zu seconds - closing it. " + , i + , pi->fd + , pi->client_ip ? pi->client_ip : "<undefined-ip>" + , pi->client_port ? 
pi->client_port : "<undefined-port>" + , (size_t) p.idle_timeout + ); + poll_close_fd(pi); + } + } + } + } + } +} diff --git a/src/libnetdata/socket/socket.h b/src/libnetdata/socket/socket.h new file mode 100644 index 00000000..8eab8bfd --- /dev/null +++ b/src/libnetdata/socket/socket.h @@ -0,0 +1,211 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_SOCKET_H +#define NETDATA_SOCKET_H + +#include "../libnetdata.h" + +#ifndef MAX_LISTEN_FDS +#define MAX_LISTEN_FDS 50 +#endif + +#define ND_CHECK_CANCELLABILITY_WHILE_WAITING_EVERY_MS 100 + +typedef struct listen_sockets { + struct config *config; // the config file to use + const char *config_section; // the netdata configuration section to read settings from + const char *default_bind_to; // the default bind to configuration string + uint16_t default_port; // the default port to use + int backlog; // the default listen backlog to use + + size_t opened; // the number of sockets opened + size_t failed; // the number of sockets attempted to open, but failed + int fds[MAX_LISTEN_FDS]; // the open sockets + char *fds_names[MAX_LISTEN_FDS]; // descriptions for the open sockets + int fds_types[MAX_LISTEN_FDS]; // the socktype for the open sockets (SOCK_STREAM, SOCK_DGRAM) + int fds_families[MAX_LISTEN_FDS]; // the family of the open sockets (AF_UNIX, AF_INET, AF_INET6) + HTTP_ACL fds_acl_flags[MAX_LISTEN_FDS]; // the acl to apply to the open sockets (dashboard, badges, streaming, netdata.conf, management) +} LISTEN_SOCKETS; + +char *strdup_client_description(int family, const char *protocol, const char *ip, uint16_t port); + +int listen_sockets_setup(LISTEN_SOCKETS *sockets); +void listen_sockets_close(LISTEN_SOCKETS *sockets); + +void foreach_entry_in_connection_string(const char *destination, bool (*callback)(char *entry, void *data), void *data); +int connect_to_this_ip46(int protocol, int socktype, const char *host, uint32_t scope_id, const char *service, struct timeval *timeout); +int connect_to_this(const char *definition, int default_port, struct timeval *timeout); +int connect_to_one_of(const char *destination, int default_port, struct timeval *timeout, size_t *reconnects_counter, char *connected_to, size_t connected_to_size); +int connect_to_one_of_urls(const char *destination, int default_port, struct timeval *timeout, size_t *reconnects_counter, char *connected_to, size_t connected_to_size); + + +#ifdef ENABLE_HTTPS +ssize_t recv_timeout(NETDATA_SSL *ssl,int sockfd, void *buf, size_t len, int flags, int timeout); +ssize_t send_timeout(NETDATA_SSL *ssl,int sockfd, void *buf, size_t len, int flags, int timeout); +int wait_on_socket_or_cancel_with_timeout(NETDATA_SSL *ssl, int fd, int timeout_ms, short int poll_events, short int *revents); +#else +ssize_t recv_timeout(int sockfd, void *buf, size_t len, int flags, int timeout); +ssize_t send_timeout(int sockfd, void *buf, size_t len, int flags, int timeout); +int wait_on_socket_or_cancel_with_timeout(int fd, int timeout_ms, short int poll_events, short int *revents); +#endif + +bool fd_is_socket(int fd); +bool sock_has_output_error(int fd); + +int sock_setnonblock(int fd); +int sock_delnonblock(int fd); +int sock_setreuse(int fd, int reuse); +void sock_setcloexec(int fd); +int sock_setreuse_port(int fd, int reuse); +int sock_enlarge_in(int fd); +int sock_enlarge_out(int fd); + +int connection_allowed(int fd, char *client_ip, char *client_host, size_t hostsize, + SIMPLE_PATTERN *access_list, const char *patname, int allow_dns); +int accept_socket(int fd, int flags, char 
*client_ip, size_t ipsize, char *client_port, size_t portsize, + char *client_host, size_t hostsize, SIMPLE_PATTERN *access_list, int allow_dns); + +#ifndef HAVE_ACCEPT4 +int accept4(int sock, struct sockaddr *addr, socklen_t *addrlen, int flags); +#endif /* #ifndef HAVE_ACCEPT4 */ + +#ifdef SOCK_CLOEXEC +#define DEFAULT_SOCKET_FLAGS SOCK_CLOEXEC +#else +#define DEFAULT_SOCKET_FLAGS 0 +#endif + + +// ---------------------------------------------------------------------------- +// poll() based listener + +#define POLLINFO_FLAG_SERVER_SOCKET 0x00000001 +#define POLLINFO_FLAG_CLIENT_SOCKET 0x00000002 +#define POLLINFO_FLAG_DONT_CLOSE 0x00000004 + +typedef struct poll POLLJOB; + +typedef struct pollinfo { + POLLJOB *p; // the parent + size_t slot; // the slot id + + int fd; // the file descriptor + int socktype; // the client socket type + HTTP_ACL port_acl; // the access lists permitted on this web server port (it's -1 for client sockets) + char *client_ip; // Max INET6_ADDRSTRLEN bytes + char *client_port; // Max NI_MAXSERV bytes + char *client_host; // Max NI_MAXHOST bytes + + time_t connected_t; // the time the socket connected + time_t last_received_t; // the time the socket last received data + time_t last_sent_t; // the time the socket last sent data + + size_t recv_count; // the number of times the socket was ready for inbound traffic + size_t send_count; // the number of times the socket was ready for outbound traffic + + uint32_t flags; // internal flags + + // callbacks for this socket + void (*del_callback)(struct pollinfo *pi); + int (*rcv_callback)(struct pollinfo *pi, short int *events); + int (*snd_callback)(struct pollinfo *pi, short int *events); + + // the user data + void *data; + + // linking of free pollinfo structures + // for quickly finding the next available + // this is like a stack, it grows and shrinks + // (with gaps - lower empty slots are preferred) + struct pollinfo *next; +} POLLINFO; + +struct poll { + size_t slots; + size_t used; + size_t min; + size_t max; + + size_t limit; + + time_t complete_request_timeout; + time_t idle_timeout; + time_t checks_every; + + time_t timer_milliseconds; + void *timer_data; + + struct pollfd *fds; + struct pollinfo *inf; + struct pollinfo *first_free; + + SIMPLE_PATTERN *access_list; + int allow_dns; + + void *(*add_callback)(POLLINFO *pi, short int *events, void *data); + void (*del_callback)(POLLINFO *pi); + int (*rcv_callback)(POLLINFO *pi, short int *events); + int (*snd_callback)(POLLINFO *pi, short int *events); + void (*tmr_callback)(void *timer_data); +}; + +#define pollinfo_from_slot(p, slot) (&((p)->inf[(slot)])) + +int poll_default_snd_callback(POLLINFO *pi, short int *events); +int poll_default_rcv_callback(POLLINFO *pi, short int *events); +void poll_default_del_callback(POLLINFO *pi); +void *poll_default_add_callback(POLLINFO *pi, short int *events, void *data); + +POLLINFO *poll_add_fd(POLLJOB *p + , int fd + , int socktype + , HTTP_ACL port_acl + , uint32_t flags + , const char *client_ip + , const char *client_port + , const char *client_host + , void *(*add_callback)(POLLINFO *pi, short int *events, void *data) + , void (*del_callback)(POLLINFO *pi) + , int (*rcv_callback)(POLLINFO *pi, short int *events) + , int (*snd_callback)(POLLINFO *pi, short int *events) + , void *data +); +void poll_close_fd(POLLINFO *pi); + +void poll_events(LISTEN_SOCKETS *sockets + , void *(*add_callback)(POLLINFO *pi, short int *events, void *data) + , void (*del_callback)(POLLINFO *pi) + , int (*rcv_callback)(POLLINFO *pi, short 
int *events) + , int (*snd_callback)(POLLINFO *pi, short int *events) + , void (*tmr_callback)(void *timer_data) + , bool (*check_to_stop_callback)(void) + , SIMPLE_PATTERN *access_list + , int allow_dns + , void *data + , time_t tcp_request_timeout_seconds + , time_t tcp_idle_timeout_seconds + , time_t timer_milliseconds + , void *timer_data + , size_t max_tcp_sockets +); + +#ifndef INET6_ADDRSTRLEN +#define INET6_ADDRSTRLEN 46 +#endif + +typedef struct socket_peers { + struct { + char ip[INET6_ADDRSTRLEN]; + int port; + } local; + + struct { + char ip[INET6_ADDRSTRLEN]; + int port; + } peer; +} SOCKET_PEERS; + +SOCKET_PEERS socket_peers(int sock_fd); +bool ip_to_hostname(const char *ip, char *dst, size_t dst_len); + +#endif //NETDATA_SOCKET_H diff --git a/src/libnetdata/statistical/README.md b/src/libnetdata/statistical/README.md new file mode 100644 index 00000000..1d1d2afd --- /dev/null +++ b/src/libnetdata/statistical/README.md @@ -0,0 +1,12 @@ +<!-- +title: "Statistical functions" +custom_edit_url: https://github.com/netdata/netdata/edit/master/src/libnetdata/statistical/README.md +sidebar_label: "Statistical functions" +learn_status: "Published" +learn_topic_type: "Tasks" +learn_rel_path: "Developers/libnetdata" +--> + +# Statistical functions + +A library for easy and fast calculations of statistical measurements like average, median etc. diff --git a/src/libnetdata/statistical/statistical.c b/src/libnetdata/statistical/statistical.c new file mode 100644 index 00000000..ef9fe4e5 --- /dev/null +++ b/src/libnetdata/statistical/statistical.c @@ -0,0 +1,460 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +NETDATA_DOUBLE default_single_exponential_smoothing_alpha = 0.1; + +void log_series_to_stderr(NETDATA_DOUBLE *series, size_t entries, NETDATA_DOUBLE result, const char *msg) { + const NETDATA_DOUBLE *value, *end = &series[entries]; + + fprintf(stderr, "%s of %zu entries [ ", msg, entries); + for(value = series; value < end ;value++) { + if(value != series) fprintf(stderr, ", "); + fprintf(stderr, "%" NETDATA_DOUBLE_MODIFIER, *value); + } + fprintf(stderr, " ] results in " NETDATA_DOUBLE_FORMAT "\n", result); +} + +// -------------------------------------------------------------------------------------------------------------------- + +inline NETDATA_DOUBLE sum_and_count(const NETDATA_DOUBLE *series, size_t entries, size_t *count) { + const NETDATA_DOUBLE *value, *end = &series[entries]; + NETDATA_DOUBLE sum = 0; + size_t c = 0; + + for(value = series; value < end ; value++) { + if(netdata_double_isnumber(*value)) { + sum += *value; + c++; + } + } + + if(unlikely(!c)) sum = NAN; + if(likely(count)) *count = c; + + return sum; +} + +inline NETDATA_DOUBLE sum(const NETDATA_DOUBLE *series, size_t entries) { + return sum_and_count(series, entries, NULL); +} + +inline NETDATA_DOUBLE average(const NETDATA_DOUBLE *series, size_t entries) { + size_t count = 0; + NETDATA_DOUBLE sum = sum_and_count(series, entries, &count); + + if(unlikely(!count)) return NAN; + return sum / (NETDATA_DOUBLE)count; +} + +// -------------------------------------------------------------------------------------------------------------------- + +NETDATA_DOUBLE moving_average(const NETDATA_DOUBLE *series, size_t entries, size_t period) { + if(unlikely(period <= 0)) + return 0.0; + + size_t i, count; + NETDATA_DOUBLE sum = 0, avg = 0; + NETDATA_DOUBLE p[period]; + + for(count = 0; count < period ; count++) + p[count] = 0.0; + + for(i = 0, count = 0; i < entries; i++) { + NETDATA_DOUBLE 
value = series[i];
+        if(unlikely(!netdata_double_isnumber(value))) continue;
+
+        if(unlikely(count < period)) {
+            sum += value;
+            avg = (count == period - 1) ? sum / (NETDATA_DOUBLE)period : 0;
+        }
+        else {
+            sum = sum - p[count % period] + value;
+            avg = sum / (NETDATA_DOUBLE)period;
+        }
+
+        p[count % period] = value;
+        count++;
+    }
+
+    return avg;
+}
+
+// --------------------------------------------------------------------------------------------------------------------
+
+static int qsort_compare(const void *a, const void *b) {
+    NETDATA_DOUBLE *p1 = (NETDATA_DOUBLE *)a, *p2 = (NETDATA_DOUBLE *)b;
+    NETDATA_DOUBLE n1 = *p1, n2 = *p2;
+
+    if(unlikely(isnan(n1) || isnan(n2))) {
+        if(isnan(n1) && !isnan(n2)) return -1;
+        if(!isnan(n1) && isnan(n2)) return 1;
+        return 0;
+    }
+    if(unlikely(isinf(n1) || isinf(n2))) {
+        if(!isinf(n1) && isinf(n2)) return -1;
+        if(isinf(n1) && !isinf(n2)) return 1;
+        return 0;
+    }
+
+    if(unlikely(n1 < n2)) return -1;
+    if(unlikely(n1 > n2)) return 1;
+    return 0;
+}
+
+inline void sort_series(NETDATA_DOUBLE *series, size_t entries) {
+    qsort(series, entries, sizeof(NETDATA_DOUBLE), qsort_compare);
+}
+
+inline NETDATA_DOUBLE *copy_series(const NETDATA_DOUBLE *series, size_t entries) {
+    NETDATA_DOUBLE *copy = mallocz(sizeof(NETDATA_DOUBLE) * entries);
+    memcpy(copy, series, sizeof(NETDATA_DOUBLE) * entries);
+    return copy;
+}
+
+NETDATA_DOUBLE median_on_sorted_series(const NETDATA_DOUBLE *series, size_t entries) {
+    if(unlikely(entries == 0)) return NAN;
+    if(unlikely(entries == 1)) return series[0];
+    if(unlikely(entries == 2)) return (series[0] + series[1]) / 2;
+
+    NETDATA_DOUBLE average;
+    if(entries % 2 == 0) {
+        size_t m = entries / 2;
+        average = (series[m - 1] + series[m]) / 2; // the two middle values
+    }
+    else {
+        average = series[entries / 2];
+    }
+
+    return average;
+}
+
+NETDATA_DOUBLE median(const NETDATA_DOUBLE *series, size_t entries) {
+    if(unlikely(entries == 0)) return NAN;
+    if(unlikely(entries == 1)) return series[0];
+
+    if(unlikely(entries == 2))
+        return (series[0] + series[1]) / 2;
+
+    NETDATA_DOUBLE *copy = copy_series(series, entries);
+    sort_series(copy, entries);
+
+    NETDATA_DOUBLE avg = median_on_sorted_series(copy, entries);
+
+    freez(copy);
+    return avg;
+}
+
+// --------------------------------------------------------------------------------------------------------------------
+
+NETDATA_DOUBLE moving_median(const NETDATA_DOUBLE *series, size_t entries, size_t period) {
+    if(entries <= period)
+        return median(series, entries);
+
+    NETDATA_DOUBLE *data = copy_series(series, entries);
+
+    size_t i;
+    for(i = period; i < entries; i++) {
+        data[i - period] = median(&series[i - period], period);
+    }
+
+    NETDATA_DOUBLE avg = median(data, entries - period);
+    freez(data);
+    return avg;
+}
+
+// --------------------------------------------------------------------------------------------------------------------
+
+// http://stackoverflow.com/a/15150143/4525767
+NETDATA_DOUBLE running_median_estimate(const NETDATA_DOUBLE *series, size_t entries) {
+    NETDATA_DOUBLE median = 0.0;
+    NETDATA_DOUBLE average = 0.0;
+    size_t i;
+
+    for(i = 0; i < entries ; i++) {
+        NETDATA_DOUBLE value = series[i];
+        if(unlikely(!netdata_double_isnumber(value))) continue;
+
+        average += ( value - average ) * 0.1; // rough running average
+        median += copysignndd( average * 0.01, value - median );
+    }
+
+    return median;
+}
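+
+// Usage sketch (illustrative only - not called by netdata): contrasts the
+// exact median, which copies and sorts the series, with the O(1)-memory
+// single-pass estimate above; the sample values are made up.
+static __maybe_unused void median_estimate_example(void) {
+    NETDATA_DOUBLE series[] = { 1.0, 9.0, 2.0, 8.0, 3.0, 7.0, 4.0, 6.0, 5.0 };
+    size_t entries = sizeof(series) / sizeof(series[0]);
+
+    NETDATA_DOUBLE exact = median(series, entries); // 5.0
+
+    // cheap, but converges to the true median only over long series
+    NETDATA_DOUBLE estimate = running_median_estimate(series, entries);
+
+    log_series_to_stderr(series, entries, exact, "median");
+    log_series_to_stderr(series, entries, estimate, "estimate");
+}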
+
+// --------------------------------------------------------------------------------------------------------------------
+
+NETDATA_DOUBLE standard_deviation(const NETDATA_DOUBLE *series, size_t entries) {
+    if(unlikely(entries == 0)) return NAN;
+    if(unlikely(entries == 1)) return series[0];
+
+    const NETDATA_DOUBLE *value, *end = &series[entries];
+    size_t count;
+    NETDATA_DOUBLE sum;
+
+    for(count = 0, sum = 0, value = series ; value < end ;value++) {
+        if(likely(netdata_double_isnumber(*value))) {
+            count++;
+            sum += *value;
+        }
+    }
+
+    if(unlikely(count == 0)) return NAN;
+    if(unlikely(count == 1)) return sum;
+
+    NETDATA_DOUBLE average = sum / (NETDATA_DOUBLE)count;
+
+    for(count = 0, sum = 0, value = series ; value < end ;value++) {
+        if(netdata_double_isnumber(*value)) {
+            count++;
+            sum += powndd(*value - average, 2);
+        }
+    }
+
+    if(unlikely(count == 0)) return NAN;
+    if(unlikely(count == 1)) return average;
+
+    NETDATA_DOUBLE variance = sum / (NETDATA_DOUBLE)(count); // count, not count - 1: this is a population stddev
+    NETDATA_DOUBLE stddev = sqrtndd(variance);
+    return stddev;
+}
+
+// --------------------------------------------------------------------------------------------------------------------
+
+NETDATA_DOUBLE single_exponential_smoothing(const NETDATA_DOUBLE *series, size_t entries, NETDATA_DOUBLE alpha) {
+    if(unlikely(entries == 0))
+        return NAN;
+
+    if(unlikely(isnan(alpha)))
+        alpha = default_single_exponential_smoothing_alpha;
+
+    const NETDATA_DOUBLE *value = series, *end = &series[entries];
+    NETDATA_DOUBLE level = (1.0 - alpha) * (*value);
+
+    for(value++ ; value < end; value++) {
+        if(likely(netdata_double_isnumber(*value)))
+            level = alpha * (*value) + (1.0 - alpha) * level;
+    }
+
+    return level;
+}
+
+NETDATA_DOUBLE single_exponential_smoothing_reverse(const NETDATA_DOUBLE *series, size_t entries, NETDATA_DOUBLE alpha) {
+    if(unlikely(entries == 0))
+        return NAN;
+
+    if(unlikely(isnan(alpha)))
+        alpha = default_single_exponential_smoothing_alpha;
+
+    const NETDATA_DOUBLE *value = &series[entries - 1];
+    NETDATA_DOUBLE level = (1.0 - alpha) * (*value);
+
+    for(value-- ; value >= series; value--) {
+        if(likely(netdata_double_isnumber(*value)))
+            level = alpha * (*value) + (1.0 - alpha) * level;
+    }
+
+    return level;
+}
+
+// --------------------------------------------------------------------------------------------------------------------
+
+// http://grisha.org/blog/2016/02/16/triple-exponential-smoothing-forecasting-part-ii/
+NETDATA_DOUBLE double_exponential_smoothing(const NETDATA_DOUBLE *series, size_t entries,
+                                            NETDATA_DOUBLE alpha,
+                                            NETDATA_DOUBLE beta,
+                                            NETDATA_DOUBLE *forecast) {
+    if(unlikely(entries == 0))
+        return NAN;
+
+    NETDATA_DOUBLE level, trend;
+
+    if(unlikely(isnan(alpha)))
+        alpha = 0.3;
+
+    if(unlikely(isnan(beta)))
+        beta = 0.05;
+
+    level = series[0];
+
+    if(likely(entries > 1))
+        trend = series[1] - series[0];
+    else
+        trend = 0;
+
+    const NETDATA_DOUBLE *value = series, *end = &series[entries];
+    for(value++ ; value < end; value++) {
+        if(likely(netdata_double_isnumber(*value))) {
+            NETDATA_DOUBLE last_level = level;
+            level = alpha * *value + (1.0 - alpha) * (level + trend);
+            trend = beta * (level - last_level) + (1.0 - beta) * trend;
+        }
+    }
+
+    if(forecast)
+        *forecast = level + trend;
+
+    return level;
+}
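+
+// Usage sketch (illustrative only - not called by netdata): passing NAN
+// selects the defaults used above (alpha 0.1 for single smoothing,
+// alpha 0.3 / beta 0.05 for double smoothing); the series is made up.
+static __maybe_unused void smoothing_example(void) {
+    NETDATA_DOUBLE series[] = { 10, 12, 11, 13, 14, 13, 15 };
+    size_t entries = sizeof(series) / sizeof(series[0]);
+
+    // exponentially weighted level of the series
+    NETDATA_DOUBLE level = single_exponential_smoothing(series, entries, NAN);
+
+    // also fits a trend and returns a one-step-ahead forecast (level + trend)
+    NETDATA_DOUBLE forecast = NAN;
+    NETDATA_DOUBLE level2 = double_exponential_smoothing(series, entries, NAN, NAN, &forecast);
+
+    fprintf(stderr, "single %f double %f forecast %f\n",
+            (double)level, (double)level2, (double)forecast);
+}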
+
+// --------------------------------------------------------------------------------------------------------------------
+
+/*
+ * Based on the R implementation
+ *
+ * a: level component
+ * b: trend component
+ * s: seasonal component
+ *
+ * Additive:
+ *
+ *   Yhat[t+h] = a[t] + h * b[t] + s[t + 1 + (h - 1) mod p],
+ *   a[t] = α (Y[t] - s[t-p]) + (1-α) (a[t-1] + b[t-1])
+ *   b[t] = β (a[t] - a[t-1]) + (1-β) b[t-1]
+ *   s[t] = γ (Y[t] - a[t]) + (1-γ) s[t-p]
+ *
+ * Multiplicative:
+ *
+ *   Yhat[t+h] = (a[t] + h * b[t]) * s[t + 1 + (h - 1) mod p],
+ *   a[t] = α (Y[t] / s[t-p]) + (1-α) (a[t-1] + b[t-1])
+ *   b[t] = β (a[t] - a[t-1]) + (1-β) b[t-1]
+ *   s[t] = γ (Y[t] / a[t]) + (1-γ) s[t-p]
+ */
+static int __HoltWinters(
+    const NETDATA_DOUBLE *series,
+    int entries,             // start_time + h
+
+    NETDATA_DOUBLE alpha,    // alpha parameter of Holt-Winters Filter.
+    NETDATA_DOUBLE beta,     // beta parameter of Holt-Winters Filter. If set to 0, the function will do exponential smoothing.
+    NETDATA_DOUBLE gamma,    // gamma parameter used for the seasonal component. If set to 0, a non-seasonal model is fitted.
+
+    const int *seasonal,
+    const int *period,
+    const NETDATA_DOUBLE *a, // Start value for level (a[0]).
+    const NETDATA_DOUBLE *b, // Start value for trend (b[0]).
+    NETDATA_DOUBLE *s,       // Vector of start values for the seasonal component (s_1[0] ... s_p[0])
+
+    /* return values */
+    NETDATA_DOUBLE *SSE,     // The final sum of squared errors achieved in optimizing
+    NETDATA_DOUBLE *level,   // Estimated values for the level component (size entries - t + 2)
+    NETDATA_DOUBLE *trend,   // Estimated values for the trend component (size entries - t + 2)
+    NETDATA_DOUBLE *season   // Estimated values for the seasonal component (size entries - t + 2)
+)
+{
+    if(unlikely(entries < 4))
+        return 0;
+
+    int start_time = 2;
+
+    NETDATA_DOUBLE res = 0, xhat = 0, stmp = 0;
+    int i, i0, s0;
+
+    /* copy start values to the beginning of the vectors */
+    level[0] = *a;
+    if(beta > 0) trend[0] = *b;
+    if(gamma > 0) memcpy(season, s, *period * sizeof(NETDATA_DOUBLE));
+
+    for(i = start_time - 1; i < entries; i++) {
+        /* indices for period i */
+        i0 = i - start_time + 2;
+        s0 = i0 + *period - 1;
+
+        /* forecast *for* period i */
+        xhat = level[i0 - 1] + (beta > 0 ? trend[i0 - 1] : 0);
+        stmp = gamma > 0 ? 
season[s0 - *period] : (*seasonal != 1); + if (*seasonal == 1) + xhat += stmp; + else + xhat *= stmp; + + /* Sum of Squared Errors */ + res = series[i] - xhat; + *SSE += res * res; + + /* estimate of level *in* period i */ + if (*seasonal == 1) + level[i0] = alpha * (series[i] - stmp) + + (1 - alpha) * (level[i0 - 1] + trend[i0 - 1]); + else + level[i0] = alpha * (series[i] / stmp) + + (1 - alpha) * (level[i0 - 1] + trend[i0 - 1]); + + /* estimate of trend *in* period i */ + if (beta > 0) + trend[i0] = beta * (level[i0] - level[i0 - 1]) + + (1 - beta) * trend[i0 - 1]; + + /* estimate of seasonal component *in* period i */ + if (gamma > 0) { + if (*seasonal == 1) + season[s0] = gamma * (series[i] - level[i0]) + + (1 - gamma) * stmp; + else + season[s0] = gamma * (series[i] / level[i0]) + + (1 - gamma) * stmp; + } + } + + return 1; +} + +NETDATA_DOUBLE holtwinters(const NETDATA_DOUBLE *series, size_t entries, + NETDATA_DOUBLE alpha, + NETDATA_DOUBLE beta, + NETDATA_DOUBLE gamma, + NETDATA_DOUBLE *forecast) { + if(unlikely(isnan(alpha))) + alpha = 0.3; + + if(unlikely(isnan(beta))) + beta = 0.05; + + if(unlikely(isnan(gamma))) + gamma = 0; + + int seasonal = 0; + int period = 0; + NETDATA_DOUBLE a0 = series[0]; + NETDATA_DOUBLE b0 = 0; + NETDATA_DOUBLE s[] = {}; + + NETDATA_DOUBLE errors = 0.0; + size_t nb_computations = entries; + NETDATA_DOUBLE *estimated_level = callocz(nb_computations, sizeof(NETDATA_DOUBLE)); + NETDATA_DOUBLE *estimated_trend = callocz(nb_computations, sizeof(NETDATA_DOUBLE)); + NETDATA_DOUBLE *estimated_season = callocz(nb_computations, sizeof(NETDATA_DOUBLE)); + + int ret = __HoltWinters( + series, + (int)entries, + alpha, + beta, + gamma, + &seasonal, + &period, + &a0, + &b0, + s, + &errors, + estimated_level, + estimated_trend, + estimated_season + ); + + NETDATA_DOUBLE value = estimated_level[nb_computations - 1]; + + if(forecast) + *forecast = 0.0; + + freez(estimated_level); + freez(estimated_trend); + freez(estimated_season); + + if(!ret) + return 0.0; + + return value; +} diff --git a/src/libnetdata/statistical/statistical.h b/src/libnetdata/statistical/statistical.h new file mode 100644 index 00000000..f3ecfadb --- /dev/null +++ b/src/libnetdata/statistical/statistical.h @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_STATISTICAL_H +#define NETDATA_STATISTICAL_H 1 + +#include "../libnetdata.h" + +void log_series_to_stderr(NETDATA_DOUBLE *series, size_t entries, NETDATA_DOUBLE result, const char *msg); + +NETDATA_DOUBLE average(const NETDATA_DOUBLE *series, size_t entries); +NETDATA_DOUBLE moving_average(const NETDATA_DOUBLE *series, size_t entries, size_t period); +NETDATA_DOUBLE median(const NETDATA_DOUBLE *series, size_t entries); +NETDATA_DOUBLE moving_median(const NETDATA_DOUBLE *series, size_t entries, size_t period); +NETDATA_DOUBLE running_median_estimate(const NETDATA_DOUBLE *series, size_t entries); +NETDATA_DOUBLE standard_deviation(const NETDATA_DOUBLE *series, size_t entries); +NETDATA_DOUBLE single_exponential_smoothing(const NETDATA_DOUBLE *series, size_t entries, NETDATA_DOUBLE alpha); +extern NETDATA_DOUBLE +single_exponential_smoothing_reverse(const NETDATA_DOUBLE *series, size_t entries, NETDATA_DOUBLE alpha); +NETDATA_DOUBLE double_exponential_smoothing(const NETDATA_DOUBLE *series, size_t entries, + NETDATA_DOUBLE alpha, + NETDATA_DOUBLE beta, + NETDATA_DOUBLE *forecast); +NETDATA_DOUBLE holtwinters(const NETDATA_DOUBLE *series, size_t entries, + NETDATA_DOUBLE alpha, + NETDATA_DOUBLE beta, + NETDATA_DOUBLE 
gamma, + NETDATA_DOUBLE *forecast); +NETDATA_DOUBLE sum_and_count(const NETDATA_DOUBLE *series, size_t entries, size_t *count); +NETDATA_DOUBLE sum(const NETDATA_DOUBLE *series, size_t entries); +NETDATA_DOUBLE median_on_sorted_series(const NETDATA_DOUBLE *series, size_t entries); +NETDATA_DOUBLE *copy_series(const NETDATA_DOUBLE *series, size_t entries); +void sort_series(NETDATA_DOUBLE *series, size_t entries); + +#endif //NETDATA_STATISTICAL_H diff --git a/src/libnetdata/storage-point.h b/src/libnetdata/storage-point.h new file mode 100644 index 00000000..53e7506f --- /dev/null +++ b/src/libnetdata/storage-point.h @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_STORAGE_POINT_H +#define NETDATA_STORAGE_POINT_H + +#include "storage_number/storage_number.h" + +typedef struct storage_point { + NETDATA_DOUBLE min; // when count > 1, this is the minimum among them + NETDATA_DOUBLE max; // when count > 1, this is the maximum among them + NETDATA_DOUBLE sum; // the point sum - divided by count gives the average + + // end_time - start_time = point duration + time_t start_time_s; // the time the point starts + time_t end_time_s; // the time the point ends + + uint32_t count; // the number of original points aggregated + uint32_t anomaly_count; // the number of original points found anomalous + + SN_FLAGS flags; // flags stored with the point +} STORAGE_POINT; + +#define storage_point_unset(x) do { \ + (x).min = (x).max = (x).sum = NAN; \ + (x).count = 0; \ + (x).anomaly_count = 0; \ + (x).flags = SN_FLAG_NONE; \ + (x).start_time_s = 0; \ + (x).end_time_s = 0; \ + } while(0) + +#define storage_point_empty(x, start_s, end_s) do { \ + (x).min = (x).max = (x).sum = NAN; \ + (x).count = 1; \ + (x).anomaly_count = 0; \ + (x).flags = SN_FLAG_NONE; \ + (x).start_time_s = start_s; \ + (x).end_time_s = end_s; \ + } while(0) + +#define STORAGE_POINT_UNSET (STORAGE_POINT){ .min = NAN, .max = NAN, .sum = NAN, .count = 0, .anomaly_count = 0, .flags = SN_FLAG_NONE, .start_time_s = 0, .end_time_s = 0 } + +#define storage_point_is_unset(x) (!(x).count) +#define storage_point_is_gap(x) (!netdata_double_isnumber((x).sum)) +#define storage_point_is_zero(x) (!(x).count || (netdata_double_is_zero((x).min) && netdata_double_is_zero((x).max) && netdata_double_is_zero((x).sum) && (x).anomaly_count == 0)) + +#define storage_point_merge_to(dst, src) do { \ + if(storage_point_is_unset(dst)) \ + (dst) = (src); \ + \ + else if(!storage_point_is_unset(src) && \ + !storage_point_is_gap(src)) { \ + \ + if((src).start_time_s < (dst).start_time_s) \ + (dst).start_time_s = (src).start_time_s;\ + \ + if((src).end_time_s > (dst).end_time_s) \ + (dst).end_time_s = (src).end_time_s; \ + \ + if((src).min < (dst).min) \ + (dst).min = (src).min; \ + \ + if((src).max > (dst).max) \ + (dst).max = (src).max; \ + \ + (dst).sum += (src).sum; \ + \ + (dst).count += (src).count; \ + (dst).anomaly_count += (src).anomaly_count; \ + \ + (dst).flags |= (src).flags & SN_FLAG_RESET; \ + } \ +} while(0) + +#define storage_point_add_to(dst, src) do { \ + if(storage_point_is_unset(dst)) \ + (dst) = (src); \ + \ + else if(!storage_point_is_unset(src) && \ + !storage_point_is_gap(src)) { \ + \ + if((src).start_time_s < (dst).start_time_s) \ + (dst).start_time_s = (src).start_time_s;\ + \ + if((src).end_time_s > (dst).end_time_s) \ + (dst).end_time_s = (src).end_time_s; \ + \ + (dst).min += (src).min; \ + (dst).max += (src).max; \ + (dst).sum += (src).sum; \ + \ + (dst).count += (src).count; \ + (dst).anomaly_count += 
(src).anomaly_count;                        \
+                                                        \
+        (dst).flags |= (src).flags & SN_FLAG_RESET;     \
+    }                                                   \
+} while(0)
+
+#define storage_point_make_positive(sp) do {            \
+    if(!storage_point_is_unset(sp) &&                   \
+       !storage_point_is_gap(sp)) {                     \
+                                                        \
+        if(unlikely(signbit((sp).sum)))                 \
+            (sp).sum = -(sp).sum;                       \
+                                                        \
+        if(unlikely(signbit((sp).min)))                 \
+            (sp).min = -(sp).min;                       \
+                                                        \
+        if(unlikely(signbit((sp).max)))                 \
+            (sp).max = -(sp).max;                       \
+                                                        \
+        if(unlikely((sp).min > (sp).max)) {             \
+            NETDATA_DOUBLE t = (sp).min;                \
+            (sp).min = (sp).max;                        \
+            (sp).max = t;                               \
+        }                                               \
+    }                                                   \
+} while(0)
+
+#define storage_point_anomaly_rate(sp) \
+    (NETDATA_DOUBLE)(storage_point_is_unset(sp) ? 0.0 : (NETDATA_DOUBLE)((sp).anomaly_count) * 100.0 / (NETDATA_DOUBLE)((sp).count))
+
+#define storage_point_average_value(sp) \
+    ((sp).count ? (sp).sum / (NETDATA_DOUBLE)((sp).count) : 0.0)
+
+#endif //NETDATA_STORAGE_POINT_H
diff --git a/src/libnetdata/storage_number/README.md b/src/libnetdata/storage_number/README.md
new file mode 100644
index 00000000..f0096fb9
--- /dev/null
+++ b/src/libnetdata/storage_number/README.md
@@ -0,0 +1,21 @@
+<!--
+title: "Netdata storage number"
+custom_edit_url: https://github.com/netdata/netdata/edit/master/src/libnetdata/storage_number/README.md
+sidebar_label: "Storage number"
+learn_status: "Published"
+learn_topic_type: "Tasks"
+learn_rel_path: "Developers/libnetdata"
+-->
+
+# Netdata storage number
+
+Although Netdata does all its calculations using `NETDATA_DOUBLE` (a `double`, or a `long double` when built with `NETDATA_WITH_LONG_DOUBLE`), it stores all values using a **custom-made 32-bit number**.
+
+This custom-made number stores values from `-167772150000000.0` to `167772150000000.0` in 29 bits, keeping the remaining 3 bits for flags, with a precision of 0.00001 (it is a floating point number, so higher integer values have less decimal precision).
+
+This provides an extremely optimized memory footprint with just 0.0001% max accuracy loss.
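+
+As a minimal sketch of the round trip (the sample value is arbitrary;
+`pack_storage_number()`, `unpack_storage_number()` and the `accuracy_loss()`
+helper are declared in `storage_number.h`):
+
+```c
+NETDATA_DOUBLE original = 123.45678901;
+
+storage_number packed = pack_storage_number(original, SN_DEFAULT_FLAGS);
+NETDATA_DOUBLE restored = unpack_storage_number(packed);
+
+// the relative error stays within ACCURACY_LOSS_ACCEPTED_PERCENT (0.0001%)
+NETDATA_DOUBLE loss = accuracy_loss(original, restored);
+```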
+ + diff --git a/src/libnetdata/storage_number/storage_number.c b/src/libnetdata/storage_number/storage_number.c new file mode 100644 index 00000000..89a67a53 --- /dev/null +++ b/src/libnetdata/storage_number/storage_number.c @@ -0,0 +1,170 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +bool is_system_ieee754_double(void) { + static bool logged = false; + + struct { + NETDATA_DOUBLE original; + + union { + uint64_t i; + NETDATA_DOUBLE d; + }; + } tests[] = { + { .original = 1.25, .i = 0x3FF4000000000000 }, + { .original = 1.0, .i = 0x3FF0000000000000 }, + { .original = 2.0, .i = 0x4000000000000000 }, + { .original = 4.0, .i = 0x4010000000000000 }, + { .original = 8.8, .i = 0x402199999999999A }, + { .original = 16.16, .i = 0x403028F5C28F5C29 }, + { .original = 32.32, .i = 0x404028F5C28F5C29 }, + { .original = 64.64, .i = 0x405028F5C28F5C29 }, + { .original = 128.128, .i = 0x406004189374BC6A }, + { .original = 32768.32768, .i = 0x40E0000A7C5AC472 }, + { .original = 65536.65536, .i = 0x40F0000A7C5AC472 }, + { .original = -65536.65536, .i = 0xC0F0000A7C5AC472 }, + { .original = 65535.65535, .i = 0x40EFFFF4F8A0902E }, + { .original = -65535.65535, .i = 0xC0EFFFF4F8A0902E }, + { .original = 4.503599627e15, .i = 0x432FFFFFFFF4B180 }, + { .original = -4.503599627e15, .i = 0xC32FFFFFFFF4B180 }, + { .original = 1.25e25, .i = 0x4524ADF4B7320335 }, + { .original = 1.25e307, .i = 0x7FB1CCF385EBC8A0 }, + { .original = 1.25e-25, .i = 0x3AC357C299A88EA7 }, + { .original = 1.25e-100, .i = 0x2B317F7D4ED8C33E }, + { .original = NAN, .i = 0x7FF8000000000000 }, + { .original = -INFINITY, .i = 0xFFF0000000000000 }, + { .original = INFINITY, .i = 0x7FF0000000000000 }, + { .original = 1.25e-132, .i = 0x248C6463225AB7EC }, + { .original = 0.0, .i = 0x0000000000000000 }, + { .original = -0.0, .i = 0x8000000000000000 }, + { .original = DBL_MIN, .i = 0x0010000000000000 }, + { .original = DBL_MAX, .i = 0x7FEFFFFFFFFFFFFF }, + { .original = -DBL_MIN, .i = 0x8010000000000000 }, + { .original = -DBL_MAX, .i = 0xFFEFFFFFFFFFFFFF }, + }; + + size_t errors = 0; + size_t elements = sizeof(tests) / sizeof(tests[0]); + for(size_t i = 0; i < elements ; i++) { + uint64_t *ptr = (uint64_t *)&tests[i].original; + + if(*ptr != tests[i].i && (tests[i].original == tests[i].d || (isnan(tests[i].original) && isnan(tests[i].d)))) { + if(!logged) + netdata_log_info("IEEE754: test #%zu, value " NETDATA_DOUBLE_FORMAT_G " is represented in this system as %lX, but it was expected as %lX", + i+1, tests[i].original, *ptr, tests[i].i); + errors++; + } + } + + if(!errors && sizeof(NETDATA_DOUBLE) == sizeof(uint64_t)) { + if(!logged) + netdata_log_info("IEEE754: system is using IEEE754 DOUBLE PRECISION values"); + + logged = true; + return true; + } + else { + if(!logged) + netdata_log_info("IEEE754: system is NOT compatible with IEEE754 DOUBLE PRECISION values"); + + logged = true; + return false; + } +} + +storage_number pack_storage_number(NETDATA_DOUBLE value, SN_FLAGS flags) { + // bit 32 = sign 0:positive, 1:negative + // bit 31 = 0:divide, 1:multiply + // bit 30, 29, 28 = (multiplier or divider) 0-7 (8 total) + // bit 27 SN_EXISTS_100 + // bit 26 SN_EXISTS_RESET + // bit 25 SN_ANOMALY_BIT = 0: anomalous, 1: not anomalous + // bit 24 to bit 1 = the value + + if(unlikely(fpclassify(value) == FP_NAN || fpclassify(value) == FP_INFINITE)) + return SN_EMPTY_SLOT; + + storage_number r = flags & SN_USER_FLAGS; + + if(unlikely(fpclassify(value) == FP_ZERO || fpclassify(value) == FP_SUBNORMAL)) + return r; + + int 
m = 0; + NETDATA_DOUBLE n = value, factor = 10; + + // if the value is negative + // add the sign bit and make it positive + if(n < 0) { + r += SN_FLAG_NEGATIVE; // the sign bit 32 + n = -n; + } + + if(n / 10000000.0 > 0x00ffffff) { + factor = 100; + r |= SN_FLAG_NOT_EXISTS_MUL100; + } + + // make its integer part fit in 0x00ffffff + // by dividing it by 10 up to 7 times + // and increasing the multiplier + while(m < 7 && n > (NETDATA_DOUBLE)0x00ffffff) { + n /= factor; + m++; + } + + if(m) { + // the value was too big, and we divided it + // so, we add a multiplier to unpack it + r += SN_FLAG_MULTIPLY + (m << 27); // the multiplier m + + if(n > (NETDATA_DOUBLE)0x00ffffff) { + #ifdef NETDATA_INTERNAL_CHECKS + netdata_log_error("Number " NETDATA_DOUBLE_FORMAT " is too big.", value); + #endif + r += 0x00ffffff; + return r; + } + } + else { + // 0x0019999e is the number that can be multiplied + // by 10 to give 0x00ffffff + // while the value is below 0x0019999e we can + // multiply it by 10, up to 7 times, increasing + // the multiplier + while(m < 7 && n < (NETDATA_DOUBLE)0x0019999e) { + n *= 10; + m++; + } + + if (unlikely(n > (NETDATA_DOUBLE)0x00ffffff)) { + n /= 10; + m--; + } + // the value was small enough, and we multiplied it + // so, we add a divider to unpack it + r += (m << 27); // the divider m + } + + r += lrint((double) n); + + return r; +} + +// Lookup table to make storage number unpacking efficient. +NETDATA_DOUBLE unpack_storage_number_lut10x[4 * 8]; + +__attribute__((constructor)) void initialize_lut(void) { + // The lookup table is partitioned in 4 subtables based on the + // values of the factor and exp bits. + for (int i = 0; i < 8; i++) { + // factor = 0 + unpack_storage_number_lut10x[0 * 8 + i] = 1 / pow(10, i); // exp = 0 + unpack_storage_number_lut10x[1 * 8 + i] = pow(10, i); // exp = 1 + + // factor = 1 + unpack_storage_number_lut10x[2 * 8 + i] = 1 / pow(100, i); // exp = 0 + unpack_storage_number_lut10x[3 * 8 + i] = pow(100, i); // exp = 1 + } +} diff --git a/src/libnetdata/storage_number/storage_number.h b/src/libnetdata/storage_number/storage_number.h new file mode 100644 index 00000000..9a95203c --- /dev/null +++ b/src/libnetdata/storage_number/storage_number.h @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_STORAGE_NUMBER_H +#define NETDATA_STORAGE_NUMBER_H 1 + +#include <math.h> +#include "../libnetdata.h" + +#ifdef NETDATA_WITH_LONG_DOUBLE + +typedef long double NETDATA_DOUBLE; +#define NETDATA_DOUBLE_FORMAT "%0.7Lf" +#define NETDATA_DOUBLE_FORMAT_ZERO "%0.0Lf" +#define NETDATA_DOUBLE_FORMAT_AUTO "%Lf" +#define NETDATA_DOUBLE_MODIFIER "Lf" +#define NETDATA_DOUBLE_FORMAT_G "%0.19Le" + +#define NETDATA_DOUBLE_MAX LDBL_MAX + +#define strtondd(s, endptr) strtold(s, endptr) +#define powndd(x, y) powl(x, y) +#define llrintndd(x) llrintl(x) +#define roundndd(x) roundl(x) +#define sqrtndd(x) sqrtl(x) +#define copysignndd(x, y) copysignl(x, y) +#define modfndd(x, y) modfl(x, y) +#define fabsndd(x) fabsl(x) +#define floorndd(x) floorl(x) +#define ceilndd(x) ceill(x) +#define log10ndd(x) log10l(x) + +#else // NETDATA_WITH_LONG_DOUBLE + +typedef double NETDATA_DOUBLE; +#define NETDATA_DOUBLE_FORMAT "%0.7f" +#define NETDATA_DOUBLE_FORMAT_ZERO "%0.0f" +#define NETDATA_DOUBLE_FORMAT_AUTO "%f" +#define NETDATA_DOUBLE_MODIFIER "f" +#define NETDATA_DOUBLE_FORMAT_G "%0.19e" + +#define NETDATA_DOUBLE_MAX DBL_MAX + +#define strtondd(s, endptr) strtod(s, endptr) +#define powndd(x, y) pow(x, y) +#define llrintndd(x) llrint(x) +#define roundndd(x) 
round(x) +#define sqrtndd(x) sqrt(x) +#define copysignndd(x, y) copysign(x, y) +#define modfndd(x, y) modf(x, y) +#define fabsndd(x) fabs(x) +#define floorndd(x) floor(x) +#define ceilndd(x) ceil(x) +#define log10ndd(x) log10(x) + +#endif // NETDATA_WITH_LONG_DOUBLE + +typedef long long collected_number; +#define COLLECTED_NUMBER_FORMAT "%lld" + +#define epsilonndd (NETDATA_DOUBLE)0.0000001 +#define considered_equal_ndd(a, b) (fabsndd((a) - (b)) < epsilonndd) + +#if defined(HAVE_ISFINITE) || defined(isfinite) +// The isfinite() macro shall determine whether its argument has a +// finite value (zero, subnormal, or normal, and not infinite or NaN). +#define netdata_double_isnumber(a) (isfinite(a)) +#elif defined(HAVE_FINITE) || defined(finite) +#define netdata_double_isnumber(a) (finite(a)) +#else +#define netdata_double_isnumber(a) (fpclassify(a) != FP_NAN && fpclassify(a) != FP_INFINITE) +#endif + +#define netdata_double_is_zero(a) (!netdata_double_isnumber(a) || considered_equal_ndd(a, 0.0)) +#define netdata_double_is_nonzero(a) (!netdata_double_is_zero(a)) + +typedef uint32_t storage_number; + +typedef struct storage_number_tier1 { + float sum_value; + float min_value; + float max_value; + uint16_t count; + uint16_t anomaly_count; +} storage_number_tier1_t; + +#define STORAGE_NUMBER_FORMAT "%u" + +typedef enum { + SN_FLAG_NONE = 0, + SN_FLAG_NOT_ANOMALOUS = (1 << 24), // the anomaly bit of the value (0:anomalous, 1:not anomalous) + SN_FLAG_RESET = (1 << 25), // the value has been overflown + SN_FLAG_NOT_EXISTS_MUL100 = (1 << 26), // very large value (multiplier is 100 instead of 10) + SN_FLAG_MULTIPLY = (1 << 30), // multiply, else divide + SN_FLAG_NEGATIVE = (1 << 31), // negative, else positive +} SN_FLAGS; + +#define SN_USER_FLAGS (SN_FLAG_NOT_ANOMALOUS | SN_FLAG_RESET) + +// default flags for all storage numbers +// anomaly bit is reversed, so we set it by default +#define SN_DEFAULT_FLAGS SN_FLAG_NOT_ANOMALOUS + +// When the calculated number is zero and the value is anomalous (ie. it's bit +// is zero) we want to return a storage_number representation that is +// different from the empty slot. We achieve this by mapping zero to +// SN_EXISTS_100. Unpacking the SN_EXISTS_100 value will return zero because +// its fraction field (as well as its exponent factor field) will be zero. 
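+// (SN_EXISTS_100 above is the historical name of the flag now called
+// SN_FLAG_NOT_EXISTS_MUL100.)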
+#define SN_EMPTY_SLOT SN_FLAG_NOT_EXISTS_MUL100 + +// checks +#define does_storage_number_exist(value) (((storage_number)(value)) != SN_EMPTY_SLOT) +#define did_storage_number_reset(value) ((((storage_number)(value)) & SN_FLAG_RESET)) +#define is_storage_number_anomalous(value) (does_storage_number_exist(value) && !(((storage_number)(value)) & SN_FLAG_NOT_ANOMALOUS)) + +storage_number pack_storage_number(NETDATA_DOUBLE value, SN_FLAGS flags) __attribute__((const)); +static inline NETDATA_DOUBLE unpack_storage_number(storage_number value) __attribute__((const)); + +// sign div/mul <--- multiplier / divider ---> 10/100 RESET EXISTS VALUE +#define STORAGE_NUMBER_POSITIVE_MAX_RAW (storage_number)( (0U << 31) | (1U << 30) | (1U << 29) | (1U << 28) | (1U << 27) | (1U << 26) | (0U << 25) | (1U << 24) | 0x00ffffff ) +#define STORAGE_NUMBER_POSITIVE_MIN_RAW (storage_number)( (0U << 31) | (0U << 30) | (1U << 29) | (1U << 28) | (1U << 27) | (0U << 26) | (0U << 25) | (1U << 24) | 0x00000001 ) +#define STORAGE_NUMBER_NEGATIVE_MAX_RAW (storage_number)( (1U << 31) | (0U << 30) | (1U << 29) | (1U << 28) | (1U << 27) | (0U << 26) | (0U << 25) | (1U << 24) | 0x00000001 ) +#define STORAGE_NUMBER_NEGATIVE_MIN_RAW (storage_number)( (1U << 31) | (1U << 30) | (1U << 29) | (1U << 28) | (1U << 27) | (1U << 26) | (0U << 25) | (1U << 24) | 0x00ffffff ) + +// accepted accuracy loss +#define ACCURACY_LOSS_ACCEPTED_PERCENT 0.0001 +#define accuracy_loss(t1, t2) (((t1) == (t2) || (t1) == 0.0 || (t2) == 0.0) ? 0.0 : (100.0 - (((t1) > (t2)) ? ((t2) * 100.0 / (t1) ) : ((t1) * 100.0 / (t2))))) + +// Maximum acceptable rate of increase for counters. With a rate of 10% netdata can safely detect overflows with a +// period of at least every other 10 samples. +#define MAX_INCREMENTAL_PERCENT_RATE 10 + + +static inline NETDATA_DOUBLE unpack_storage_number(storage_number value) { + extern NETDATA_DOUBLE unpack_storage_number_lut10x[4 * 8]; + + if(unlikely(value == SN_EMPTY_SLOT)) + return NAN; + + int sign = 1, exp = 0; + int factor = 0; + + // bit 32 = 0:positive, 1:negative + if(unlikely(value & SN_FLAG_NEGATIVE)) + sign = -1; + + // bit 31 = 0:divide, 1:multiply + if(unlikely(value & SN_FLAG_MULTIPLY)) + exp = 1; + + // bit 27 SN_FLAG_NOT_EXISTS_MUL100 + if(unlikely(value & SN_FLAG_NOT_EXISTS_MUL100)) + factor = 1; + + // bit 26 SN_FLAG_RESET + // bit 25 SN_FLAG_NOT_ANOMALOUS + + // bit 30, 29, 28 = (multiplier or divider) 0-7 (8 total) + int mul = (int)((value & ((1U<<29)|(1U<<28)|(1U<<27))) >> 27); + + // bit 24 to bit 1 = the value, so remove all other bits + value ^= value & ((1U <<31)|(1U <<30)|(1U <<29)|(1U <<28)|(1U <<27)|(1U <<26)|(1U <<25)|(1U<<24)); + + NETDATA_DOUBLE n = value; + + // fprintf(stderr, "UNPACK: %08X, sign = %d, exp = %d, mul = %d, factor = %d, n = " CALCULATED_NUMBER_FORMAT "\n", value, sign, exp, mul, factor, n); + + return sign * unpack_storage_number_lut10x[(factor * 16) + (exp * 8) + mul] * n; +} + +// all these prefixes should use characters that are not allowed in the numbers they represent +#define HEX_PREFIX "0x" // we check 2 characters when parsing +#define IEEE754_UINT64_B64_PREFIX "#" // we check the 1st character during parsing +#define IEEE754_DOUBLE_B64_PREFIX "@" // we check the 1st character during parsing +#define IEEE754_DOUBLE_HEX_PREFIX "%" // we check the 1st character during parsing + +bool is_system_ieee754_double(void); + +#endif /* NETDATA_STORAGE_NUMBER_H */ diff --git a/src/libnetdata/storage_number/tests/test_storage_number.c 
b/src/libnetdata/storage_number/tests/test_storage_number.c
new file mode 100644
index 00000000..19309e5c
--- /dev/null
+++ b/src/libnetdata/storage_number/tests/test_storage_number.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "../../libnetdata.h"
+#include "../../required_dummies.h"
+#include <setjmp.h>
+#include <cmocka.h>
+
+static void test_number_printing(void **state)
+{
+    (void)state;
+
+    char value[50];
+
+    print_netdata_double(value, 0);
+    assert_string_equal(value, "0");
+
+    print_netdata_double(value, 0.0000001);
+    assert_string_equal(value, "0.0000001");
+
+    print_netdata_double(value, 0.00000009);
+    assert_string_equal(value, "0.0000001");
+
+    print_netdata_double(value, 0.000000001);
+    assert_string_equal(value, "0");
+
+    print_netdata_double(value, 99.99999999999999999);
+    assert_string_equal(value, "100");
+
+    print_netdata_double(value, -99.99999999999999999);
+    assert_string_equal(value, "-100");
+
+    print_netdata_double(value, 123.4567890123456789);
+    assert_string_equal(value, "123.456789");
+
+    print_netdata_double(value, 9999.9999999);
+    assert_string_equal(value, "9999.9999999");
+
+    print_netdata_double(value, -9999.9999999);
+    assert_string_equal(value, "-9999.9999999");
+
+    print_netdata_double(value, unpack_storage_number(pack_storage_number(16.777218L, SN_DEFAULT_FLAGS)));
+    assert_string_equal(value, "16.77722");
+}
+
+int main(void)
+{
+    const struct CMUnitTest tests[] = {
+        cmocka_unit_test(test_number_printing)
+    };
+
+    return cmocka_run_group_tests_name("storage_number", tests, NULL, NULL);
+}
diff --git a/src/libnetdata/string/README.md b/src/libnetdata/string/README.md
new file mode 100644
index 00000000..54c90594
--- /dev/null
+++ b/src/libnetdata/string/README.md
@@ -0,0 +1,25 @@
+<!--
+title: "String"
+custom_edit_url: https://github.com/netdata/netdata/edit/master/src/libnetdata/string/README.md
+sidebar_label: "String"
+learn_status: "Published"
+learn_topic_type: "Tasks"
+learn_rel_path: "Developers/libnetdata"
+-->
+
+# STRING
+
+STRING provides a way to allocate and free text strings, while de-duplicating them.
+
+It can be used similarly to the libc string functions:
+
+- `strdup()` and `strdupz()` become `string_strdupz()`.
+- `strlen()` becomes `string_strlen()` (and it does not walk through the bytes of the string).
+- `free()` and `freez()` become `string_freez()`.
+
+There is also a special `string_dup()` function that increases the reference counter of a STRING, avoiding the index lookup needed to find it.
+
+Once there is a `STRING *`, the actual `const char *` can be accessed with `string2str()`.
+
+All STRINGs are immutable: the contents of a `const char *` acquired via `string2str()` must never be modified.
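+
+A minimal usage sketch (the string value is arbitrary; error handling is omitted):
+
+```c
+STRING *a = string_strdupz("cpu.user"); // creates the entry, or references an existing one
+STRING *b = string_strdupz("cpu.user"); // de-duplicated: returns the same STRING as 'a'
+STRING *c = string_dup(a);              // one more reference, without an index lookup
+
+const char *s = string2str(a);          // borrowed pointer - never modify or free it
+size_t len = string_strlen(a);          // uses the stored length, no byte walk
+
+string_freez(c);                        // release every reference exactly once;
+string_freez(b);                        // the entry is freed when the last
+string_freez(a);                        // reference is gone
+```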
diff --git a/src/libnetdata/string/string.c b/src/libnetdata/string/string.c new file mode 100644 index 00000000..94c11f4b --- /dev/null +++ b/src/libnetdata/string/string.c @@ -0,0 +1,704 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" +#include <Judy.h> + +typedef int32_t REFCOUNT; + +// ---------------------------------------------------------------------------- +// STRING implementation - dedup all STRING + +#define STRING_PARTITION_SHIFTS (0) +#define STRING_PARTITIONS (256 >> STRING_PARTITION_SHIFTS) +#define string_partition_str(str) ((uint8_t)((str)[0]) >> STRING_PARTITION_SHIFTS) +#define string_partition(string) (string_partition_str((string)->str)) + +struct netdata_string { + uint32_t length; // the string length including the terminating '\0' + + REFCOUNT refcount; // how many times this string is used + // We use a signed number to be able to detect duplicate frees of a string. + // If at any point this goes below zero, we have a duplicate free. + + const char str[]; // the string itself, is appended to this structure +}; + +static struct string_partition { + RW_SPINLOCK spinlock; // the R/W spinlock to protect the Judy array + + Pvoid_t JudyHSArray; // the Judy array - hashtable + + size_t inserts; // the number of successful inserts to the index + size_t deletes; // the number of successful deletes from the index + + long int entries; // the number of entries in the index + long int memory; // the memory used, without the JudyHS index + +#ifdef NETDATA_INTERNAL_CHECKS + // internal statistics + + struct { + size_t searches; // the number of successful searches in the index + size_t releases; // when a string is unreferenced + size_t duplications; // when a string is referenced + long int active_references; // the number of active references alive + } atomic; + + size_t found_deleted_on_search; + size_t found_available_on_search; + size_t found_deleted_on_insert; + size_t found_available_on_insert; + size_t spins; +#endif + +} string_base[STRING_PARTITIONS] = { 0 }; + +#ifdef NETDATA_INTERNAL_CHECKS +#define string_stats_atomic_increment(partition, var) __atomic_add_fetch(&string_base[partition].atomic.var, 1, __ATOMIC_RELAXED) +#define string_stats_atomic_decrement(partition, var) __atomic_sub_fetch(&string_base[partition].atomic.var, 1, __ATOMIC_RELAXED) +#define string_internal_stats_add(partition, var, val) __atomic_add_fetch(&string_base[partition].var, val, __ATOMIC_RELAXED) +#else +#define string_stats_atomic_increment(partition, var) do {;} while(0) +#define string_stats_atomic_decrement(partition, var) do {;} while(0) +#define string_internal_stats_add(partition, var, val) do {;} while(0) +#endif + +void string_statistics(size_t *inserts, size_t *deletes, size_t *searches, size_t *entries, size_t *references, size_t *memory, size_t *duplications, size_t *releases) { + if (inserts) *inserts = 0; + if (deletes) *deletes = 0; + if (searches) *searches = 0; + if (entries) *entries = 0; + if (references) *references = 0; + if (memory) *memory = 0; + if (duplications) *duplications = 0; + if (releases) *releases = 0; + + for(size_t i = 0; i < STRING_PARTITIONS ;i++) { + if (inserts) *inserts += string_base[i].inserts; + if (deletes) *deletes += string_base[i].deletes; + if (entries) *entries += (size_t) string_base[i].entries; + if (memory) *memory += (size_t) string_base[i].memory; + +#ifdef NETDATA_INTERNAL_CHECKS + if (searches) *searches += string_base[i].atomic.searches; + if (references) *references += (size_t)
string_base[i].atomic.active_references; + if (duplications) *duplications += string_base[i].atomic.duplications; + if (releases) *releases += string_base[i].atomic.releases; +#endif + } +} + +#define string_entry_acquire(se) __atomic_add_fetch(&((se)->refcount), 1, __ATOMIC_SEQ_CST) +#define string_entry_release(se) __atomic_sub_fetch(&((se)->refcount), 1, __ATOMIC_SEQ_CST) + +static inline bool string_entry_check_and_acquire(STRING *se) { +#ifdef NETDATA_INTERNAL_CHECKS + uint8_t partition = string_partition(se); +#endif + + REFCOUNT expected, desired, count = 0; + + expected = __atomic_load_n(&se->refcount, __ATOMIC_SEQ_CST); + + do { + count++; + + if(expected <= 0) { + // We cannot use this. + // The reference counter reached value zero, + // so another thread is deleting this. + string_internal_stats_add(partition, spins, count - 1); + return false; + } + + desired = expected + 1; + + } while(!__atomic_compare_exchange_n(&se->refcount, &expected, desired, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)); + + string_internal_stats_add(partition, spins, count - 1); + + // statistics + // string_base.active_references is altered in string_strdupz() and string_freez() + string_stats_atomic_increment(partition, duplications); + + return true; +} + +STRING *string_dup(STRING *string) { + if(unlikely(!string)) return NULL; + +#ifdef NETDATA_INTERNAL_CHECKS + if(unlikely(__atomic_load_n(&string->refcount, __ATOMIC_SEQ_CST) <= 0)) + fatal("STRING: tried to %s() a string that is freed (it has %d references).", __FUNCTION__, string->refcount); +#endif + + string_entry_acquire(string); + +#ifdef NETDATA_INTERNAL_CHECKS + uint8_t partition = string_partition(string); +#endif + + // statistics + string_stats_atomic_increment(partition, active_references); + string_stats_atomic_increment(partition, duplications); + + return string; +} + +// Search the index and return an ACQUIRED string entry, or NULL +static inline STRING *string_index_search(const char *str, size_t length) { + STRING *string; + + uint8_t partition = string_partition_str(str); + + // Find the string in the index + // With a read-lock so that multiple readers can use the index concurrently. + + rw_spinlock_read_lock(&string_base[partition].spinlock); + + Pvoid_t *Rc; + Rc = JudyHSGet(string_base[partition].JudyHSArray, (void *)str, length - 1); + if(likely(Rc)) { + // found in the hash table + string = *Rc; + + if(string_entry_check_and_acquire(string)) { + // we can use this entry + string_internal_stats_add(partition, found_available_on_search, 1); + } + else { + // this entry is about to be deleted by another thread + // do not touch it, let it go... + string = NULL; + string_internal_stats_add(partition, found_deleted_on_search, 1); + } + } + else { + // not found in the hash table + string = NULL; + } + + string_stats_atomic_increment(partition, searches); + rw_spinlock_read_unlock(&string_base[partition].spinlock); + + return string; +} + +// Insert a string to the index and return an ACQUIRED string entry, +// or NULL if the call needs to be retried (a deleted entry with the same key is still in the index) +// The returned entry is ACQUIRED, and it can either be: +// 1. a new item inserted, or +// 2.
an item found in the index that is not currently deleted +static inline STRING *string_index_insert(const char *str, size_t length) { + STRING *string; + + uint8_t partition = string_partition_str(str); + + rw_spinlock_write_lock(&string_base[partition].spinlock); + + STRING **ptr; + { + JError_t J_Error; + Pvoid_t *Rc = JudyHSIns(&string_base[partition].JudyHSArray, (void *)str, length - 1, &J_Error); + if (unlikely(Rc == PJERR)) { + fatal( + "STRING: Cannot insert entry with name '%s' to JudyHS, JU_ERRNO_* == %u, ID == %d", + str, + JU_ERRNO(&J_Error), + JU_ERRID(&J_Error)); + } + ptr = (STRING **)Rc; + } + + if (likely(*ptr == 0)) { + // a new item added to the index + size_t mem_size = sizeof(STRING) + length; + string = mallocz(mem_size); + strcpy((char *)string->str, str); + string->length = length; + string->refcount = 1; + *ptr = string; + string_base[partition].inserts++; + string_base[partition].entries++; + string_base[partition].memory += (long)(mem_size + JUDYHS_INDEX_SIZE_ESTIMATE(length)); + } + else { + // the item is already in the index + string = *ptr; + + if(string_entry_check_and_acquire(string)) { + // we can use this entry + string_internal_stats_add(partition, found_available_on_insert, 1); + } + else { + // this entry is about to be deleted by another thread + // do not touch it, let it go... + string = NULL; + string_internal_stats_add(partition, found_deleted_on_insert, 1); + } + + string_stats_atomic_increment(partition, searches); + } + + rw_spinlock_write_unlock(&string_base[partition].spinlock); + return string; +} + +// delete an entry from the index +static inline void string_index_delete(STRING *string) { + uint8_t partition = string_partition(string); + + rw_spinlock_write_lock(&string_base[partition].spinlock); + +#ifdef NETDATA_INTERNAL_CHECKS + if(unlikely(__atomic_load_n(&string->refcount, __ATOMIC_SEQ_CST) != 0)) + fatal("STRING: tried to delete a string at %s() that is already freed (it has %d references).", __FUNCTION__, string->refcount); +#endif + + bool deleted = false; + + if (likely(string_base[partition].JudyHSArray)) { + JError_t J_Error; + int ret = JudyHSDel(&string_base[partition].JudyHSArray, (void *)string->str, string->length - 1, &J_Error); + if (unlikely(ret == JERR)) { + netdata_log_error( + "STRING: Cannot delete entry with name '%s' from JudyHS, JU_ERRNO_* == %u, ID == %d", + string->str, + JU_ERRNO(&J_Error), + JU_ERRID(&J_Error)); + } else + deleted = true; + } + + if (unlikely(!deleted)) + netdata_log_error("STRING: tried to delete '%s' that is not in the index. Ignoring it.", string->str); + else { + size_t mem_size = sizeof(STRING) + string->length; + string_base[partition].deletes++; + string_base[partition].entries--; + string_base[partition].memory -= (long)(mem_size + JUDYHS_INDEX_SIZE_ESTIMATE(string->length)); + freez(string); + } + + rw_spinlock_write_unlock(&string_base[partition].spinlock); +} + +STRING *string_strdupz(const char *str) { + if(unlikely(!str || !*str)) return NULL; + +#ifdef NETDATA_INTERNAL_CHECKS + uint8_t partition = string_partition_str(str); +#endif + + size_t length = strlen(str) + 1; + STRING *string = string_index_search(str, length); + + while(!string) { + // The search above did not find anything, + // We loop here, because during insert we may find an entry that is being deleted by another thread. + // So, we have to let it go and retry to insert it again. 
+ + string = string_index_insert(str, length); + } + + // statistics + string_stats_atomic_increment(partition, active_references); + + return string; +} + +STRING *string_strndupz(const char *str, size_t len) { + if(unlikely(!str || !*str || !len)) return NULL; + +#ifdef NETDATA_INTERNAL_CHECKS + uint8_t partition = string_partition_str(str); +#endif + + char buf[len + 1]; + memcpy(buf, str, len); + buf[len] = '\0'; + + STRING *string = string_index_search(buf, len + 1); + while(!string) + string = string_index_insert(buf, len + 1); + + string_stats_atomic_increment(partition, active_references); + return string; +} + +void string_freez(STRING *string) { + if(unlikely(!string)) return; + +#ifdef NETDATA_INTERNAL_CHECKS + uint8_t partition = string_partition(string); +#endif + REFCOUNT refcount = string_entry_release(string); + +#ifdef NETDATA_INTERNAL_CHECKS + if(unlikely(refcount < 0)) + fatal("STRING: tried to %s() a string that is already freed (it has %d references).", __FUNCTION__, string->refcount); +#endif + + if(unlikely(refcount == 0)) + string_index_delete(string); + + // statistics + string_stats_atomic_decrement(partition, active_references); + string_stats_atomic_increment(partition, releases); +} + +inline size_t string_strlen(STRING *string) { + if(unlikely(!string)) return 0; + return string->length - 1; +} + +inline const char *string2str(STRING *string) { + if(unlikely(!string)) return ""; + return string->str; +} + +STRING *string_2way_merge(STRING *a, STRING *b) { + static STRING *X = NULL; + + if(unlikely(!X)) { + X = string_strdupz("[x]"); + } + + if(unlikely(a == b)) return string_dup(a); + if(unlikely(a == X)) return string_dup(a); + if(unlikely(b == X)) return string_dup(b); + if(unlikely(!a)) return string_dup(X); + if(unlikely(!b)) return string_dup(X); + + size_t alen = string_strlen(a); + size_t blen = string_strlen(b); + size_t length = alen + blen + string_strlen(X) + 1; + char buf1[length + 1], buf2[length + 1], *dst1; + const char *s1, *s2; + + s1 = string2str(a); + s2 = string2str(b); + dst1 = buf1; + for( ; *s1 && *s2 && *s1 == *s2 ;s1++, s2++) + *dst1++ = *s1; + + *dst1 = '\0'; + + if(*s1 != '\0' || *s2 != '\0') { + *dst1++ = '['; + *dst1++ = 'x'; + *dst1++ = ']'; + + s1 = &(string2str(a))[alen - 1]; + s2 = &(string2str(b))[blen - 1]; + char *dst2 = &buf2[length]; + *dst2 = '\0'; + for (; *s1 && *s2 && *s1 == *s2; s1--, s2--) + *(--dst2) = *s1; + + strcpy(dst1, dst2); + } + + return string_strdupz(buf1); +} + +// ---------------------------------------------------------------------------- +// STRING unit test + +struct thread_unittest { + int join; + int dups; +}; + +static void *string_thread(void *arg) { + struct thread_unittest *tu = arg; + + for(; 1 ;) { + if(__atomic_load_n(&tu->join, __ATOMIC_RELAXED)) + break; + + STRING *s = string_strdupz("string thread checking 1234567890"); + + for(int i = 0; i < tu->dups ; i++) + string_dup(s); + + for(int i = 0; i < tu->dups ; i++) + string_freez(s); + + string_freez(s); + } + + return arg; +} + +static char **string_unittest_generate_names(size_t entries) { + char **names = mallocz(sizeof(char *) * entries); + for(size_t i = 0; i < entries ;i++) { + char buf[25 + 1] = ""; + snprintfz(buf, sizeof(buf) - 1, "name.%zu.0123456789.%zu \t !@#$%%^&*(),./[]{}\\|~`", i, entries / 2 + i); + names[i] = strdupz(buf); + } + return names; +} + +static void string_unittest_free_char_pp(char **pp, size_t entries) { + for(size_t i = 0; i < entries ;i++) + freez(pp[i]); + + freez(pp); +} + +static long 
unittest_string_entries(void) { + long entries = 0; + for(size_t p = 0; p < STRING_PARTITIONS ;p++) + entries += string_base[p].entries; + + return entries; +} + +#ifdef NETDATA_INTERNAL_CHECKS + +static size_t unittest_string_found_deleted_on_search(void) { + size_t entries = 0; + for(size_t p = 0; p < STRING_PARTITIONS ;p++) + entries += string_base[p].found_deleted_on_search; + + return entries; +} +static size_t unittest_string_found_available_on_search(void) { + size_t entries = 0; + for(size_t p = 0; p < STRING_PARTITIONS ;p++) + entries += string_base[p].found_available_on_search; + + return entries; +} +static size_t unittest_string_found_deleted_on_insert(void) { + size_t entries = 0; + for(size_t p = 0; p < STRING_PARTITIONS ;p++) + entries += string_base[p].found_deleted_on_insert; + + return entries; +} +static size_t unittest_string_found_available_on_insert(void) { + size_t entries = 0; + for(size_t p = 0; p < STRING_PARTITIONS ;p++) + entries += string_base[p].found_available_on_insert; + + return entries; +} +static size_t unittest_string_spins(void) { + size_t entries = 0; + for(size_t p = 0; p < STRING_PARTITIONS ;p++) + entries += string_base[p].spins; + + return entries; +} + +#endif // NETDATA_INTERNAL_CHECKS + +int string_unittest(size_t entries) { + size_t errors = 0; + + fprintf(stderr, "Generating %zu names and values...\n", entries); + char **names = string_unittest_generate_names(entries); + + // check string + { + long entries_starting = unittest_string_entries(); + + fprintf(stderr, "\nChecking strings...\n"); + + STRING *s1 = string_strdupz("hello unittest"); + STRING *s2 = string_strdupz("hello unittest"); + if(s1 != s2) { + errors++; + fprintf(stderr, "ERROR: duplicate strings are not deduplicated\n"); + } + else + fprintf(stderr, "OK: duplicate strings are deduplicated\n"); + + STRING *s3 = string_dup(s1); + if(s3 != s1) { + errors++; + fprintf(stderr, "ERROR: cloned strings are not deduplicated\n"); + } + else + fprintf(stderr, "OK: cloned strings are deduplicated\n"); + + if(s1->refcount != 3) { + errors++; + fprintf(stderr, "ERROR: string refcount is not 3\n"); + } + else + fprintf(stderr, "OK: string refcount is 3\n"); + + STRING *s4 = string_strdupz("world unittest"); + if(s4 == s1) { + errors++; + fprintf(stderr, "ERROR: string is sharing pointers on different strings\n"); + } + else + fprintf(stderr, "OK: string is properly handling different strings\n"); + + usec_t start_ut, end_ut; + STRING **strings = mallocz(entries * sizeof(STRING *)); + + start_ut = now_realtime_usec(); + for(size_t i = 0; i < entries ;i++) { + strings[i] = string_strdupz(names[i]); + } + end_ut = now_realtime_usec(); + fprintf(stderr, "Created %zu strings in %"PRIu64" usecs\n", entries, end_ut - start_ut); + + start_ut = now_realtime_usec(); + for(size_t i = 0; i < entries ;i++) { + strings[i] = string_dup(strings[i]); + } + end_ut = now_realtime_usec(); + fprintf(stderr, "Cloned %zu strings in %"PRIu64" usecs\n", entries, end_ut - start_ut); + + start_ut = now_realtime_usec(); + for(size_t i = 0; i < entries ;i++) { + strings[i] = string_strdupz(string2str(strings[i])); + } + end_ut = now_realtime_usec(); + fprintf(stderr, "Found %zu existing strings in %"PRIu64" usecs\n", entries, end_ut - start_ut); + + start_ut = now_realtime_usec(); + for(size_t i = 0; i < entries ;i++) { + string_freez(strings[i]); + } + end_ut = now_realtime_usec(); + fprintf(stderr, "Released %zu referenced strings in %"PRIu64" usecs\n", entries, end_ut - start_ut); + + start_ut =
now_realtime_usec(); + for(size_t i = 0; i < entries ;i++) { + string_freez(strings[i]); + } + end_ut = now_realtime_usec(); + fprintf(stderr, "Released (again) %zu referenced strings in %"PRIu64" usecs\n", entries, end_ut - start_ut); + + start_ut = now_realtime_usec(); + for(size_t i = 0; i < entries ;i++) { + string_freez(strings[i]); + } + end_ut = now_realtime_usec(); + fprintf(stderr, "Freed %zu strings in %"PRIu64" usecs\n", entries, end_ut - start_ut); + + freez(strings); + + if(unittest_string_entries() != entries_starting + 2) { + errors++; + fprintf(stderr, "ERROR: strings dictionary should have %ld items but it has %ld\n", + entries_starting + 2, unittest_string_entries()); + } + else + fprintf(stderr, "OK: strings dictionary has 2 items\n"); + } + + // check 2-way merge + { + struct testcase { + char *src1; char *src2; char *expected; + } tests[] = { + { "", "", ""}, + { "a", "", "[x]"}, + { "", "a", "[x]"}, + { "a", "a", "a"}, + { "abcd", "abcd", "abcd"}, + { "foo_cs", "bar_cs", "[x]_cs"}, + { "cp_UNIQUE_INFIX_cs", "cp_unique_infix_cs", "cp_[x]_cs"}, + { "cp_UNIQUE_INFIX_ci_unique_infix_cs", "cp_unique_infix_ci_UNIQUE_INFIX_cs", "cp_[x]_cs"}, + { "foo[1234]", "foo[4321]", "foo[[x]]"}, + { NULL, NULL, NULL }, + }; + + for (struct testcase *tc = &tests[0]; tc->expected != NULL; tc++) { + STRING *src1 = string_strdupz(tc->src1); + STRING *src2 = string_strdupz(tc->src2); + STRING *expected = string_strdupz(tc->expected); + + STRING *result = string_2way_merge(src1, src2); + if (string_cmp(result, expected) != 0) { + fprintf(stderr, "string_2way_merge(\"%s\", \"%s\") -> \"%s\" (expected=\"%s\")\n", + string2str(src1), + string2str(src2), + string2str(result), + string2str(expected)); + errors++; + } + + string_freez(src1); + string_freez(src2); + string_freez(expected); + string_freez(result); + } + } + + // threads testing of string + { + struct thread_unittest tu = { + .dups = 1, + .join = 0, + }; + +#ifdef NETDATA_INTERNAL_CHECKS + size_t ofound_deleted_on_search = unittest_string_found_deleted_on_search(), + ofound_available_on_search = unittest_string_found_available_on_search(), + ofound_deleted_on_insert = unittest_string_found_deleted_on_insert(), + ofound_available_on_insert = unittest_string_found_available_on_insert(), + ospins = unittest_string_spins(); +#endif + + size_t oinserts, odeletes, osearches, oentries, oreferences, omemory, oduplications, oreleases; + string_statistics(&oinserts, &odeletes, &osearches, &oentries, &oreferences, &omemory, &oduplications, &oreleases); + + time_t seconds_to_run = 5; + int threads_to_create = 2; + fprintf( + stderr, + "Checking string concurrency with %d threads for %lld seconds...\n", + threads_to_create, + (long long)seconds_to_run); + // check string concurrency + ND_THREAD *threads[threads_to_create]; + tu.join = 0; + for (int i = 0; i < threads_to_create; i++) { + char buf[100 + 1]; + snprintf(buf, 100, "string%d", i); + threads[i] = nd_thread_create(buf, NETDATA_THREAD_OPTION_DONT_LOG | NETDATA_THREAD_OPTION_JOINABLE, string_thread, &tu); + } + sleep_usec(seconds_to_run * USEC_PER_SEC); + + __atomic_store_n(&tu.join, 1, __ATOMIC_RELAXED); + for (int i = 0; i < threads_to_create; i++) + nd_thread_join(threads[i]); + + size_t inserts, deletes, searches, sentries, references, memory, duplications, releases; + string_statistics(&inserts, &deletes, &searches, &sentries, &references, &memory, &duplications, &releases); + + fprintf(stderr, "inserts %zu, deletes %zu, searches %zu, entries %zu, references %zu, memory %zu, 
duplications %zu, releases %zu\n", + inserts - oinserts, deletes - odeletes, searches - osearches, sentries - oentries, references - oreferences, memory - omemory, duplications - oduplications, releases - oreleases); + +#ifdef NETDATA_INTERNAL_CHECKS + size_t found_deleted_on_search = unittest_string_found_deleted_on_search(), + found_available_on_search = unittest_string_found_available_on_search(), + found_deleted_on_insert = unittest_string_found_deleted_on_insert(), + found_available_on_insert = unittest_string_found_available_on_insert(), + spins = unittest_string_spins(); + + fprintf(stderr, "on insert: %zu ok + %zu deleted\non search: %zu ok + %zu deleted\nspins: %zu\n", + found_available_on_insert - ofound_available_on_insert, + found_deleted_on_insert - ofound_deleted_on_insert, + found_available_on_search - ofound_available_on_search, + found_deleted_on_search - ofound_deleted_on_search, + spins - ospins + ); +#endif + } + + string_unittest_free_char_pp(names, entries); + + fprintf(stderr, "\n%zu errors found\n", errors); + return errors ? 1 : 0; +} diff --git a/src/libnetdata/string/string.h b/src/libnetdata/string/string.h new file mode 100644 index 00000000..f2ff9666 --- /dev/null +++ b/src/libnetdata/string/string.h @@ -0,0 +1,37 @@ + +#ifndef NETDATA_STRING_H +#define NETDATA_STRING_H 1 + +#include "../libnetdata.h" + +// ---------------------------------------------------------------------------- +// STRING implementation + +typedef struct netdata_string STRING; + +STRING *string_strdupz(const char *str); +STRING *string_strndupz(const char *str, size_t len); + +STRING *string_dup(STRING *string); +void string_freez(STRING *string); +size_t string_strlen(STRING *string); +const char *string2str(STRING *string) NEVERNULL; + +// keep common prefix/suffix and replace everything else with [x] +STRING *string_2way_merge(STRING *a, STRING *b); + +static inline int string_cmp(STRING *s1, STRING *s2) { + // STRINGs are deduplicated, so the same strings have the same pointer + // when they differ, we do the typical strcmp() comparison + return (s1 == s2)?0:strcmp(string2str(s1), string2str(s2)); +} + +static inline int string_strcmp(STRING *string, const char *s) { + return strcmp(string2str(string), s); +} + +void string_statistics(size_t *inserts, size_t *deletes, size_t *searches, size_t *entries, size_t *references, size_t *memory, size_t *duplications, size_t *releases); + +int string_unittest(size_t entries); + +#endif diff --git a/src/libnetdata/string/utf8.h b/src/libnetdata/string/utf8.h new file mode 100644 index 00000000..3e6c8c28 --- /dev/null +++ b/src/libnetdata/string/utf8.h @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_STRING_UTF8_H +#define NETDATA_STRING_UTF8_H 1 + +#define IS_UTF8_BYTE(x) ((x) & 0x80) +#define IS_UTF8_STARTBYTE(x) (IS_UTF8_BYTE(x)&&((x) & 0x40)) + +#endif /* NETDATA_STRING_UTF8_H */ diff --git a/src/libnetdata/template-enum.h b/src/libnetdata/template-enum.h new file mode 100644 index 00000000..393a6a94 --- /dev/null +++ b/src/libnetdata/template-enum.h @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_TEMPLATE_ENUM_H +#define NETDATA_TEMPLATE_ENUM_H + +#define ENUM_STR_MAP_DEFINE(type) \ + static struct { \ + type id; \ + const char *name; \ + } type ## _names[] + +#define ENUM_STR_DEFINE_FUNCTIONS_EXTERN(type) \ + type type ## _2id(const char *str); \ + const char *type##_2str(type id); + +#define ENUM_STR_DEFINE_FUNCTIONS(type, def, def_str) \ + type type##_2id(const char *str) 
\ + { \ + if (!str || !*str) \ + return def; \ + \ + for (size_t i = 0; type ## _names[i].name; i++) { \ + if (strcmp(type ## _names[i].name, str) == 0) \ + return type ## _names[i].id; \ + } \ + \ + return def; \ + } \ + \ + const char *type##_2str(type id) \ + { \ + for (size_t i = 0; type ## _names[i].name; i++) { \ + if (id == type ## _names[i].id) \ + return type ## _names[i].name; \ + } \ + \ + return def_str; \ + } + +#endif //NETDATA_TEMPLATE_ENUM_H diff --git a/src/libnetdata/tests/test_str2ld.c b/src/libnetdata/tests/test_str2ld.c new file mode 100644 index 00000000..8b97a70f --- /dev/null +++ b/src/libnetdata/tests/test_str2ld.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" +#include "../required_dummies.h" +#include <setjmp.h> +#include <cmocka.h> + +static void test_str2ld(void **state) +{ + (void)state; + char *values[] = { + "1.2345678", + "-35.6", + "0.00123", + "23842384234234.2", + ".1", + "1.2e-10", + "hello", + "1wrong", + "nan", + "inf", + NULL + }; + + for (int i = 0; values[i]; i++) { + char *e_mine = "hello", *e_sys = "world"; + NETDATA_DOUBLE mine = str2ndd(values[i], &e_mine); + NETDATA_DOUBLE sys = strtondd(values[i], &e_sys); + + if (isnan(mine)) + assert_true(isnan(sys)); + else if (isinf(mine)) + assert_true(isinf(sys)); + else if (mine != sys) + assert_false(ABS(mine - sys) > 0.000001); + + assert_ptr_equal(e_mine, e_sys); + } +} + +int main(void) +{ + const struct CMUnitTest tests[] = { + cmocka_unit_test(test_str2ld) + }; + + return cmocka_run_group_tests_name("str2ld", tests, NULL, NULL); +} diff --git a/src/libnetdata/threads/README.md b/src/libnetdata/threads/README.md new file mode 100644 index 00000000..906f4795 --- /dev/null +++ b/src/libnetdata/threads/README.md @@ -0,0 +1,12 @@ +<!-- +title: Threads +custom_edit_url: https://github.com/netdata/netdata/edit/master/src/libnetdata/threads/README.md +sidebar_label: "Threads" +learn_status: "Published" +learn_topic_type: "Tasks" +learn_rel_path: "Developers/libnetdata" +--> + +# Threads + +Netdata uses a custom threads library diff --git a/src/libnetdata/threads/threads.c b/src/libnetdata/threads/threads.c new file mode 100644 index 00000000..0e12d173 --- /dev/null +++ b/src/libnetdata/threads/threads.c @@ -0,0 +1,437 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +#define nd_thread_status_get(nti) __atomic_load_n(&((nti)->options), __ATOMIC_ACQUIRE) +#define nd_thread_status_check(nti, flag) (__atomic_load_n(&((nti)->options), __ATOMIC_ACQUIRE) & (flag)) +#define nd_thread_status_set(nti, flag) __atomic_or_fetch(&((nti)->options), flag, __ATOMIC_RELEASE) +#define nd_thread_status_clear(nti, flag) __atomic_and_fetch(&((nti)->options), ~(flag), __ATOMIC_RELEASE) + +typedef void (*nd_thread_canceller)(void *data); + +struct nd_thread { + void *arg; + pid_t tid; + char tag[ND_THREAD_TAG_MAX + 1]; + void *ret; // the return value of start routine + void *(*start_routine) (void *); + NETDATA_THREAD_OPTIONS options; + pthread_t thread; + bool cancel_atomic; + +#ifdef NETDATA_INTERNAL_CHECKS + // keep track of the locks currently held + // used to detect locks that are left locked during exit + int rwlocks_read_locks; + int rwlocks_write_locks; + int mutex_locks; + int spinlock_locks; + int rwspinlock_read_locks; + int rwspinlock_write_locks; +#endif + + struct { + SPINLOCK spinlock; + nd_thread_canceller cb; + void *data; + } canceller; + + struct nd_thread *prev, *next; +}; + +static struct { + struct { + SPINLOCK spinlock; + 
ND_THREAD *list; + } exited; + + struct { + SPINLOCK spinlock; + ND_THREAD *list; + } running; + + pthread_attr_t *attr; +} threads_globals = { + .exited = { + .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .list = NULL, + }, + .running = { + .spinlock = NETDATA_SPINLOCK_INITIALIZER, + .list = NULL, + }, + .attr = NULL, +}; + +static __thread ND_THREAD *_nd_thread_info = NULL; +static __thread char _nd_thread_os_name[ND_THREAD_TAG_MAX + 1] = ""; + +// -------------------------------------------------------------------------------------------------------------------- +// O/S abstraction + +// get the thread name from the operating system +static inline void os_get_thread_name(char *out, size_t size) { +#if defined(__FreeBSD__) + pthread_get_name_np(pthread_self(), out, size); + if(strcmp(out, "netdata") == 0) + strncpyz(out, "MAIN", size - 1); +#elif defined(HAVE_PTHREAD_GETNAME_NP) + pthread_getname_np(pthread_self(), out, size - 1); + if(strcmp(out, "netdata") == 0) + strncpyz(out, "MAIN", size - 1); +#else + strncpyz(out, "MAIN", size - 1); +#endif +} + +// set the thread name to the operating system +static inline void os_set_thread_name(const char *name) { +#if defined(__FreeBSD__) + pthread_set_name_np(pthread_self(), name); +#elif defined(__APPLE__) + pthread_setname_np(name); +#else + pthread_setname_np(pthread_self(), name); +#endif +} + +// -------------------------------------------------------------------------------------------------------------------- +// internal API for managing names + +inline int nd_thread_has_tag(void) { + return (_nd_thread_info && _nd_thread_info->tag[0]); +} + +// For threads created by netdata, return the tag of the thread. +// For threads created by others (libuv, webrtc, etc), return the tag of the operating system. +// This caches the response, so that it won't query the operating system multiple times.
+static inline const char *nd_thread_get_name(bool recheck) { + if(nd_thread_has_tag()) + return _nd_thread_info->tag; + + if(!recheck && _nd_thread_os_name[0]) + return _nd_thread_os_name; + + os_get_thread_name(_nd_thread_os_name, sizeof(_nd_thread_os_name)); + + return _nd_thread_os_name; +} + +const char *nd_thread_tag(void) { + return nd_thread_get_name(false); +} + +void nd_thread_tag_set(const char *tag) { + if(!tag || !*tag) return; + + if(_nd_thread_info) + strncpyz(_nd_thread_info->tag, tag, sizeof(_nd_thread_info->tag) - 1); + + strncpyz(_nd_thread_os_name, tag, sizeof(_nd_thread_os_name) - 1); + + os_set_thread_name(_nd_thread_os_name); +} + +// -------------------------------------------------------------------------------------------------------------------- + +static __thread bool libuv_name_set = false; +void uv_thread_set_name_np(const char* name) { + if(libuv_name_set) return; + + strncpyz(_nd_thread_os_name, name, sizeof(_nd_thread_os_name) - 1); + os_set_thread_name(_nd_thread_os_name); + libuv_name_set = true; +} + +// -------------------------------------------------------------------------------------------------------------------- + +static size_t webrtc_id = 0; +static __thread bool webrtc_name_set = false; +void webrtc_set_thread_name(void) { + if(_nd_thread_info || webrtc_name_set) return; + + webrtc_name_set = true; + + char tmp[ND_THREAD_TAG_MAX + 1] = ""; + os_get_thread_name(tmp, sizeof(tmp)); + + if(!tmp[0] || strcmp(tmp, "netdata") == 0) { + char name[ND_THREAD_TAG_MAX + 1]; + snprintfz(name, ND_THREAD_TAG_MAX, "WEBRTC[%zu]", __atomic_fetch_add(&webrtc_id, 1, __ATOMIC_RELAXED)); + os_set_thread_name(name); + } + + nd_thread_get_name(true); +} + +// -------------------------------------------------------------------------------------------------------------------- +// locks tracking + +#ifdef NETDATA_INTERNAL_CHECKS +void nd_thread_rwlock_read_locked(void) { if(_nd_thread_info) _nd_thread_info->rwlocks_read_locks++; } +void nd_thread_rwlock_read_unlocked(void) { if(_nd_thread_info) _nd_thread_info->rwlocks_read_locks--; } +void nd_thread_rwlock_write_locked(void) { if(_nd_thread_info) _nd_thread_info->rwlocks_write_locks++; } +void nd_thread_rwlock_write_unlocked(void) { if(_nd_thread_info) _nd_thread_info->rwlocks_write_locks--; } +void nd_thread_mutex_locked(void) { if(_nd_thread_info) _nd_thread_info->mutex_locks++; } +void nd_thread_mutex_unlocked(void) { if(_nd_thread_info) _nd_thread_info->mutex_locks--; } +void nd_thread_spinlock_locked(void) { if(_nd_thread_info) _nd_thread_info->spinlock_locks++; } +void nd_thread_spinlock_unlocked(void) { if(_nd_thread_info) _nd_thread_info->spinlock_locks--; } +void nd_thread_rwspinlock_read_locked(void) { if(_nd_thread_info) _nd_thread_info->rwspinlock_read_locks++; } +void nd_thread_rwspinlock_read_unlocked(void) { if(_nd_thread_info) _nd_thread_info->rwspinlock_read_locks--; } +void nd_thread_rwspinlock_write_locked(void) { if(_nd_thread_info) _nd_thread_info->rwspinlock_write_locks++; } +void nd_thread_rwspinlock_write_unlocked(void) { if(_nd_thread_info) _nd_thread_info->rwspinlock_write_locks--; } +#endif + +// -------------------------------------------------------------------------------------------------------------------- +// early initialization + +size_t netdata_threads_init(void) { + int i; + + if(!threads_globals.attr) { + threads_globals.attr = callocz(1, sizeof(pthread_attr_t)); + i = pthread_attr_init(threads_globals.attr); + if (i != 0) + fatal("pthread_attr_init() failed with code %d.", i); + } 
+ + // get the required stack size of the threads of netdata + size_t stacksize = 0; + i = pthread_attr_getstacksize(threads_globals.attr, &stacksize); + if(i != 0) + fatal("pthread_attr_getstacksize() failed with code %d.", i); + + return stacksize; +} + +// ---------------------------------------------------------------------------- +// late initialization + +void netdata_threads_init_after_fork(size_t stacksize) { + int i; + + // set pthread stack size + if(threads_globals.attr && stacksize > (size_t)PTHREAD_STACK_MIN) { + i = pthread_attr_setstacksize(threads_globals.attr, stacksize); + if(i != 0) + nd_log(NDLS_DAEMON, NDLP_WARNING, "pthread_attr_setstacksize() to %zu bytes, failed with code %d.", stacksize, i); + else + nd_log(NDLS_DAEMON, NDLP_DEBUG, "Set threads stack size to %zu bytes", stacksize); + } + else + nd_log(NDLS_DAEMON, NDLP_WARNING, "Invalid pthread stacksize %zu", stacksize); +} + +// ---------------------------------------------------------------------------- +// threads init for external plugins + +void netdata_threads_init_for_external_plugins(size_t stacksize) { + size_t default_stacksize = netdata_threads_init(); + if(default_stacksize < 1 * 1024 * 1024) + default_stacksize = 1 * 1024 * 1024; + + netdata_threads_init_after_fork(stacksize ? stacksize : default_stacksize); +} + +// ---------------------------------------------------------------------------- + +void rrdset_thread_rda_free(void); +void sender_thread_buffer_free(void); +void query_target_free(void); +void service_exits(void); +void rrd_collector_finished(void); + +static void nd_thread_join_exited_detached_threads(void) { + while(1) { + spinlock_lock(&threads_globals.exited.spinlock); + + ND_THREAD *nti = threads_globals.exited.list; + while (nti && nd_thread_status_check(nti, NETDATA_THREAD_OPTION_JOINABLE) == 0) + nti = nti->next; + + if(nti) + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(threads_globals.exited.list, nti, prev, next); + + spinlock_unlock(&threads_globals.exited.spinlock); + + if(nti) { + nd_log(NDLS_DAEMON, NDLP_INFO, "Joining detached thread '%s', tid %d", nti->tag, nti->tid); + nd_thread_join(nti); + } + else + break; + } +} + +static void nd_thread_exit(void *pptr) { + ND_THREAD *nti = CLEANUP_FUNCTION_GET_PTR(pptr); + + if(nti != _nd_thread_info || !nti || !_nd_thread_info) { + nd_log(NDLS_DAEMON, NDLP_ERR, + "THREADS: internal error - thread local variable does not match the one passed to this function. " + "Expected thread '%s', passed thread '%s'", + _nd_thread_info ? _nd_thread_info->tag : "(null)", nti ? nti->tag : "(null)"); + + if(!nti) nti = _nd_thread_info; + } + + if(!nti) return; + + internal_fatal(nti->rwlocks_read_locks != 0, + "THREAD '%s' WITH PID %d HAS %d RWLOCKS READ ACQUIRED WHILE EXITING !!!", + (nti) ? nti->tag : "(unset)", gettid_cached(), nti->rwlocks_read_locks); + + internal_fatal(nti->rwlocks_write_locks != 0, + "THREAD '%s' WITH PID %d HAS %d RWLOCKS WRITE ACQUIRED WHILE EXITING !!!", + (nti) ? nti->tag : "(unset)", gettid_cached(), nti->rwlocks_write_locks); + + internal_fatal(nti->mutex_locks != 0, + "THREAD '%s' WITH PID %d HAS %d MUTEXES ACQUIRED WHILE EXITING !!!", + (nti) ? nti->tag : "(unset)", gettid_cached(), nti->mutex_locks); + + internal_fatal(nti->spinlock_locks != 0, + "THREAD '%s' WITH PID %d HAS %d SPINLOCKS ACQUIRED WHILE EXITING !!!", + (nti) ? 
nti->tag : "(unset)", gettid_cached(), nti->spinlock_locks); + + internal_fatal(nti->rwspinlock_read_locks != 0, + "THREAD '%s' WITH PID %d HAS %d RWSPINLOCKS READ ACQUIRED WHILE EXITING !!!", + (nti) ? nti->tag : "(unset)", gettid_cached(), nti->rwspinlock_read_locks); + + internal_fatal(nti->rwspinlock_write_locks != 0, + "THREAD '%s' WITH PID %d HAS %d RWSPINLOCKS WRITE ACQUIRED WHILE EXITING !!!", + (nti) ? nti->tag : "(unset)", gettid_cached(), nti->rwspinlock_write_locks); + + if(nd_thread_status_check(nti, NETDATA_THREAD_OPTION_DONT_LOG_CLEANUP) != NETDATA_THREAD_OPTION_DONT_LOG_CLEANUP) + nd_log(NDLS_DAEMON, NDLP_DEBUG, "thread with task id %d finished", nti->tid); + + rrd_collector_finished(); + sender_thread_buffer_free(); + rrdset_thread_rda_free(); + query_target_free(); + thread_cache_destroy(); + service_exits(); + worker_unregister(); + + nd_thread_status_set(nti, NETDATA_THREAD_STATUS_FINISHED); + + spinlock_lock(&threads_globals.running.spinlock); + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(threads_globals.running.list, nti, prev, next); + spinlock_unlock(&threads_globals.running.spinlock); + + if (nd_thread_status_check(nti, NETDATA_THREAD_OPTION_JOINABLE) != NETDATA_THREAD_OPTION_JOINABLE) { + spinlock_lock(&threads_globals.exited.spinlock); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(threads_globals.exited.list, nti, prev, next); + spinlock_unlock(&threads_globals.exited.spinlock); + } +} + +static void *nd_thread_starting_point(void *ptr) { + ND_THREAD *nti = _nd_thread_info = (ND_THREAD *)ptr; + nd_thread_status_set(nti, NETDATA_THREAD_STATUS_STARTED); + + nti->tid = gettid_cached(); + nd_thread_tag_set(nti->tag); + + if(nd_thread_status_check(nti, NETDATA_THREAD_OPTION_DONT_LOG_STARTUP) != NETDATA_THREAD_OPTION_DONT_LOG_STARTUP) + nd_log(NDLS_DAEMON, NDLP_DEBUG, "thread created with task id %d", gettid_cached()); + + if(pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL) != 0) + nd_log(NDLS_DAEMON, NDLP_WARNING, "cannot set pthread cancel type to DEFERRED."); + + if(pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL) != 0) + nd_log(NDLS_DAEMON, NDLP_WARNING, "cannot set pthread cancel state to ENABLE."); + + CLEANUP_FUNCTION_REGISTER(nd_thread_exit) cleanup_ptr = nti; + + // run the thread code + nti->ret = nti->start_routine(nti->arg); + + return nti; +} + +ND_THREAD *nd_thread_self(void) { + return _nd_thread_info; +} + +bool nd_thread_is_me(ND_THREAD *nti) { + return nti && nti->thread == pthread_self(); +} + +ND_THREAD *nd_thread_create(const char *tag, NETDATA_THREAD_OPTIONS options, void *(*start_routine)(void *), void *arg) { + nd_thread_join_exited_detached_threads(); + + ND_THREAD *nti = callocz(1, sizeof(*nti)); + spinlock_init(&nti->canceller.spinlock); + nti->arg = arg; + nti->start_routine = start_routine; + nti->options = options & NETDATA_THREAD_OPTIONS_ALL; + strncpyz(nti->tag, tag, ND_THREAD_TAG_MAX); + + spinlock_lock(&threads_globals.running.spinlock); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(threads_globals.running.list, nti, prev, next); + spinlock_unlock(&threads_globals.running.spinlock); + + int ret = pthread_create(&nti->thread, threads_globals.attr, nd_thread_starting_point, nti); + if(ret != 0) { + nd_log(NDLS_DAEMON, NDLP_ERR, + "failed to create new thread for %s. 
pthread_create() failed with code %d", + tag, ret); + + spinlock_lock(&threads_globals.running.spinlock); + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(threads_globals.running.list, nti, prev, next); + spinlock_unlock(&threads_globals.running.spinlock); + freez(nti); + return NULL; + } + + return nti; +} + +// -------------------------------------------------------------------------------------------------------------------- + +void nd_thread_register_canceller(nd_thread_canceller cb, void *data) { + ND_THREAD *nti = _nd_thread_info; + if(!nti) return; + + spinlock_lock(&nti->canceller.spinlock); + nti->canceller.cb = cb; + nti->canceller.data = data; + spinlock_unlock(&nti->canceller.spinlock); +} + +void nd_thread_signal_cancel(ND_THREAD *nti) { + if(!nti) return; + + __atomic_store_n(&nti->cancel_atomic, true, __ATOMIC_RELAXED); + + spinlock_lock(&nti->canceller.spinlock); + if(nti->canceller.cb) + nti->canceller.cb(nti->canceller.data); + spinlock_unlock(&nti->canceller.spinlock); +} + +bool nd_thread_signaled_to_cancel(void) { + if(!_nd_thread_info) return false; + return __atomic_load_n(&_nd_thread_info->cancel_atomic, __ATOMIC_RELAXED); +} + +// ---------------------------------------------------------------------------- +// nd_thread_join + +void nd_thread_join(ND_THREAD *nti) { + if(!nti) return; + + int ret = pthread_join(nti->thread, NULL); + if(ret != 0) + nd_log(NDLS_DAEMON, NDLP_WARNING, "cannot join thread. pthread_join() failed with code %d.", ret); + else { + nd_thread_status_set(nti, NETDATA_THREAD_STATUS_JOINED); + + spinlock_lock(&threads_globals.exited.spinlock); + if(nti->prev) + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(threads_globals.exited.list, nti, prev, next); + spinlock_unlock(&threads_globals.exited.spinlock); + + freez(nti); + } +} diff --git a/src/libnetdata/threads/threads.h b/src/libnetdata/threads/threads.h new file mode 100644 index 00000000..a7204e2a --- /dev/null +++ b/src/libnetdata/threads/threads.h @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_THREADS_H +#define NETDATA_THREADS_H 1 + +#include "../libnetdata.h" + +typedef enum __attribute__((packed)) { + NETDATA_THREAD_OPTION_DEFAULT = 0 << 0, + NETDATA_THREAD_OPTION_JOINABLE = 1 << 0, + NETDATA_THREAD_OPTION_DONT_LOG_STARTUP = 1 << 1, + NETDATA_THREAD_OPTION_DONT_LOG_CLEANUP = 1 << 2, + NETDATA_THREAD_STATUS_STARTED = 1 << 3, + NETDATA_THREAD_STATUS_FINISHED = 1 << 4, + NETDATA_THREAD_STATUS_JOINED = 1 << 5, +} NETDATA_THREAD_OPTIONS; + +#define NETDATA_THREAD_OPTIONS_ALL (NETDATA_THREAD_OPTION_JOINABLE | NETDATA_THREAD_OPTION_DONT_LOG_STARTUP | NETDATA_THREAD_OPTION_DONT_LOG_CLEANUP) +#define NETDATA_THREAD_OPTION_DONT_LOG (NETDATA_THREAD_OPTION_DONT_LOG_STARTUP | NETDATA_THREAD_OPTION_DONT_LOG_CLEANUP) + +#define netdata_thread_cleanup_push(func, arg) pthread_cleanup_push(func, arg) +#define netdata_thread_cleanup_pop(execute) pthread_cleanup_pop(execute) + +void nd_thread_tag_set(const char *tag); + +typedef struct nd_thread ND_THREAD; + +struct netdata_static_thread { + // the name of the thread as it should appear in the logs + char *name; + + // the section of netdata.conf to check if this is enabled or not + char *config_section; + + // the name of the config option to check if it is true or false + char *config_name; + + // the current status of the thread + volatile sig_atomic_t enabled; + + // internal use, to maintain a pointer to the created thread + ND_THREAD *thread; + + // an initialization function to run before spawning the thread + void (*init_routine)
(void); + + // the threaded worker + void *(*start_routine) (void *); + + // the environment variable to create + char *env_name; + + // global variable + bool *global_variable; +}; + +#define NETDATA_MAIN_THREAD_RUNNING CONFIG_BOOLEAN_YES +#define NETDATA_MAIN_THREAD_EXITING (CONFIG_BOOLEAN_YES + 1) +#define NETDATA_MAIN_THREAD_EXITED CONFIG_BOOLEAN_NO + +#define NETDATA_THREAD_TAG_MAX 100 +const char *nd_thread_tag(void); +int nd_thread_has_tag(void); + +#define THREAD_TAG_STREAM_RECEIVER "RCVR" +#define THREAD_TAG_STREAM_SENDER "SNDR" + +size_t netdata_threads_init(void); +void netdata_threads_init_after_fork(size_t stacksize); +void netdata_threads_init_for_external_plugins(size_t stacksize); + +ND_THREAD *nd_thread_create(const char *tag, NETDATA_THREAD_OPTIONS options, void *(*start_routine) (void *), void *arg); +void nd_thread_join(ND_THREAD * nti); +ND_THREAD *nd_thread_self(void); +bool nd_thread_is_me(ND_THREAD *nti); + +typedef void (*nd_thread_canceller)(void *data); +void nd_thread_register_canceller(nd_thread_canceller cb, void *data); +void nd_thread_signal_cancel(ND_THREAD *nti); +bool nd_thread_signaled_to_cancel(void); + +#define ND_THREAD_TAG_MAX 15 +void uv_thread_set_name_np(const char* name); +void webrtc_set_thread_name(void); + +#ifdef NETDATA_INTERNAL_CHECKS +void nd_thread_rwlock_read_locked(void); +void nd_thread_rwlock_read_unlocked(void); +void nd_thread_rwlock_write_locked(void); +void nd_thread_rwlock_write_unlocked(void); +void nd_thread_mutex_locked(void); +void nd_thread_mutex_unlocked(void); +void nd_thread_spinlock_locked(void); +void nd_thread_spinlock_unlocked(void); +void nd_thread_rwspinlock_read_locked(void); +void nd_thread_rwspinlock_read_unlocked(void); +void nd_thread_rwspinlock_write_locked(void); +void nd_thread_rwspinlock_write_unlocked(void); +#else +#define nd_thread_rwlock_read_locked() debug_dummy() +#define nd_thread_rwlock_read_unlocked() debug_dummy() +#define nd_thread_rwlock_write_locked() debug_dummy() +#define nd_thread_rwlock_write_unlocked() debug_dummy() +#define nd_thread_mutex_locked() debug_dummy() +#define nd_thread_mutex_unlocked() debug_dummy() +#define nd_thread_spinlock_locked() debug_dummy() +#define nd_thread_spinlock_unlocked() debug_dummy() +#define nd_thread_rwspinlock_read_locked() debug_dummy() +#define nd_thread_rwspinlock_read_unlocked() debug_dummy() +#define nd_thread_rwspinlock_write_locked() debug_dummy() +#define nd_thread_rwspinlock_write_unlocked() debug_dummy() +#endif + +#endif //NETDATA_THREADS_H diff --git a/src/libnetdata/url/README.md b/src/libnetdata/url/README.md new file mode 100644 index 00000000..01a2dddb --- /dev/null +++ b/src/libnetdata/url/README.md @@ -0,0 +1,14 @@ +<!-- +title: "URL" +custom_edit_url: https://github.com/netdata/netdata/edit/master/src/libnetdata/url/README.md +sidebar_label: "URL" +learn_status: "Published" +learn_topic_type: "Tasks" +learn_rel_path: "Developers/libnetdata" +--> + +# URL + +The URL library contains common functions useful for URLs, like conversion from/to hex, +URL encode/decode and query string parsing. 
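+
+As a quick illustration, the round trip below uses the encode/decode helpers of this
+library; the include path is an assumption and error handling is omitted:
+
+```c
+#include <stdio.h>
+#include "libnetdata/libnetdata.h"
+
+void url_example(void) {
+    char *encoded = url_encode("hello world & more"); // the caller owns the returned buffer
+    // encoded is now "hello+world+%26+more"
+
+    char decoded[256];
+    if (url_decode_r(decoded, encoded, sizeof(decoded)) != NULL)
+        printf("%s\n", decoded); // prints "hello world & more"
+
+    freez(encoded);
+}
+```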
+ diff --git a/src/libnetdata/url/url.c b/src/libnetdata/url/url.c new file mode 100644 index 00000000..720a703d --- /dev/null +++ b/src/libnetdata/url/url.c @@ -0,0 +1,313 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +// ---------------------------------------------------------------------------- +// URL encode / decode +// code from: http://www.geekhideout.com/urlcode.shtml + +/* Converts a hex character to its integer value */ +char from_hex(char ch) { + return (char)(isdigit(ch) ? ch - '0' : tolower(ch) - 'a' + 10); +} + +/* Converts an integer value to its hex character*/ +char to_hex(char code) { + static char hex[] = "0123456789abcdef"; + return hex[code & 15]; +} + +/* Returns an url-encoded version of str */ +/* IMPORTANT: be sure to free() the returned string after use */ +char *url_encode(char *str) { + char *buf, *pbuf; + + pbuf = buf = mallocz(strlen(str) * 3 + 1); + + while (*str) { + if (isalnum((uint8_t)*str) || *str == '-' || *str == '_' || *str == '.' || *str == '~') + *pbuf++ = *str; + + else if (*str == ' ') + *pbuf++ = '+'; + + else{ + *pbuf++ = '%'; + *pbuf++ = to_hex((char)(*str >> 4)); + *pbuf++ = to_hex((char)(*str & 15)); + } + + str++; + } + *pbuf = '\0'; + + pbuf = strdupz(buf); + freez(buf); + return pbuf; +} + +/** + * Percentage escape decode + * + * Decode %XX character or return 0 if cannot + * + * @param s the string to decode + * + * @return The character decoded on success and 0 otherwise + */ +char url_percent_escape_decode(const char *s) { + if(likely(s[1] && s[2])) + return (char)(from_hex(s[1]) << 4 | from_hex(s[2])); + return 0; +} + +/** + * Get byte length + * + * This (utf8 string related) should be moved in separate file in future + * + * @param c is the utf8 character + * * + * @return It returns the length of the specific character. + */ +char url_utf8_get_byte_length(char c) { + if(!IS_UTF8_BYTE(c)) + return 1; + + char length = 0; + while(likely(c & 0x80)) { + length++; + c <<= 1; + } + //4 byte is max size for UTF-8 char + //10XX XXXX is not valid character -> check length == 1 + if(length > 4 || length == 1) + return -1; + + return length; +} + +/** + * Decode Multibyte UTF8 + * + * Decode % encoded UTF-8 characters and copy them to *d + * + * @param s first address + * @param d + * @param d_end last address + * + * @return count of bytes written to *d + */ +char url_decode_multibyte_utf8(const char *s, char *d, const char *d_end) { + char first_byte = url_percent_escape_decode(s); + + if(unlikely(!first_byte || !IS_UTF8_STARTBYTE(first_byte))) + return 0; + + char byte_length = url_utf8_get_byte_length(first_byte); + + if(unlikely(byte_length <= 0 || d+byte_length >= d_end)) + return 0; + + char to_read = byte_length; + while(to_read > 0) { + char c = url_percent_escape_decode(s); + + if(unlikely( !IS_UTF8_BYTE(c) )) + return 0; + if((to_read != byte_length) && IS_UTF8_STARTBYTE(c)) + return 0; + + *d++ = c; + s+=3; + to_read--; + } + + return byte_length; +} + +/* + * The utf8_check() function scans the '\0'-terminated string starting + * at s. It returns a pointer to the first byte of the first malformed + * or overlong UTF-8 sequence found, or NULL if the string contains + * only correct UTF-8. It also spots UTF-8 sequences that could cause + * trouble if converted to UTF-16, namely surrogate characters + * (U+D800..U+DFFF) and non-Unicode positions (U+FFFE..U+FFFF). This + * routine is very likely to find a malformed sequence if the input + * uses any other encoding than UTF-8. 
It therefore can be used as a + very effective heuristic for distinguishing between UTF-8 and other + encodings. + + Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> -- 2005-03-30 + License: http://www.cl.cam.ac.uk/~mgk25/short-license.html + */ +unsigned char *utf8_check(unsigned char *s) +{ + while (*s) + { + if (*s < 0x80) + /* 0xxxxxxx */ + s++; + else if ((s[0] & 0xe0) == 0xc0) + { + /* 110XXXXx 10xxxxxx */ + if ((s[1] & 0xc0) != 0x80 || + (s[0] & 0xfe) == 0xc0) /* overlong? */ + return s; + else + s += 2; + } + else if ((s[0] & 0xf0) == 0xe0) + { + /* 1110XXXX 10Xxxxxx 10xxxxxx */ + if ((s[1] & 0xc0) != 0x80 || + (s[2] & 0xc0) != 0x80 || + (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) || /* overlong? */ + (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) || /* surrogate? */ + (s[0] == 0xef && s[1] == 0xbf && + (s[2] & 0xfe) == 0xbe)) /* U+FFFE or U+FFFF? */ + return s; + else + s += 3; + } + else if ((s[0] & 0xf8) == 0xf0) + { + /* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */ + if ((s[1] & 0xc0) != 0x80 || + (s[2] & 0xc0) != 0x80 || + (s[3] & 0xc0) != 0x80 || + (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) || /* overlong? */ + (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) /* > U+10FFFF? */ + return s; + else + s += 4; + } + else + return s; + } + + return NULL; +} + +char *url_decode_r(char *to, const char *url, size_t size) { + const char *s = url; // source + char *d = to, // destination + *e = &to[size - 1]; // destination end + + while(*s && d < e) { + if(unlikely(*s == '%')) { + char t = url_percent_escape_decode(s); + if(IS_UTF8_BYTE(t)) { + char bytes_written = url_decode_multibyte_utf8(s, d, e); + if(likely(bytes_written)){ + d += bytes_written; + s += (bytes_written * 3)-1; + } + else { + goto fail_cleanup; + } + } + else if(likely(t) && isprint(t)) { + // avoid HTTP header injection + *d++ = t; + s += 2; + } + else + goto fail_cleanup; + } + else if(unlikely(*s == '+')) + *d++ = ' '; + + else + *d++ = *s; + + s++; + } + + *d = '\0'; + + if(unlikely( utf8_check((unsigned char *)to) )) //NULL means success here + return NULL; + + return to; + +fail_cleanup: + *d = '\0'; + return NULL; +} + +inline bool +url_is_request_complete_and_extract_payload(const char *begin, const char *end, size_t length, BUFFER **post_payload) { + if (begin == end || length < 4) + return false; + + if(likely(strncmp(begin, "GET ", 4) == 0)) { + return strstr(end - 4, "\r\n\r\n"); + } + else if(unlikely(strncmp(begin, "POST ", 5) == 0 || strncmp(begin, "PUT ", 4) == 0)) { + const char *cl = strcasestr(begin, "Content-Length: "); + if(!cl) return false; + cl = &cl[16]; + + size_t content_length = str2ul(cl); + + const char *payload = strstr(cl, "\r\n\r\n"); + if(!payload) return false; + payload += 4; + + size_t payload_length = length - (payload - begin); + + if(payload_length == content_length) { + if(!*post_payload) + *post_payload = buffer_create(payload_length + 1, NULL); + + buffer_contents_replace(*post_payload, payload, payload_length); + + // parse the content type + const char *ct = strcasestr(begin, "Content-Type: "); + if(ct) { + ct = &ct[14]; + while (*ct && isspace((uint8_t)*ct)) ct++; + const char *space = ct; + while (*space && !isspace((uint8_t)*space) && *space != ';') space++; + size_t ct_len = space - ct; + + char ct_copy[ct_len + 1]; + memcpy(ct_copy, ct, ct_len); + ct_copy[ct_len] = '\0'; + + (*post_payload)->content_type = content_type_string2id(ct_copy); + } + else + (*post_payload)->content_type = CT_TEXT_PLAIN; + + return true; + } + + return false; + } + else { + return strstr(end - 4,
"\r\n\r\n"); + } +} + +/** + * Find protocol + * + * Search for the string ' HTTP/' in the message given. + * + * @param s is the start of the user request. + * @return + */ +inline char *url_find_protocol(char *s) { + while(*s) { + // find the next space + while (*s && *s != ' ') s++; + + // is it SPACE + "HTTP/" ? + if(*s && !strncmp(s, " HTTP/", 6)) break; + else s++; + } + + return s; +} diff --git a/src/libnetdata/url/url.h b/src/libnetdata/url/url.h new file mode 100644 index 00000000..f7a67dd5 --- /dev/null +++ b/src/libnetdata/url/url.h @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_URL_H +#define NETDATA_URL_H 1 + +#include "../libnetdata.h" + +// ---------------------------------------------------------------------------- +// URL encode / decode +// code from: http://www.geekhideout.com/urlcode.shtml + +/* Converts a hex character to its integer value */ +char from_hex(char ch); + +/* Converts an integer value to its hex character*/ +char to_hex(char code); + +/* Returns a url-encoded version of str */ +/* IMPORTANT: be sure to free() the returned string after use */ +char *url_encode(char *str); + +/* Returns a url-decoded version of str */ +/* IMPORTANT: be sure to free() the returned string after use */ +char *url_decode(char *str); + +char *url_decode_r(char *to, const char *url, size_t size); + +bool url_is_request_complete_and_extract_payload(const char *begin, const char *end, size_t length, BUFFER **post_payload); +char *url_find_protocol(char *s); + +#endif /* NETDATA_URL_H */ diff --git a/src/libnetdata/uuid/README.md b/src/libnetdata/uuid/README.md new file mode 100644 index 00000000..a0da380a --- /dev/null +++ b/src/libnetdata/uuid/README.md @@ -0,0 +1,13 @@ +<!-- +title: "UUID" +custom_edit_url: https://github.com/netdata/netdata/edit/master/src/libnetdata/uuid/README.md +sidebar_label: "UUID" +learn_topic_type: "Tasks" +learn_rel_path: "Developers/libnetdata" +--> + +# UUID + +Netdata uses libuuid for managing UUIDs. + +In this folder are a few custom helpers.
\ No newline at end of file diff --git a/src/libnetdata/uuid/uuid.c b/src/libnetdata/uuid/uuid.c new file mode 100644 index 00000000..6b05229f --- /dev/null +++ b/src/libnetdata/uuid/uuid.c @@ -0,0 +1,220 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../libnetdata.h" + +ND_UUID UUID_generate_from_hash(const void *payload, size_t payload_len) { + assert(sizeof(XXH128_hash_t) == sizeof(ND_UUID)); + + ND_UUID uuid = UUID_ZERO; + XXH128_hash_t *xxh3_128 = (XXH128_hash_t *)&uuid; + + // Hash the payload using XXH128 + // Assume xxh128_hash_function is your function to generate XXH128 hash + *xxh3_128 = XXH3_128bits(payload, payload_len); + + // Set the UUID version (here, setting it to 4) + uuid.uuid[6] = (uuid.uuid[6] & 0x0F) | 0x40; // Version 4 + + // Set the UUID variant (standard variant for UUID) + uuid.uuid[8] = (uuid.uuid[8] & 0x3F) | 0x80; // Variant is 10xxxxxx + + return uuid; +} + +void uuid_unparse_lower_compact(const nd_uuid_t uuid, char *out) { + static const char *hex_chars = "0123456789abcdef"; + for (int i = 0; i < 16; i++) { + out[i * 2] = hex_chars[(uuid[i] >> 4) & 0x0F]; + out[i * 2 + 1] = hex_chars[uuid[i] & 0x0F]; + } + out[32] = '\0'; // Null-terminate the string +} + +static inline void nd_uuid_unparse_full(const nd_uuid_t uuid, char *out, const char *hex_chars) { + int shifts = 0; + for (int i = 0; i < 16; i++) { + if (i == 4 || i == 6 || i == 8 || i == 10) { + out[i * 2 + shifts] = '-'; + shifts++; + } + out[i * 2 + shifts] = hex_chars[(uuid[i] >> 4) & 0x0F]; + out[i * 2 + 1 + shifts] = hex_chars[uuid[i] & 0x0F]; + } + out[36] = '\0'; // Null-terminate the string +} + +// Wrapper functions for lower and upper case hexadecimal representation +void nd_uuid_unparse_lower(const nd_uuid_t uuid, char *out) { + nd_uuid_unparse_full(uuid, out, "0123456789abcdef"); +} + +void nd_uuid_unparse_upper(const nd_uuid_t uuid, char *out) { + nd_uuid_unparse_full(uuid, out, "0123456789ABCDEF"); +} + +inline int uuid_parse_compact(const char *in, nd_uuid_t uuid) { + if (strlen(in) != 32) + return -1; // Invalid input length + + for (int i = 0; i < 16; i++) { + int high = hex_char_to_int(in[i * 2]); + int low = hex_char_to_int(in[i * 2 + 1]); + + if (high < 0 || low < 0) + return -1; // Invalid hexadecimal character + + uuid[i] = (high << 4) | low; + } + + return 0; // Success +} + +int uuid_parse_flexi(const char *in, nd_uuid_t uu) { + if(!in || !*in) + return -1; + + size_t hexCharCount = 0; + size_t hyphenCount = 0; + const char *s = in; + int byteIndex = 0; + nd_uuid_t uuid; // work on a temporary place, to not corrupt the previous value of uu if we fail + + while (*s && byteIndex < 16) { + if (*s == '-') { + s++; + hyphenCount++; + + if (unlikely(hyphenCount > 4)) + // Too many hyphens + return -2; + } + + if (likely(isxdigit((uint8_t)*s))) { + int high = hex_char_to_int(*s++); + hexCharCount++; + + if (likely(isxdigit((uint8_t)*s))) { + int low = hex_char_to_int(*s++); + hexCharCount++; + + uuid[byteIndex++] = (high << 4) | low; + } + else + // Not a valid UUID (expected a pair of hex digits) + return -3; + } + else + // Not a valid UUID + return -4; + } + + if (unlikely(byteIndex < 16)) + // Not enough data to form a UUID + return -5; + + if (unlikely(hexCharCount != 32)) + // wrong number of hex digits + return -6; + + if(unlikely(hyphenCount != 0 && hyphenCount != 4)) + // wrong number of hyphens + return -7; + + // copy the final value + memcpy(uu, uuid, sizeof(nd_uuid_t)); + + return 0; +} + + +// 
---------------------------------------------------------------------------- +// unit test + +static inline void remove_hyphens(const char *uuid_with_hyphens, char *uuid_without_hyphens) { + while (*uuid_with_hyphens) { + if (*uuid_with_hyphens != '-') { + *uuid_without_hyphens++ = *uuid_with_hyphens; + } + uuid_with_hyphens++; + } + *uuid_without_hyphens = '\0'; +} + +int uuid_unittest(void) { + const int num_tests = 100000; + int failed_tests = 0; + + int i; + for (i = 0; i < num_tests; i++) { + nd_uuid_t original_uuid, parsed_uuid; + char uuid_str_with_hyphens[UUID_STR_LEN], uuid_str_without_hyphens[UUID_COMPACT_STR_LEN]; + + // Generate a random UUID + switch(i % 2) { + case 0: + uuid_generate(original_uuid); + break; + + case 1: + uuid_generate_random(original_uuid); + break; + } + + // Unparse it with hyphens + bool lower = false; + switch(i % 3) { + case 0: + uuid_unparse_lower(original_uuid, uuid_str_with_hyphens); + lower = true; + break; + + case 1: + uuid_unparse(original_uuid, uuid_str_with_hyphens); + break; + + case 2: + uuid_unparse_upper(original_uuid, uuid_str_with_hyphens); + break; + } + + // Remove the hyphens + remove_hyphens(uuid_str_with_hyphens, uuid_str_without_hyphens); + + if(lower) { + char test[UUID_COMPACT_STR_LEN]; + uuid_unparse_lower_compact(original_uuid, test); + if(strcmp(test, uuid_str_without_hyphens) != 0) { + printf("uuid_unparse_lower_compact() failed, expected '%s', got '%s'\n", + uuid_str_without_hyphens, test); + failed_tests++; + } + } + + // Parse the UUID string with hyphens + int parse_result = uuid_parse_flexi(uuid_str_with_hyphens, parsed_uuid); + if (parse_result != 0) { + printf("uuid_parse_flexi() returned -1 (parsing error) for UUID with hyphens: %s\n", uuid_str_with_hyphens); + failed_tests++; + } else if (uuid_compare(original_uuid, parsed_uuid) != 0) { + printf("uuid_parse_flexi() parsed value mismatch for UUID with hyphens: %s\n", uuid_str_with_hyphens); + failed_tests++; + } + + // Parse the UUID string without hyphens + parse_result = uuid_parse_flexi(uuid_str_without_hyphens, parsed_uuid); + if (parse_result != 0) { + printf("uuid_parse_flexi() returned -1 (parsing error) for UUID without hyphens: %s\n", uuid_str_without_hyphens); + failed_tests++; + } + else if(uuid_compare(original_uuid, parsed_uuid) != 0) { + printf("uuid_parse_flexi() parsed value mismatch for UUID without hyphens: %s\n", uuid_str_without_hyphens); + failed_tests++; + } + + if(failed_tests) + break; + } + + printf("UUID: failed %d out of %d tests.\n", failed_tests, i); + return failed_tests; +} diff --git a/src/libnetdata/uuid/uuid.h b/src/libnetdata/uuid/uuid.h new file mode 100644 index 00000000..cde45761 --- /dev/null +++ b/src/libnetdata/uuid/uuid.h @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_UUID_H +#define NETDATA_UUID_H + +// for compatibility with libuuid +typedef unsigned char nd_uuid_t[16]; + +// for quickly managing it as 2x 64-bit numbers +typedef struct _uuid { + union { + nd_uuid_t uuid; + struct { + uint64_t hig64; + uint64_t low64; + } parts; + }; +} ND_UUID; + +#ifdef __GNUC__ +#define ND_UUID_DEFINE(name,u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15) \ + static const nd_uuid_t name __attribute__ ((unused)) = {u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15} +#else +#define ND_UUID_DEFINE(name,u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15) \ + static const nd_uuid_t name = {u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15} +#endif + +static const ND_UUID UUID_ZERO = (ND_UUID){ 
{ .parts = { .hig64 = 0, .low64 = 0 } }};
+ND_UUID_DEFINE(streaming_from_child_msgid, 0xed, 0x4c, 0xdb, 0x8f, 0x1b, 0xeb, 0x4a, 0xd3, 0xb5, 0x7c, 0xb3, 0xca, 0xe2, 0xd1, 0x62, 0xfa);
+ND_UUID_DEFINE(streaming_to_parent_msgid, 0x6e, 0x2e, 0x38, 0x39, 0x06, 0x76, 0x48, 0x96, 0x8b, 0x64, 0x60, 0x45, 0xdb, 0xf2, 0x8d, 0x66);
+ND_UUID_DEFINE(health_alert_transition_msgid, 0x9c, 0xe0, 0xcb, 0x58, 0xab, 0x8b, 0x44, 0xdf, 0x82, 0xc4, 0xbf, 0x1a, 0xd9, 0xee, 0x22, 0xde);
+
+// this is also defined in alarm-notify.sh.in
+ND_UUID_DEFINE(health_alert_notification_msgid, 0x6d, 0xb0, 0x01, 0x8e, 0x83, 0xe3, 0x43, 0x20, 0xae, 0x2a, 0x65, 0x9d, 0x78, 0x01, 0x9f, 0xb7);
+
+ND_UUID UUID_generate_from_hash(const void *payload, size_t payload_len);
+
+#define UUIDeq(a, b) ((a).parts.hig64 == (b).parts.hig64 && (a).parts.low64 == (b).parts.low64)
+
+static inline ND_UUID uuid2UUID(const nd_uuid_t uu1) {
+    // uu1 may not be aligned, so copy it to the output
+    ND_UUID copy;
+    memcpy(copy.uuid, uu1, sizeof(nd_uuid_t));
+    return copy;
+}
+
+#ifndef UUID_STR_LEN
+// CentOS 7 ships an older libuuid that doesn't define this;
+// the same goes for macOS
+#define UUID_STR_LEN 37
+#endif
+
+#define UUID_COMPACT_STR_LEN 33
+
+void uuid_unparse_lower_compact(const nd_uuid_t uuid, char *out);
+int uuid_parse_compact(const char *in, nd_uuid_t uuid);
+
+int uuid_parse_flexi(const char *in, nd_uuid_t uuid);
+#define uuid_parse(in, uuid) uuid_parse_flexi(in, uuid)
+
+static inline int hex_char_to_int(char c) {
+    if (c >= '0' && c <= '9') return c - '0';
+    if (c >= 'a' && c <= 'f') return c - 'a' + 10;
+    if (c >= 'A' && c <= 'F') return c - 'A' + 10;
+    return -1; // Invalid hexadecimal character
+}
+
+static inline void nd_uuid_clear(nd_uuid_t uu) {
+    memset(uu, 0, sizeof(nd_uuid_t));
+}
+
+// Netdata does not need to sort UUIDs lexicographically, and this kind
+// of sorting does not need to be portable between little/big endian.
+// So, any kind of sorting will work, as long as it compares UUIDs;
+// the fastest option is good enough.
+static inline int nd_uuid_compare(const nd_uuid_t uu1, const nd_uuid_t uu2) {
+    // IMPORTANT:
+    // uu1 or uu2 may not be aligned to word boundaries on this call,
+    // so casting them to a struct may give SIGBUS on some architectures.
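+    //
+    // memcmp() operates on individual bytes, so it has no alignment
+    // requirements and returns the same result on every platform.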
+    return memcmp(uu1, uu2, sizeof(nd_uuid_t));
+}
+
+static inline void nd_uuid_copy(nd_uuid_t dst, const nd_uuid_t src) {
+    memcpy(dst, src, sizeof(nd_uuid_t));
+}
+
+static inline bool nd_uuid_eq(const nd_uuid_t uu1, const nd_uuid_t uu2) {
+    return nd_uuid_compare(uu1, uu2) == 0;
+}
+
+static inline int nd_uuid_is_null(const nd_uuid_t uu) {
+    return nd_uuid_compare(uu, UUID_ZERO.uuid) == 0;
+}
+
+void nd_uuid_unparse_lower(const nd_uuid_t uuid, char *out);
+void nd_uuid_unparse_upper(const nd_uuid_t uuid, char *out);
+
+#define uuid_is_null(uu) nd_uuid_is_null(uu)
+#define uuid_clear(uu) nd_uuid_clear(uu)
+#define uuid_compare(uu1, uu2) nd_uuid_compare(uu1, uu2)
+#define uuid_copy(dst, src) nd_uuid_copy(dst, src)
+#define uuid_eq(uu1, uu2) nd_uuid_eq(uu1, uu2)
+
+#define uuid_generate(out) os_uuid_generate(out)
+#define uuid_generate_random(out) os_uuid_generate_random(out)
+#define uuid_generate_time(out) os_uuid_generate_time(out)
+
+#define uuid_unparse(uu, out) nd_uuid_unparse_lower(uu, out)
+#define uuid_unparse_lower(uu, out) nd_uuid_unparse_lower(uu, out)
+#define uuid_unparse_upper(uu, out) nd_uuid_unparse_upper(uu, out)
+
+#endif //NETDATA_UUID_H
diff --git a/src/libnetdata/worker_utilization/README.md b/src/libnetdata/worker_utilization/README.md
new file mode 100644
index 00000000..1a354376
--- /dev/null
+++ b/src/libnetdata/worker_utilization/README.md
@@ -0,0 +1,94 @@
+<!--
+title: "Worker Utilization"
+custom_edit_url: https://github.com/netdata/netdata/edit/master/src/libnetdata/worker_utilization/README.md
+sidebar_label: "Worker Utilization"
+learn_status: "Published"
+learn_topic_type: "References"
+learn_rel_path: "Developers/libnetdata"
+-->
+
+# Worker Utilization
+
+Use this library when one or more worker threads accept and service requests
+of some kind. It provides a very simple way to monitor worker thread
+utilization: the percentage of time each worker is busy and the number of
+requests it has served.
+
+## Design goals
+
+1. Minimal, if any, impact on the performance of the workers
+2. Easy to integrate into any kind of worker
+3. No state of any kind at the worker side
+
+## How to use
+
+When a worker thread starts, call:
+
+```c
+void worker_register(const char *name);
+```
+
+This creates the structures the library needs. There is no need to keep a
+pointer to them; they are allocated as `__thread` variables.
+
+Then, define the job types. A job type is anything a worker does that should
+be counted and whose execution time should be reported. The library is fast
+enough to be integrated even into workers that perform hundreds of thousands
+of actions per second.
+
+Job types are defined like this:
+
+```c
+void worker_register_job_name(size_t job_id, const char *name);
+```
+
+`job_id` is a number starting from zero. The library is compiled with a fixed
+size of 50 ids (0 to 49). More can be made available by increasing
+`WORKER_UTILIZATION_MAX_JOB_TYPES` in `worker_utilization.h`. `name` can be
+any string up to 22 characters. This limit can be changed by setting
+`WORKER_UTILIZATION_MAX_JOB_NAME_LENGTH` in `worker_utilization.h`.
+
+Each thread that calls `worker_register(name)` allocates about 3kB for
+maintaining the required information.
+
+When the thread stops, call:
+
+```c
+void worker_unregister(void);
+```
+
+Again, no parameters or return values.
+
+> IMPORTANT: cancellable threads need to call `worker_unregister()` from the
+> `pop` function that cleans up the thread. Failing to do so leaks about 3kB
+> of memory for every thread that stops.
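+
+Putting the lifecycle together, here is a minimal sketch (the job id, the
+names and the loop condition are illustrative; `worker_is_busy()` and
+`worker_is_idle()` are described below):
+
+```c
+static bool stop_requested = false; // illustrative shutdown flag
+
+void *my_worker_thread(void *arg) {
+    (void)arg;
+
+    worker_register("DEMO");
+    worker_register_job_name(0, "requests");
+
+    while(!stop_requested) {
+        worker_is_busy(0);  // about to service a request (job type 0)
+        // ... do the actual work here ...
+        worker_is_idle();   // finished, now waiting
+    }
+
+    worker_unregister();
+    return NULL;
+}
+```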
+
+When the worker is about to do some work, call:
+
+```c
+void worker_is_busy(size_t id);
+```
+
+When it finishes the job, call:
+
+```c
+void worker_is_idle(void);
+```
+
+Consecutive calls to `worker_is_busy(id)` (without `worker_is_idle()` between
+them) switch jobs without losing any time between them, and they eliminate one
+of the two clock calls involved.
+
+## Implementation details
+
+The worker-side path is totally lockless and extremely fast, so it should not
+introduce any problems to the workers. Every call to `worker_is_busy(id)` or
+`worker_is_idle()` performs one call to `now_monotonic_usec()` and updates a
+couple of variables. That's it!
+
+The worker does not need to update these variables regularly. Based on the
+last status of the worker, the statistics collector of Netdata calculates
+whether the thread was busy or idle all of the time or only part of the time.
+This works equally well for thousands of jobs per second and for unbounded
+working time (being totally busy with a single request for ages).
+
+The statistics collector is called by the global statistics thread of Netdata,
+so even when the workers are extremely busy with their jobs, Netdata can still
+report how busy they are.
diff --git a/src/libnetdata/worker_utilization/worker_utilization.c b/src/libnetdata/worker_utilization/worker_utilization.c
new file mode 100644
index 00000000..4c61ea92
--- /dev/null
+++ b/src/libnetdata/worker_utilization/worker_utilization.c
@@ -0,0 +1,398 @@
+#include "worker_utilization.h"
+
+#define WORKER_IDLE 'I'
+#define WORKER_BUSY 'B'
+
+struct worker_job_type {
+    STRING *name;
+    STRING *units;
+
+    // statistics controlled variables
+    size_t statistics_last_jobs_started;
+    usec_t statistics_last_busy_time;
+    NETDATA_DOUBLE statistics_last_custom_value;
+
+    // worker controlled variables
+    volatile size_t worker_jobs_started;
+    volatile usec_t worker_busy_time;
+
+    WORKER_METRIC_TYPE type;
+    NETDATA_DOUBLE custom_value;
+};
+
+struct worker {
+    pid_t pid;
+    const char *tag;
+    const char *workname;
+
+    // statistics controlled variables
+    volatile usec_t statistics_last_checkpoint;
+    size_t statistics_last_jobs_started;
+    usec_t statistics_last_busy_time;
+
+    // the worker controlled variables
+    size_t worker_max_job_id;
+    volatile size_t job_id;
+    volatile size_t jobs_started;
+    volatile usec_t busy_time;
+    volatile usec_t last_action_timestamp;
+    volatile char last_action;
+
+    struct worker_job_type per_job_type[WORKER_UTILIZATION_MAX_JOB_TYPES];
+
+    struct worker *next;
+    struct worker *prev;
+};
+
+struct workers_workname {                           // this is what we add to JudyHS
+    SPINLOCK spinlock;
+    struct worker *base;
+};
+
+static struct workers_globals {
+    bool enabled;
+
+    SPINLOCK spinlock;
+    Pvoid_t worknames_JudyHS;
+    size_t memory;
+
+} workers_globals = {                               // workers globals, the base of all worknames
+    .enabled = false,
+    .spinlock = NETDATA_SPINLOCK_INITIALIZER,       // a lock for the worknames index
+    .worknames_JudyHS = NULL,                       // the worknames index
+};
+
+static __thread struct worker *worker = NULL;       // the current thread worker
+
+static inline usec_t worker_now_monotonic_usec(void) {
+#ifdef NETDATA_WITHOUT_WORKERS_LATENCY
+    return 0;
+#else
+    return now_monotonic_usec();
+#endif
+}
+
+void workers_utilization_enable(void) {
+    workers_globals.enabled = true;
+}
+
+size_t workers_allocated_memory(void) {
+    if(!workers_globals.enabled)
+        return 0;
+
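+    // workers_globals.memory is updated by worker_register() and
+    // worker_unregister() under this same spinlock, so take it to
+    // read a consistent value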
spinlock_lock(&workers_globals.spinlock); + size_t memory = workers_globals.memory; + spinlock_unlock(&workers_globals.spinlock); + + return memory; +} + +void worker_register(const char *name) { + if(unlikely(worker || !workers_globals.enabled)) + return; + + worker = callocz(1, sizeof(struct worker)); + worker->pid = gettid_cached(); + worker->tag = strdupz(nd_thread_tag()); + worker->workname = strdupz(name); + + usec_t now = worker_now_monotonic_usec(); + worker->statistics_last_checkpoint = now; + worker->last_action_timestamp = now; + worker->last_action = WORKER_IDLE; + + size_t name_size = strlen(name) + 1; + spinlock_lock(&workers_globals.spinlock); + + workers_globals.memory += sizeof(struct worker) + strlen(worker->tag) + 1 + strlen(worker->workname) + 1; + + Pvoid_t *PValue = JudyHSIns(&workers_globals.worknames_JudyHS, (void *)name, name_size, PJE0); + + struct workers_workname *workname = *PValue; + if(!workname) { + workname = mallocz(sizeof(struct workers_workname)); + spinlock_init(&workname->spinlock); + workname->base = NULL; + *PValue = workname; + + workers_globals.memory += sizeof(struct workers_workname) + JUDYHS_INDEX_SIZE_ESTIMATE(name_size); + } + + spinlock_lock(&workname->spinlock); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(workname->base, worker, prev, next); + spinlock_unlock(&workname->spinlock); + + spinlock_unlock(&workers_globals.spinlock); +} + +void worker_register_job_custom_metric(size_t job_id, const char *name, const char *units, WORKER_METRIC_TYPE type) { + if(unlikely(!worker)) return; + + if(unlikely(job_id >= WORKER_UTILIZATION_MAX_JOB_TYPES)) { + netdata_log_error("WORKER_UTILIZATION: job_id %zu is too big. Max is %zu", job_id, (size_t)(WORKER_UTILIZATION_MAX_JOB_TYPES - 1)); + return; + } + + if(job_id > worker->worker_max_job_id) + worker->worker_max_job_id = job_id; + + if(worker->per_job_type[job_id].name) { + if(strcmp(string2str(worker->per_job_type[job_id].name), name) != 0 || worker->per_job_type[job_id].type != type || strcmp(string2str(worker->per_job_type[job_id].units), units) != 0) + netdata_log_error("WORKER_UTILIZATION: duplicate job registration: worker '%s' job id %zu is '%s', ignoring the later '%s'", worker->workname, job_id, string2str(worker->per_job_type[job_id].name), name); + return; + } + + worker->per_job_type[job_id].name = string_strdupz(name); + worker->per_job_type[job_id].units = string_strdupz(units); + worker->per_job_type[job_id].type = type; +} + +void worker_register_job_name(size_t job_id, const char *name) { + worker_register_job_custom_metric(job_id, name, "", WORKER_METRIC_IDLE_BUSY); +} + +void worker_unregister(void) { + if(unlikely(!worker)) return; + + size_t workname_size = strlen(worker->workname) + 1; + spinlock_lock(&workers_globals.spinlock); + Pvoid_t *PValue = JudyHSGet(workers_globals.worknames_JudyHS, (void *)worker->workname, workname_size); + if(PValue) { + struct workers_workname *workname = *PValue; + spinlock_lock(&workname->spinlock); + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(workname->base, worker, prev, next); + spinlock_unlock(&workname->spinlock); + + if(!workname->base) { + JudyHSDel(&workers_globals.worknames_JudyHS, (void *) worker->workname, workname_size, PJE0); + freez(workname); + workers_globals.memory -= sizeof(struct workers_workname) + JUDYHS_INDEX_SIZE_ESTIMATE(workname_size); + } + } + workers_globals.memory -= sizeof(struct worker) + strlen(worker->tag) + 1 + strlen(worker->workname) + 1; + spinlock_unlock(&workers_globals.spinlock); + + for(int i = 0; i < 
WORKER_UTILIZATION_MAX_JOB_TYPES ;i++) { + string_freez(worker->per_job_type[i].name); + string_freez(worker->per_job_type[i].units); + } + + freez((void *)worker->tag); + freez((void *)worker->workname); + freez(worker); + + worker = NULL; +} + +static inline void worker_is_idle_with_time(usec_t now) { + usec_t delta = now - worker->last_action_timestamp; + worker->busy_time += delta; + worker->per_job_type[worker->job_id].worker_busy_time += delta; + + // the worker was busy + // set it to idle before we set the timestamp + + worker->last_action = WORKER_IDLE; + if(likely(worker->last_action_timestamp < now)) + worker->last_action_timestamp = now; +} + +void worker_is_idle(void) { + if(unlikely(!worker || worker->last_action != WORKER_BUSY)) return; + + worker_is_idle_with_time(worker_now_monotonic_usec()); +} + +void worker_is_busy(size_t job_id) { + if(unlikely(!worker || job_id >= WORKER_UTILIZATION_MAX_JOB_TYPES)) + return; + + usec_t now = worker_now_monotonic_usec(); + + if(worker->last_action == WORKER_BUSY) + worker_is_idle_with_time(now); + + // the worker was idle + // set the timestamp and then set it to busy + + worker->job_id = job_id; + worker->per_job_type[job_id].worker_jobs_started++; + worker->jobs_started++; + worker->last_action_timestamp = now; + worker->last_action = WORKER_BUSY; +} + +void worker_set_metric(size_t job_id, NETDATA_DOUBLE value) { + if(unlikely(!worker)) return; + + if(unlikely(job_id >= WORKER_UTILIZATION_MAX_JOB_TYPES)) + return; + + switch(worker->per_job_type[job_id].type) { + case WORKER_METRIC_INCREMENT: + worker->per_job_type[job_id].custom_value += value; + break; + + case WORKER_METRIC_INCREMENTAL_TOTAL: + case WORKER_METRIC_ABSOLUTE: + default: + worker->per_job_type[job_id].custom_value = value; + break; + } +} + +// statistics interface + +void workers_foreach(const char *name, void (*callback)( + void *data + , pid_t pid + , const char *thread_tag + , size_t max_job_id + , size_t utilization_usec + , size_t duration_usec + , size_t jobs_started, size_t is_running + , STRING **job_types_names + , STRING **job_types_units + , WORKER_METRIC_TYPE *job_metric_types + , size_t *job_types_jobs_started + , usec_t *job_types_busy_time + , NETDATA_DOUBLE *job_custom_values + ) + , void *data) { + if(!workers_globals.enabled) + return; + + spinlock_lock(&workers_globals.spinlock); + usec_t busy_time, delta; + size_t i, jobs_started, jobs_running; + + size_t workname_size = strlen(name) + 1; + struct workers_workname *workname; + Pvoid_t *PValue = JudyHSGet(workers_globals.worknames_JudyHS, (void *)name, workname_size); + if(PValue) { + workname = *PValue; + spinlock_lock(&workname->spinlock); + } + else + workname = NULL; + + spinlock_unlock(&workers_globals.spinlock); + + if(!workname) + return; + + struct worker *p; + DOUBLE_LINKED_LIST_FOREACH_FORWARD(workname->base, p, prev, next) { + usec_t now = worker_now_monotonic_usec(); + + // find per job type statistics + STRING *per_job_type_name[WORKER_UTILIZATION_MAX_JOB_TYPES]; + STRING *per_job_type_units[WORKER_UTILIZATION_MAX_JOB_TYPES]; + WORKER_METRIC_TYPE per_job_metric_type[WORKER_UTILIZATION_MAX_JOB_TYPES]; + size_t per_job_type_jobs_started[WORKER_UTILIZATION_MAX_JOB_TYPES]; + usec_t per_job_type_busy_time[WORKER_UTILIZATION_MAX_JOB_TYPES]; + NETDATA_DOUBLE per_job_custom_values[WORKER_UTILIZATION_MAX_JOB_TYPES]; + + size_t max_job_id = p->worker_max_job_id; + for(i = 0; i <= max_job_id ;i++) { + per_job_type_name[i] = p->per_job_type[i].name; + per_job_type_units[i] = 
p->per_job_type[i].units; + per_job_metric_type[i] = p->per_job_type[i].type; + + switch(p->per_job_type[i].type) { + default: + case WORKER_METRIC_EMPTY: { + per_job_type_jobs_started[i] = 0; + per_job_type_busy_time[i] = 0; + per_job_custom_values[i] = NAN; + break; + } + + case WORKER_METRIC_IDLE_BUSY: { + size_t tmp_jobs_started = p->per_job_type[i].worker_jobs_started; + per_job_type_jobs_started[i] = tmp_jobs_started - p->per_job_type[i].statistics_last_jobs_started; + p->per_job_type[i].statistics_last_jobs_started = tmp_jobs_started; + + usec_t tmp_busy_time = p->per_job_type[i].worker_busy_time; + per_job_type_busy_time[i] = tmp_busy_time - p->per_job_type[i].statistics_last_busy_time; + p->per_job_type[i].statistics_last_busy_time = tmp_busy_time; + + per_job_custom_values[i] = NAN; + break; + } + + case WORKER_METRIC_ABSOLUTE: { + per_job_type_jobs_started[i] = 0; + per_job_type_busy_time[i] = 0; + + per_job_custom_values[i] = p->per_job_type[i].custom_value; + break; + } + + case WORKER_METRIC_INCREMENTAL_TOTAL: + case WORKER_METRIC_INCREMENT: { + per_job_type_jobs_started[i] = 0; + per_job_type_busy_time[i] = 0; + + NETDATA_DOUBLE tmp_custom_value = p->per_job_type[i].custom_value; + per_job_custom_values[i] = tmp_custom_value - p->per_job_type[i].statistics_last_custom_value; + p->per_job_type[i].statistics_last_custom_value = tmp_custom_value; + + break; + } + } + } + + // get a copy of the worker variables + size_t worker_job_id = p->job_id; + usec_t worker_busy_time = p->busy_time; + size_t worker_jobs_started = p->jobs_started; + char worker_last_action = p->last_action; + usec_t worker_last_action_timestamp = p->last_action_timestamp; + + delta = now - p->statistics_last_checkpoint; + p->statistics_last_checkpoint = now; + + // this is the only variable both the worker thread and the statistics thread are writing + // we set this only when the worker is busy, so that the worker will not + // accumulate all the busy time, but only the time after the point we collected statistics + if(worker_last_action == WORKER_BUSY && p->last_action_timestamp == worker_last_action_timestamp && p->last_action == WORKER_BUSY) + p->last_action_timestamp = now; + + // calculate delta busy time + busy_time = worker_busy_time - p->statistics_last_busy_time; + p->statistics_last_busy_time = worker_busy_time; + + // calculate delta jobs done + jobs_started = worker_jobs_started - p->statistics_last_jobs_started; + p->statistics_last_jobs_started = worker_jobs_started; + + jobs_running = 0; + if(worker_last_action == WORKER_BUSY) { + // the worker is still busy with something + // let's add that busy time to the reported one + usec_t dt = now - worker_last_action_timestamp; + busy_time += dt; + per_job_type_busy_time[worker_job_id] += dt; + jobs_running = 1; + } + + callback(data + , p->pid + , p->tag + , max_job_id + , busy_time + , delta + , jobs_started + , jobs_running + , per_job_type_name + , per_job_type_units + , per_job_metric_type + , per_job_type_jobs_started + , per_job_type_busy_time + , per_job_custom_values + ); + } + + spinlock_unlock(&workname->spinlock); +} diff --git a/src/libnetdata/worker_utilization/worker_utilization.h b/src/libnetdata/worker_utilization/worker_utilization.h new file mode 100644 index 00000000..e2f46c5a --- /dev/null +++ b/src/libnetdata/worker_utilization/worker_utilization.h @@ -0,0 +1,49 @@ +#ifndef WORKER_UTILIZATION_H +#define WORKER_UTILIZATION_H 1 + +#include "../libnetdata.h" + +// workers interfaces + +#define WORKER_UTILIZATION_MAX_JOB_TYPES 50 + 
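+// How the statistics thread interprets each metric type
+// (see worker_set_metric() and workers_foreach() in worker_utilization.c):
+//
+//   WORKER_METRIC_IDLE_BUSY         - jobs started and busy time, collected
+//                                     automatically by worker_is_busy() /
+//                                     worker_is_idle()
+//   WORKER_METRIC_ABSOLUTE          - the last value set is reported as-is
+//   WORKER_METRIC_INCREMENT         - worker_set_metric() accumulates values;
+//                                     the delta per iteration is reported
+//   WORKER_METRIC_INCREMENTAL_TOTAL - the worker keeps a running total;
+//                                     the delta per iteration is reported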
+typedef enum __attribute__((packed)) { + WORKER_METRIC_EMPTY = 0, + WORKER_METRIC_IDLE_BUSY = 1, + WORKER_METRIC_ABSOLUTE = 2, + WORKER_METRIC_INCREMENT = 3, + WORKER_METRIC_INCREMENTAL_TOTAL = 4, +} WORKER_METRIC_TYPE; + +void workers_utilization_enable(void); +size_t workers_allocated_memory(void); +void worker_register(const char *name); +void worker_register_job_name(size_t job_id, const char *name); +void worker_register_job_custom_metric(size_t job_id, const char *name, const char *units, WORKER_METRIC_TYPE type); +void worker_unregister(void); + +void worker_is_idle(void); +void worker_is_busy(size_t job_id); +void worker_set_metric(size_t job_id, NETDATA_DOUBLE value); + +// statistics interface + +void workers_foreach(const char *name, void (*callback)( + void *data + , pid_t pid + , const char *thread_tag + , size_t max_job_id + , size_t utilization_usec + , size_t duration_usec + , size_t jobs_started + , size_t is_running + , STRING **job_types_names + , STRING **job_types_units + , WORKER_METRIC_TYPE *job_metric_types + , size_t *job_types_jobs_started + , usec_t *job_types_busy_time + , NETDATA_DOUBLE *job_custom_values + ) + , void *data); + +#endif // WORKER_UTILIZATION_H diff --git a/src/libnetdata/xxhash.h b/src/libnetdata/xxhash.h new file mode 100644 index 00000000..5e2c0ed2 --- /dev/null +++ b/src/libnetdata/xxhash.h @@ -0,0 +1,6773 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Header File + * Copyright (C) 2012-2023 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + +/*! + * @mainpage xxHash + * + * xxHash is an extremely fast non-cryptographic hash algorithm, working at RAM speed + * limits. + * + * It is proposed in four flavors, in three families: + * 1. @ref XXH32_family + * - Classic 32-bit hash function. Simple, compact, and runs on almost all + * 32-bit and 64-bit systems. + * 2. @ref XXH64_family + * - Classic 64-bit adaptation of XXH32. Just as simple, and runs well on most + * 64-bit systems (but _not_ 32-bit systems). + * 3. 
@ref XXH3_family + * - Modern 64-bit and 128-bit hash function family which features improved + * strength and performance across the board, especially on smaller data. + * It benefits greatly from SIMD and 64-bit without requiring it. + * + * Benchmarks + * --- + * The reference system uses an Intel i7-9700K CPU, and runs Ubuntu x64 20.04. + * The open source benchmark program is compiled with clang v10.0 using -O3 flag. + * + * | Hash Name | ISA ext | Width | Large Data Speed | Small Data Velocity | + * | -------------------- | ------- | ----: | ---------------: | ------------------: | + * | XXH3_64bits() | @b AVX2 | 64 | 59.4 GB/s | 133.1 | + * | MeowHash | AES-NI | 128 | 58.2 GB/s | 52.5 | + * | XXH3_128bits() | @b AVX2 | 128 | 57.9 GB/s | 118.1 | + * | CLHash | PCLMUL | 64 | 37.1 GB/s | 58.1 | + * | XXH3_64bits() | @b SSE2 | 64 | 31.5 GB/s | 133.1 | + * | XXH3_128bits() | @b SSE2 | 128 | 29.6 GB/s | 118.1 | + * | RAM sequential read | | N/A | 28.0 GB/s | N/A | + * | ahash | AES-NI | 64 | 22.5 GB/s | 107.2 | + * | City64 | | 64 | 22.0 GB/s | 76.6 | + * | T1ha2 | | 64 | 22.0 GB/s | 99.0 | + * | City128 | | 128 | 21.7 GB/s | 57.7 | + * | FarmHash | AES-NI | 64 | 21.3 GB/s | 71.9 | + * | XXH64() | | 64 | 19.4 GB/s | 71.0 | + * | SpookyHash | | 64 | 19.3 GB/s | 53.2 | + * | Mum | | 64 | 18.0 GB/s | 67.0 | + * | CRC32C | SSE4.2 | 32 | 13.0 GB/s | 57.9 | + * | XXH32() | | 32 | 9.7 GB/s | 71.9 | + * | City32 | | 32 | 9.1 GB/s | 66.0 | + * | Blake3* | @b AVX2 | 256 | 4.4 GB/s | 8.1 | + * | Murmur3 | | 32 | 3.9 GB/s | 56.1 | + * | SipHash* | | 64 | 3.0 GB/s | 43.2 | + * | Blake3* | @b SSE2 | 256 | 2.4 GB/s | 8.1 | + * | HighwayHash | | 64 | 1.4 GB/s | 6.0 | + * | FNV64 | | 64 | 1.2 GB/s | 62.7 | + * | Blake2* | | 256 | 1.1 GB/s | 5.1 | + * | SHA1* | | 160 | 0.8 GB/s | 5.6 | + * | MD5* | | 128 | 0.6 GB/s | 7.8 | + * @note + * - Hashes which require a specific ISA extension are noted. SSE2 is also noted, + * even though it is mandatory on x64. + * - Hashes with an asterisk are cryptographic. Note that MD5 is non-cryptographic + * by modern standards. + * - Small data velocity is a rough average of algorithm's efficiency for small + * data. For more accurate information, see the wiki. + * - More benchmarks and strength tests are found on the wiki: + * https://github.com/Cyan4973/xxHash/wiki + * + * Usage + * ------ + * All xxHash variants use a similar API. Changing the algorithm is a trivial + * substitution. + * + * @pre + * For functions which take an input and length parameter, the following + * requirements are assumed: + * - The range from [`input`, `input + length`) is valid, readable memory. + * - The only exception is if the `length` is `0`, `input` may be `NULL`. + * - For C++, the objects must have the *TriviallyCopyable* property, as the + * functions access bytes directly as if it was an array of `unsigned char`. + * + * @anchor single_shot_example + * **Single Shot** + * + * These functions are stateless functions which hash a contiguous block of memory, + * immediately returning the result. They are the easiest and usually the fastest + * option. + * + * XXH32(), XXH64(), XXH3_64bits(), XXH3_128bits() + * + * @code{.c} + * #include <string.h> + * #include "xxhash.h" + * + * // Example for a function which hashes a null terminated string with XXH32(). + * XXH32_hash_t hash_string(const char* string, XXH32_hash_t seed) + * { + * // NULL pointers are only valid if the length is zero + * size_t length = (string == NULL) ? 
0 : strlen(string); + * return XXH32(string, length, seed); + * } + * @endcode + * + * @anchor streaming_example + * **Streaming** + * + * These groups of functions allow incremental hashing of unknown size, even + * more than what would fit in a size_t. + * + * XXH32_reset(), XXH64_reset(), XXH3_64bits_reset(), XXH3_128bits_reset() + * + * @code{.c} + * #include <stdio.h> + * #include <assert.h> + * #include "xxhash.h" + * // Example for a function which hashes a FILE incrementally with XXH3_64bits(). + * XXH64_hash_t hashFile(FILE* f) + * { + * // Allocate a state struct. Do not just use malloc() or new. + * XXH3_state_t* state = XXH3_createState(); + * assert(state != NULL && "Out of memory!"); + * // Reset the state to start a new hashing session. + * XXH3_64bits_reset(state); + * char buffer[4096]; + * size_t count; + * // Read the file in chunks + * while ((count = fread(buffer, 1, sizeof(buffer), f)) != 0) { + * // Run update() as many times as necessary to process the data + * XXH3_64bits_update(state, buffer, count); + * } + * // Retrieve the finalized hash. This will not change the state. + * XXH64_hash_t result = XXH3_64bits_digest(state); + * // Free the state. Do not use free(). + * XXH3_freeState(state); + * return result; + * } + * @endcode + * + * @file xxhash.h + * xxHash prototypes and implementation + */ + +#if defined (__cplusplus) +extern "C" { +#endif + +/* **************************** + * INLINE mode + ******************************/ +/*! + * @defgroup public Public API + * Contains details on the public xxHash functions. + * @{ + */ +#ifdef XXH_DOXYGEN +/*! + * @brief Gives access to internal state declaration, required for static allocation. + * + * Incompatible with dynamic linking, due to risks of ABI changes. + * + * Usage: + * @code{.c} + * #define XXH_STATIC_LINKING_ONLY + * #include "xxhash.h" + * @endcode + */ +# define XXH_STATIC_LINKING_ONLY +/* Do not undef XXH_STATIC_LINKING_ONLY for Doxygen */ + +/*! + * @brief Gives access to internal definitions. + * + * Usage: + * @code{.c} + * #define XXH_STATIC_LINKING_ONLY + * #define XXH_IMPLEMENTATION + * #include "xxhash.h" + * @endcode + */ +# define XXH_IMPLEMENTATION +/* Do not undef XXH_IMPLEMENTATION for Doxygen */ + +/*! + * @brief Exposes the implementation and marks all functions as `inline`. + * + * Use these build macros to inline xxhash into the target unit. + * Inlining improves performance on small inputs, especially when the length is + * expressed as a compile-time constant: + * + * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html + * + * It also keeps xxHash symbols private to the unit, so they are not exported. + * + * Usage: + * @code{.c} + * #define XXH_INLINE_ALL + * #include "xxhash.h" + * @endcode + * Do not compile and link xxhash.o as a separate object, as it is not useful. + */ +# define XXH_INLINE_ALL +# undef XXH_INLINE_ALL +/*! + * @brief Exposes the implementation without marking functions as inline. + */ +# define XXH_PRIVATE_API +# undef XXH_PRIVATE_API +/*! + * @brief Emulate a namespace by transparently prefixing all symbols. + * + * If you want to include _and expose_ xxHash functions from within your own + * library, but also want to avoid symbol collisions with other libraries which + * may also include xxHash, you can use @ref XXH_NAMESPACE to automatically prefix + * any public symbol from xxhash library with the value of @ref XXH_NAMESPACE + * (therefore, avoid empty or numeric values). 
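+ *
+ * For instance (`MYLIB_` below is an illustrative prefix, not something
+ * xxHash defines):
+ * @code{.c}
+ * #define XXH_NAMESPACE MYLIB_
+ * #include "xxhash.h"
+ * // public symbols are now exported as MYLIB_XXH32(), MYLIB_XXH64(), ...
+ * @endcode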
+ * + * Note that no change is required within the calling program as long as it + * includes `xxhash.h`: Regular symbol names will be automatically translated + * by this header. + */ +# define XXH_NAMESPACE /* YOUR NAME HERE */ +# undef XXH_NAMESPACE +#endif + +#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \ + && !defined(XXH_INLINE_ALL_31684351384) + /* this section should be traversed only once */ +# define XXH_INLINE_ALL_31684351384 + /* give access to the advanced API, required to compile implementations */ +# undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */ +# define XXH_STATIC_LINKING_ONLY + /* make all functions private */ +# undef XXH_PUBLIC_API +# if defined(__GNUC__) +# define XXH_PUBLIC_API static __inline __attribute__((unused)) +# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define XXH_PUBLIC_API static inline +# elif defined(_MSC_VER) +# define XXH_PUBLIC_API static __inline +# else + /* note: this version may generate warnings for unused static functions */ +# define XXH_PUBLIC_API static +# endif + + /* + * This part deals with the special case where a unit wants to inline xxHash, + * but "xxhash.h" has previously been included without XXH_INLINE_ALL, + * such as part of some previously included *.h header file. + * Without further action, the new include would just be ignored, + * and functions would effectively _not_ be inlined (silent failure). + * The following macros solve this situation by prefixing all inlined names, + * avoiding naming collision with previous inclusions. + */ + /* Before that, we unconditionally #undef all symbols, + * in case they were already defined with XXH_NAMESPACE. + * They will then be redefined for XXH_INLINE_ALL + */ +# undef XXH_versionNumber + /* XXH32 */ +# undef XXH32 +# undef XXH32_createState +# undef XXH32_freeState +# undef XXH32_reset +# undef XXH32_update +# undef XXH32_digest +# undef XXH32_copyState +# undef XXH32_canonicalFromHash +# undef XXH32_hashFromCanonical + /* XXH64 */ +# undef XXH64 +# undef XXH64_createState +# undef XXH64_freeState +# undef XXH64_reset +# undef XXH64_update +# undef XXH64_digest +# undef XXH64_copyState +# undef XXH64_canonicalFromHash +# undef XXH64_hashFromCanonical + /* XXH3_64bits */ +# undef XXH3_64bits +# undef XXH3_64bits_withSecret +# undef XXH3_64bits_withSeed +# undef XXH3_64bits_withSecretandSeed +# undef XXH3_createState +# undef XXH3_freeState +# undef XXH3_copyState +# undef XXH3_64bits_reset +# undef XXH3_64bits_reset_withSeed +# undef XXH3_64bits_reset_withSecret +# undef XXH3_64bits_update +# undef XXH3_64bits_digest +# undef XXH3_generateSecret + /* XXH3_128bits */ +# undef XXH128 +# undef XXH3_128bits +# undef XXH3_128bits_withSeed +# undef XXH3_128bits_withSecret +# undef XXH3_128bits_reset +# undef XXH3_128bits_reset_withSeed +# undef XXH3_128bits_reset_withSecret +# undef XXH3_128bits_reset_withSecretandSeed +# undef XXH3_128bits_update +# undef XXH3_128bits_digest +# undef XXH128_isEqual +# undef XXH128_cmp +# undef XXH128_canonicalFromHash +# undef XXH128_hashFromCanonical + /* Finally, free the namespace itself */ +# undef XXH_NAMESPACE + + /* employ the namespace for XXH_INLINE_ALL */ +# define XXH_NAMESPACE XXH_INLINE_ + /* + * Some identifiers (enums, type names) are not symbols, + * but they must nonetheless be renamed to avoid redeclaration. + * Alternative solution: do not redeclare them. + * However, this requires some #ifdefs, and has a more dispersed impact. 
+ * Meanwhile, renaming can be achieved in a single place. + */ +# define XXH_IPREF(Id) XXH_NAMESPACE ## Id +# define XXH_OK XXH_IPREF(XXH_OK) +# define XXH_ERROR XXH_IPREF(XXH_ERROR) +# define XXH_errorcode XXH_IPREF(XXH_errorcode) +# define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t) +# define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t) +# define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t) +# define XXH32_state_s XXH_IPREF(XXH32_state_s) +# define XXH32_state_t XXH_IPREF(XXH32_state_t) +# define XXH64_state_s XXH_IPREF(XXH64_state_s) +# define XXH64_state_t XXH_IPREF(XXH64_state_t) +# define XXH3_state_s XXH_IPREF(XXH3_state_s) +# define XXH3_state_t XXH_IPREF(XXH3_state_t) +# define XXH128_hash_t XXH_IPREF(XXH128_hash_t) + /* Ensure the header is parsed again, even if it was previously included */ +# undef XXHASH_H_5627135585666179 +# undef XXHASH_H_STATIC_13879238742 +#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */ + +/* **************************************************************** + * Stable API + *****************************************************************/ +#ifndef XXHASH_H_5627135585666179 +#define XXHASH_H_5627135585666179 1 + +/*! @brief Marks a global symbol. */ +#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) +# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) +# ifdef XXH_EXPORT +# define XXH_PUBLIC_API __declspec(dllexport) +# elif XXH_IMPORT +# define XXH_PUBLIC_API __declspec(dllimport) +# endif +# else +# define XXH_PUBLIC_API /* do nothing */ +# endif +#endif + +#ifdef XXH_NAMESPACE +# define XXH_CAT(A,B) A##B +# define XXH_NAME2(A,B) XXH_CAT(A,B) +# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber) +/* XXH32 */ +# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32) +# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState) +# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState) +# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset) +# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update) +# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest) +# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState) +# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash) +# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical) +/* XXH64 */ +# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64) +# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState) +# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState) +# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset) +# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update) +# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest) +# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState) +# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash) +# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical) +/* XXH3_64bits */ +# define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits) +# define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret) +# define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed) +# define XXH3_64bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecretandSeed) +# define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState) +# define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState) +# define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState) +# define XXH3_64bits_reset 
XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset) +# define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed) +# define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret) +# define XXH3_64bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecretandSeed) +# define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update) +# define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest) +# define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret) +# define XXH3_generateSecret_fromSeed XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret_fromSeed) +/* XXH3_128bits */ +# define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128) +# define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits) +# define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed) +# define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret) +# define XXH3_128bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecretandSeed) +# define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset) +# define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed) +# define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret) +# define XXH3_128bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecretandSeed) +# define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update) +# define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest) +# define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual) +# define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp) +# define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash) +# define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical) +#endif + + +/* ************************************* +* Compiler specifics +***************************************/ + +/* specific declaration modes for Windows */ +#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) +# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) +# ifdef XXH_EXPORT +# define XXH_PUBLIC_API __declspec(dllexport) +# elif XXH_IMPORT +# define XXH_PUBLIC_API __declspec(dllimport) +# endif +# else +# define XXH_PUBLIC_API /* do nothing */ +# endif +#endif + +#if defined (__GNUC__) +# define XXH_CONSTF __attribute__((const)) +# define XXH_PUREF __attribute__((pure)) +# define XXH_MALLOCF __attribute__((malloc)) +#else +# define XXH_CONSTF /* disable */ +# define XXH_PUREF +# define XXH_MALLOCF +#endif + +/* ************************************* +* Version +***************************************/ +#define XXH_VERSION_MAJOR 0 +#define XXH_VERSION_MINOR 8 +#define XXH_VERSION_RELEASE 2 +/*! @brief Version number, encoded as two digits each */ +#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE) + +/*! + * @brief Obtains the xxHash version. + * + * This is mostly useful when xxHash is compiled as a shared library, + * since the returned value comes from the library, as opposed to header file. + * + * @return @ref XXH_VERSION_NUMBER of the invoked library. + */ +XXH_PUBLIC_API XXH_CONSTF unsigned XXH_versionNumber (void); + + +/* **************************** +* Common basic types +******************************/ +#include <stddef.h> /* size_t */ +/*! + * @brief Exit code for the streaming API. 
+ */ +typedef enum { + XXH_OK = 0, /*!< OK */ + XXH_ERROR /*!< Error */ +} XXH_errorcode; + + +/*-********************************************************************** +* 32-bit hash +************************************************************************/ +#if defined(XXH_DOXYGEN) /* Don't show <stdint.h> include */ +/*! + * @brief An unsigned 32-bit integer. + * + * Not necessarily defined to `uint32_t` but functionally equivalent. + */ +typedef uint32_t XXH32_hash_t; + +#elif !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include <stdint.h> + typedef uint32_t XXH32_hash_t; + +#else +# include <limits.h> +# if UINT_MAX == 0xFFFFFFFFUL + typedef unsigned int XXH32_hash_t; +# elif ULONG_MAX == 0xFFFFFFFFUL + typedef unsigned long XXH32_hash_t; +# else +# error "unsupported platform: need a 32-bit type" +# endif +#endif + +/*! + * @} + * + * @defgroup XXH32_family XXH32 family + * @ingroup public + * Contains functions used in the classic 32-bit xxHash algorithm. + * + * @note + * XXH32 is useful for older platforms, with no or poor 64-bit performance. + * Note that the @ref XXH3_family provides competitive speed for both 32-bit + * and 64-bit systems, and offers true 64/128 bit hash results. + * + * @see @ref XXH64_family, @ref XXH3_family : Other xxHash families + * @see @ref XXH32_impl for implementation details + * @{ + */ + +/*! + * @brief Calculates the 32-bit hash of @p input using xxHash32. + * + * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s + * + * See @ref single_shot_example "Single Shot Example" for an example. + * + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * @param seed The 32-bit seed to alter the hash's output predictably. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 32-bit hash value. + * + * @see + * XXH64(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128(): + * Direct equivalents for the other variants of xxHash. + * @see + * XXH32_createState(), XXH32_update(), XXH32_digest(): Streaming version. + */ +XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed); + +#ifndef XXH_NO_STREAM +/*! + * Streaming functions generate the xxHash value from an incremental input. + * This method is slower than single-call functions, due to state management. + * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized. + * + * An XXH state must first be allocated using `XXH*_createState()`. + * + * Start a new hash by initializing the state with a seed using `XXH*_reset()`. + * + * Then, feed the hash state by calling `XXH*_update()` as many times as necessary. + * + * The function returns an error code, with 0 meaning OK, and any other value + * meaning there is an error. + * + * Finally, a hash value can be produced anytime, by using `XXH*_digest()`. + * This function returns the nn-bits hash as an int or long long. + * + * It's still possible to continue inserting input into the hash state after a + * digest, and generate new hash values later on by invoking `XXH*_digest()`. + * + * When done, release the state using `XXH*_freeState()`. 
+ * + * @see streaming_example at the top of @ref xxhash.h for an example. + */ + +/*! + * @typedef struct XXH32_state_s XXH32_state_t + * @brief The opaque state struct for the XXH32 streaming API. + * + * @see XXH32_state_s for details. + */ +typedef struct XXH32_state_s XXH32_state_t; + +/*! + * @brief Allocates an @ref XXH32_state_t. + * + * Must be freed with XXH32_freeState(). + * @return An allocated XXH32_state_t on success, `NULL` on failure. + */ +XXH_PUBLIC_API XXH_MALLOCF XXH32_state_t* XXH32_createState(void); +/*! + * @brief Frees an @ref XXH32_state_t. + * + * Must be allocated with XXH32_createState(). + * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState(). + * @return XXH_OK. + */ +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr); +/*! + * @brief Copies one @ref XXH32_state_t to another. + * + * @param dst_state The state to copy to. + * @param src_state The state to copy from. + * @pre + * @p dst_state and @p src_state must not be `NULL` and must not overlap. + */ +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state); + +/*! + * @brief Resets an @ref XXH32_state_t to begin a new hash. + * + * This function resets and seeds a state. Call it before @ref XXH32_update(). + * + * @param statePtr The state struct to reset. + * @param seed The 32-bit seed to alter the hash result predictably. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + */ +XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t seed); + +/*! + * @brief Consumes a block of @p input to an @ref XXH32_state_t. + * + * Call this to incrementally consume blocks of data. + * + * @param statePtr The state struct to update. + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + */ +XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length); + +/*! + * @brief Returns the calculated hash value from an @ref XXH32_state_t. + * + * @note + * Calling XXH32_digest() will not affect @p statePtr, so you can update, + * digest, and update again. + * + * @param statePtr The state struct to calculate the hash from. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return The calculated xxHash32 value from that state. + */ +XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ + +/******* Canonical representation *******/ + +/* + * The default return values from XXH functions are unsigned 32 and 64 bit + * integers. + * This the simplest and fastest format for further post-processing. + * + * However, this leaves open the question of what is the order on the byte level, + * since little and big endian conventions will store the same number differently. + * + * The canonical representation settles this issue by mandating big-endian + * convention, the same convention as human-readable numbers (large digits first). 
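+ *
+ * For instance (a small illustrative sketch):
+ *
+ *     XXH32_canonical_t c;
+ *     XXH32_canonicalFromHash(&c, XXH32("abc", 3, 0));
+ *     // c.digest[0] now holds the most significant byte on every platform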
+ * + * When writing hash values to storage, sending them over a network, or printing + * them, it's highly recommended to use the canonical representation to ensure + * portability across a wider range of systems, present and future. + * + * The following functions allow transformation of hash values to and from + * canonical format. + */ + +/*! + * @brief Canonical (big endian) representation of @ref XXH32_hash_t. + */ +typedef struct { + unsigned char digest[4]; /*!< Hash bytes, big endian */ +} XXH32_canonical_t; + +/*! + * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t. + * + * @param dst The @ref XXH32_canonical_t pointer to be stored to. + * @param hash The @ref XXH32_hash_t to be converted. + * + * @pre + * @p dst must not be `NULL`. + */ +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash); + +/*! + * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t. + * + * @param src The @ref XXH32_canonical_t to convert. + * + * @pre + * @p src must not be `NULL`. + * + * @return The converted hash. + */ +XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src); + + +/*! @cond Doxygen ignores this part */ +#ifdef __has_attribute +# define XXH_HAS_ATTRIBUTE(x) __has_attribute(x) +#else +# define XXH_HAS_ATTRIBUTE(x) 0 +#endif +/*! @endcond */ + +/*! @cond Doxygen ignores this part */ +/* + * C23 __STDC_VERSION__ number hasn't been specified yet. For now + * leave as `201711L` (C17 + 1). + * TODO: Update to correct value when its been specified. + */ +#define XXH_C23_VN 201711L +/*! @endcond */ + +/*! @cond Doxygen ignores this part */ +/* C-language Attributes are added in C23. */ +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN) && defined(__has_c_attribute) +# define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x) +#else +# define XXH_HAS_C_ATTRIBUTE(x) 0 +#endif +/*! @endcond */ + +/*! @cond Doxygen ignores this part */ +#if defined(__cplusplus) && defined(__has_cpp_attribute) +# define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x) +#else +# define XXH_HAS_CPP_ATTRIBUTE(x) 0 +#endif +/*! @endcond */ + +/*! @cond Doxygen ignores this part */ +/* + * Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute + * introduced in CPP17 and C23. + * CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough + * C23 : https://en.cppreference.com/w/c/language/attributes/fallthrough + */ +#if XXH_HAS_C_ATTRIBUTE(fallthrough) || XXH_HAS_CPP_ATTRIBUTE(fallthrough) +# define XXH_FALLTHROUGH [[fallthrough]] +#elif XXH_HAS_ATTRIBUTE(__fallthrough__) +# define XXH_FALLTHROUGH __attribute__ ((__fallthrough__)) +#else +# define XXH_FALLTHROUGH /* fallthrough */ +#endif +/*! @endcond */ + +/*! @cond Doxygen ignores this part */ +/* + * Define XXH_NOESCAPE for annotated pointers in public API. + * https://clang.llvm.org/docs/AttributeReference.html#noescape + * As of writing this, only supported by clang. + */ +#if XXH_HAS_ATTRIBUTE(noescape) +# define XXH_NOESCAPE __attribute__((noescape)) +#else +# define XXH_NOESCAPE +#endif +/*! @endcond */ + + +/*! + * @} + * @ingroup public + * @{ + */ + +#ifndef XXH_NO_LONG_LONG +/*-********************************************************************** +* 64-bit hash +************************************************************************/ +#if defined(XXH_DOXYGEN) /* don't include <stdint.h> */ +/*! + * @brief An unsigned 64-bit integer. 
+ * + * Not necessarily defined to `uint64_t` but functionally equivalent. + */ +typedef uint64_t XXH64_hash_t; +#elif !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include <stdint.h> + typedef uint64_t XXH64_hash_t; +#else +# include <limits.h> +# if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL + /* LP64 ABI says uint64_t is unsigned long */ + typedef unsigned long XXH64_hash_t; +# else + /* the following type must have a width of 64-bit */ + typedef unsigned long long XXH64_hash_t; +# endif +#endif + +/*! + * @} + * + * @defgroup XXH64_family XXH64 family + * @ingroup public + * @{ + * Contains functions used in the classic 64-bit xxHash algorithm. + * + * @note + * XXH3 provides competitive speed for both 32-bit and 64-bit systems, + * and offers true 64/128 bit hash results. + * It provides better speed for systems with vector processing capabilities. + */ + +/*! + * @brief Calculates the 64-bit hash of @p input using xxHash64. + * + * This function usually runs faster on 64-bit systems, but slower on 32-bit + * systems (see benchmark). + * + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * @param seed The 64-bit seed to alter the hash's output predictably. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 64-bit hash. + * + * @see + * XXH32(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128(): + * Direct equivalents for the other variants of xxHash. + * @see + * XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed); + +/******* Streaming *******/ +#ifndef XXH_NO_STREAM +/*! + * @brief The opaque state struct for the XXH64 streaming API. + * + * @see XXH64_state_s for details. + */ +typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ + +/*! + * @brief Allocates an @ref XXH64_state_t. + * + * Must be freed with XXH64_freeState(). + * @return An allocated XXH64_state_t on success, `NULL` on failure. + */ +XXH_PUBLIC_API XXH_MALLOCF XXH64_state_t* XXH64_createState(void); + +/*! + * @brief Frees an @ref XXH64_state_t. + * + * Must be allocated with XXH64_createState(). + * @param statePtr A pointer to an @ref XXH64_state_t allocated with @ref XXH64_createState(). + * @return XXH_OK. + */ +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); + +/*! + * @brief Copies one @ref XXH64_state_t to another. + * + * @param dst_state The state to copy to. + * @param src_state The state to copy from. + * @pre + * @p dst_state and @p src_state must not be `NULL` and must not overlap. + */ +XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dst_state, const XXH64_state_t* src_state); + +/*! + * @brief Resets an @ref XXH64_state_t to begin a new hash. + * + * This function resets and seeds a state. Call it before @ref XXH64_update(). + * + * @param statePtr The state struct to reset. + * @param seed The 64-bit seed to alter the hash result predictably. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. 
+ */ +XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed); + +/*! + * @brief Consumes a block of @p input to an @ref XXH64_state_t. + * + * Call this to incrementally consume blocks of data. + * + * @param statePtr The state struct to update. + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + */ +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH_NOESCAPE XXH64_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); + +/*! + * @brief Returns the calculated hash value from an @ref XXH64_state_t. + * + * @note + * Calling XXH64_digest() will not affect @p statePtr, so you can update, + * digest, and update again. + * + * @param statePtr The state struct to calculate the hash from. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return The calculated xxHash64 value from that state. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (XXH_NOESCAPE const XXH64_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ +/******* Canonical representation *******/ + +/*! + * @brief Canonical (big endian) representation of @ref XXH64_hash_t. + */ +typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t; + +/*! + * @brief Converts an @ref XXH64_hash_t to a big endian @ref XXH64_canonical_t. + * + * @param dst The @ref XXH64_canonical_t pointer to be stored to. + * @param hash The @ref XXH64_hash_t to be converted. + * + * @pre + * @p dst must not be `NULL`. + */ +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash); + +/*! + * @brief Converts an @ref XXH64_canonical_t to a native @ref XXH64_hash_t. + * + * @param src The @ref XXH64_canonical_t to convert. + * + * @pre + * @p src must not be `NULL`. + * + * @return The converted hash. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src); + +#ifndef XXH_NO_XXH3 + +/*! + * @} + * ************************************************************************ + * @defgroup XXH3_family XXH3 family + * @ingroup public + * @{ + * + * XXH3 is a more recent hash algorithm featuring: + * - Improved speed for both small and large inputs + * - True 64-bit and 128-bit outputs + * - SIMD acceleration + * - Improved 32-bit viability + * + * Speed analysis methodology is explained here: + * + * https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html + * + * Compared to XXH64, expect XXH3 to run approximately + * ~2x faster on large inputs and >3x faster on small ones, + * exact differences vary depending on platform. + * + * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic, + * but does not require it. + * Most 32-bit and 64-bit targets that can run XXH32 smoothly can run XXH3 + * at competitive speeds, even without vector support. Further details are + * explained in the implementation. 
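+ *
+ * One-shot usage, for illustration:
+ * @code{.c}
+ *     const char msg[] = "example";
+ *     XXH64_hash_t  h64  = XXH3_64bits(msg, sizeof(msg) - 1);
+ *     XXH128_hash_t h128 = XXH3_128bits(msg, sizeof(msg) - 1);
+ * @endcode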
+ * + * XXH3 has a fast scalar implementation, but it also includes accelerated SIMD + * implementations for many common platforms: + * - AVX512 + * - AVX2 + * - SSE2 + * - ARM NEON + * - WebAssembly SIMD128 + * - POWER8 VSX + * - s390x ZVector + * This can be controlled via the @ref XXH_VECTOR macro, but it automatically + * selects the best version according to predefined macros. For the x86 family, an + * automatic runtime dispatcher is included separately in @ref xxh_x86dispatch.c. + * + * XXH3 implementation is portable: + * it has a generic C90 formulation that can be compiled on any platform, + * all implementations generate exactly the same hash value on all platforms. + * Starting from v0.8.0, it's also labelled "stable", meaning that + * any future version will also generate the same hash value. + * + * XXH3 offers 2 variants, _64bits and _128bits. + * + * When only 64 bits are needed, prefer invoking the _64bits variant, as it + * reduces the amount of mixing, resulting in faster speed on small inputs. + * It's also generally simpler to manipulate a scalar return type than a struct. + * + * The API supports one-shot hashing, streaming mode, and custom secrets. + */ +/*-********************************************************************** +* XXH3 64-bit variant +************************************************************************/ + +/*! + * @brief 64-bit unseeded variant of XXH3. + * + * This is equivalent to @ref XXH3_64bits_withSeed() with a seed of 0, however + * it may have slightly better performance due to constant propagation of the + * defaults. + * + * @see + * XXH32(), XXH64(), XXH3_128bits(): equivalent for the other xxHash algorithms + * @see + * XXH3_64bits_withSeed(), XXH3_64bits_withSecret(): other seeding variants + * @see + * XXH3_64bits_reset(), XXH3_64bits_update(), XXH3_64bits_digest(): Streaming version. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length); + +/*! + * @brief 64-bit seeded variant of XXH3 + * + * This variant generates a custom secret on the fly based on default secret + * altered using the `seed` value. + * + * While this operation is decently fast, note that it's not completely free. + * + * @note + * seed == 0 produces the same results as @ref XXH3_64bits(). + * + * @param input The data to hash + * @param length The length + * @param seed The 64-bit seed to alter the state. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed); + +/*! + * The bare minimum size for a custom secret. + * + * @see + * XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(), + * XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret(). + */ +#define XXH3_SECRET_SIZE_MIN 136 + +/*! + * @brief 64-bit variant of XXH3 with a custom "secret". + * + * It's possible to provide any blob of bytes as a "secret" to generate the hash. + * This makes it more difficult for an external actor to prepare an intentional collision. + * The main condition is that secretSize *must* be large enough (>= XXH3_SECRET_SIZE_MIN). + * However, the quality of the secret impacts the dispersion of the hash algorithm. + * Therefore, the secret _must_ look like a bunch of random bytes. + * Avoid "trivial" or structured data such as repeated sequences or a text document. + * Whenever in doubt about the "randomness" of the blob of bytes, + * consider employing "XXH3_generateSecret()" instead (see below). 
+ * It will generate a proper high entropy secret derived from the blob of bytes.
+ * Another advantage of using XXH3_generateSecret() is that
+ * it guarantees that all bits within the initial blob of bytes
+ * will impact every bit of the output.
+ * This is not necessarily the case when using the blob of bytes directly
+ * because, when hashing _small_ inputs, only a portion of the secret is employed.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize);
+
+
+/******* Streaming *******/
+#ifndef XXH_NO_STREAM
+/*
+ * Streaming requires state maintenance.
+ * This operation costs memory and CPU.
+ * As a consequence, streaming is slower than one-shot hashing.
+ * For better performance, prefer one-shot functions whenever applicable.
+ */
+
+/*!
+ * @brief The opaque state struct for the XXH3 streaming API.
+ *
+ * @see XXH3_state_s for details.
+ */
+typedef struct XXH3_state_s XXH3_state_t;
+XXH_PUBLIC_API XXH_MALLOCF XXH3_state_t* XXH3_createState(void);
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
+
+/*!
+ * @brief Copies one @ref XXH3_state_t to another.
+ *
+ * @param dst_state The state to copy to.
+ * @param src_state The state to copy from.
+ * @pre
+ * @p dst_state and @p src_state must not be `NULL` and must not overlap.
+ */
+XXH_PUBLIC_API void XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state);
+
+/*!
+ * @brief Resets an @ref XXH3_state_t to begin a new hash.
+ *
+ * This function resets `statePtr` and generates a secret with default parameters. Call it before @ref XXH3_64bits_update().
+ * Digest will be equivalent to `XXH3_64bits()`.
+ *
+ * @param statePtr The state struct to reset.
+ *
+ * @pre
+ * @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
+ *
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);
+
+/*!
+ * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash.
+ *
+ * This function resets `statePtr` and generates a secret from `seed`. Call it before @ref XXH3_64bits_update().
+ * Digest will be equivalent to `XXH3_64bits_withSeed()`.
+ *
+ * @param statePtr The state struct to reset.
+ * @param seed The 64-bit seed to alter the state.
+ *
+ * @pre
+ * @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
+ *
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);
+
+/*!
+ * XXH3_64bits_reset_withSecret():
+ * `secret` is referenced; it _must outlive_ the hash streaming session.
+ * Similar to the one-shot API, `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`,
+ * and the quality of produced hash values depends on the secret's entropy
+ * (the secret's content should look like a bunch of random bytes).
+ * When in doubt about the randomness of a candidate `secret`,
+ * consider employing `XXH3_generateSecret()` instead (see below).
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);
+
+/*!
+ * @brief Consumes a block of @p input to an @ref XXH3_state_t.
+ *
+ * Call this to incrementally consume blocks of data.
+ *
+ * @param statePtr The state struct to update.
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ * @p statePtr must not be `NULL`.
+ * @pre
+ * The memory between @p input and @p input + @p length must be valid,
+ * readable, contiguous memory. However, if @p length is `0`, @p input may be
+ * `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
+
+/*!
+ * @brief Returns the calculated XXH3 64-bit hash value from an @ref XXH3_state_t.
+ *
+ * @note
+ * Calling XXH3_64bits_digest() will not affect @p statePtr, so you can update,
+ * digest, and update again.
+ *
+ * @param statePtr The state struct to calculate the hash from.
+ *
+ * @pre
+ * @p statePtr must not be `NULL`.
+ *
+ * @return The calculated XXH3 64-bit hash value from that state.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
+#endif /* !XXH_NO_STREAM */
+
+/* note: the canonical representation of XXH3 is the same as XXH64's,
+ * since they both produce XXH64_hash_t values */
+
+
+/*-**********************************************************************
+* XXH3 128-bit variant
+************************************************************************/
+
+/*!
+ * @brief The return value from 128-bit hashes.
+ *
+ * Stored in little endian order, although the fields themselves are in native
+ * endianness.
+ */
+typedef struct {
+    XXH64_hash_t low64;   /*!< `value & 0xFFFFFFFFFFFFFFFF` */
+    XXH64_hash_t high64;  /*!< `value >> 64` */
+} XXH128_hash_t;
+
+/*!
+ * @brief Unseeded 128-bit variant of XXH3.
+ *
+ * The 128-bit variant of XXH3 has more strength, but it has a bit of overhead
+ * for shorter inputs.
+ *
+ * This is equivalent to @ref XXH3_128bits_withSeed() with a seed of 0, however
+ * it may have slightly better performance due to constant propagation of the
+ * defaults.
+ *
+ * @see
+ * XXH32(), XXH64(), XXH3_64bits(): equivalent for the other xxHash algorithms
+ * @see
+ * XXH3_128bits_withSeed(), XXH3_128bits_withSecret(): other seeding variants
+ * @see
+ * XXH3_128bits_reset(), XXH3_128bits_update(), XXH3_128bits_digest(): Streaming version.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* data, size_t len);
+/*! @brief Seeded 128-bit variant of XXH3. @see XXH3_64bits_withSeed(). */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);
+/*! @brief Custom secret 128-bit variant of XXH3. @see XXH3_64bits_withSecret(). */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize);
+
+/******* Streaming *******/
+#ifndef XXH_NO_STREAM
+/*
+ * Streaming requires state maintenance.
+ * This operation costs memory and CPU.
+ * As a consequence, streaming is slower than one-shot hashing.
+ * For better performance, prefer one-shot functions whenever applicable.
+ *
+ * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits().
+ * Use the already declared XXH3_createState() and XXH3_freeState().
+ *
+ * All reset and streaming functions have the same meaning as their 64-bit counterparts.
+ */
+
+/*!
+ * @brief Resets an @ref XXH3_state_t to begin a new hash.
+ *
+ * This function resets `statePtr` and generates a secret with default parameters. Call it before @ref XXH3_128bits_update().
+ * Digest will be equivalent to `XXH3_128bits()`.
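+ *
+ * A possible call sequence (sketch; `state` comes from XXH3_createState()
+ * and error checks are elided):
+ * @code{.c}
+ * XXH3_128bits_reset(state);
+ * XXH3_128bits_update(state, buffer, length);
+ * XXH128_hash_t hash = XXH3_128bits_digest(state);
+ * @endcode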
+ *
+ * @param statePtr The state struct to reset.
+ *
+ * @pre
+ * @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
+ *
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);
+
+/*!
+ * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash.
+ *
+ * This function resets `statePtr` and generates a secret from `seed`. Call it before @ref XXH3_128bits_update().
+ * Digest will be equivalent to `XXH3_128bits_withSeed()`.
+ *
+ * @param statePtr The state struct to reset.
+ * @param seed The 64-bit seed to alter the state.
+ *
+ * @pre
+ * @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
+ *
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);
+/*! @brief Custom secret 128-bit variant of XXH3. @see XXH3_64bits_reset_withSecret(). */
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);
+
+/*!
+ * @brief Consumes a block of @p input to an @ref XXH3_state_t.
+ *
+ * Call this to incrementally consume blocks of data.
+ *
+ * @param statePtr The state struct to update.
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ * @p statePtr must not be `NULL`.
+ * @pre
+ * The memory between @p input and @p input + @p length must be valid,
+ * readable, contiguous memory. However, if @p length is `0`, @p input may be
+ * `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
+
+/*!
+ * @brief Returns the calculated XXH3 128-bit hash value from an @ref XXH3_state_t.
+ *
+ * @note
+ * Calling XXH3_128bits_digest() will not affect @p statePtr, so you can update,
+ * digest, and update again.
+ *
+ * @param statePtr The state struct to calculate the hash from.
+ *
+ * @pre
+ * @p statePtr must not be `NULL`.
+ *
+ * @return The calculated XXH3 128-bit hash value from that state.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
+#endif /* !XXH_NO_STREAM */
+
+/* The following helper functions make it possible to compare XXH128_hash_t values.
+ * Since XXH128_hash_t is a structure, this capability is not offered by the language.
+ * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */
+
+/*!
+ * XXH128_isEqual():
+ * Return: 1 if `h1` and `h2` are equal, 0 if they are not.
+ */
+XXH_PUBLIC_API XXH_PUREF int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
+
+/*!
+ * @brief Compares two @ref XXH128_hash_t values.
+ * This comparator is compatible with stdlib's `qsort()`/`bsearch()`.
+ *
+ * @return: >0 if *h128_1 > *h128_2
+ *          =0 if *h128_1 == *h128_2
+ *          <0 if *h128_1 < *h128_2
+ */
+XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2);
+
+
+/******* Canonical representation *******/
+typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t;
+
+
+/*!
+ * @brief Converts an @ref XXH128_hash_t to a big endian @ref XXH128_canonical_t.
+ *
+ * @param dst The @ref XXH128_canonical_t pointer to be stored to.
+ * @param hash The @ref XXH128_hash_t to be converted. + * + * @pre + * @p dst must not be `NULL`. + */ +XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash); + +/*! + * @brief Converts an @ref XXH128_canonical_t to a native @ref XXH128_hash_t. + * + * @param src The @ref XXH128_canonical_t to convert. + * + * @pre + * @p src must not be `NULL`. + * + * @return The converted hash. + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src); + + +#endif /* !XXH_NO_XXH3 */ +#endif /* XXH_NO_LONG_LONG */ + +/*! + * @} + */ +#endif /* XXHASH_H_5627135585666179 */ + + + +#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) +#define XXHASH_H_STATIC_13879238742 +/* **************************************************************************** + * This section contains declarations which are not guaranteed to remain stable. + * They may change in future versions, becoming incompatible with a different + * version of the library. + * These declarations should only be used with static linking. + * Never use them in association with dynamic linking! + ***************************************************************************** */ + +/* + * These definitions are only present to allow static allocation + * of XXH states, on stack or in a struct, for example. + * Never **ever** access their members directly. + */ + +/*! + * @internal + * @brief Structure for XXH32 streaming API. + * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is + * an opaque type. This allows fields to safely be changed. + * + * Typedef'd to @ref XXH32_state_t. + * Do not access the members of this struct directly. + * @see XXH64_state_s, XXH3_state_s + */ +struct XXH32_state_s { + XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */ + XXH32_hash_t large_len; /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */ + XXH32_hash_t v[4]; /*!< Accumulator lanes */ + XXH32_hash_t mem32[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[16]. */ + XXH32_hash_t memsize; /*!< Amount of data in @ref mem32 */ + XXH32_hash_t reserved; /*!< Reserved field. Do not read nor write to it. */ +}; /* typedef'd to XXH32_state_t */ + + +#ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */ + +/*! + * @internal + * @brief Structure for XXH64 streaming API. + * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is + * an opaque type. This allows fields to safely be changed. + * + * Typedef'd to @ref XXH64_state_t. + * Do not access the members of this struct directly. + * @see XXH32_state_s, XXH3_state_s + */ +struct XXH64_state_s { + XXH64_hash_t total_len; /*!< Total length hashed. This is always 64-bit. */ + XXH64_hash_t v[4]; /*!< Accumulator lanes */ + XXH64_hash_t mem64[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[32]. */ + XXH32_hash_t memsize; /*!< Amount of data in @ref mem64 */ + XXH32_hash_t reserved32; /*!< Reserved field, needed for padding anyways*/ + XXH64_hash_t reserved64; /*!< Reserved field. Do not read or write to it. 
 */
+}; /* typedef'd to XXH64_state_t */
+
+#ifndef XXH_NO_XXH3
+
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 */
+# include <stdalign.h>
+# define XXH_ALIGN(n) alignas(n)
+#elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */
+/* In C++ alignas() is a keyword */
+# define XXH_ALIGN(n) alignas(n)
+#elif defined(__GNUC__)
+# define XXH_ALIGN(n) __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+# define XXH_ALIGN(n) __declspec(align(n))
+#else
+# define XXH_ALIGN(n) /* disabled */
+#endif
+
+/* Old GCC versions only accept the attribute after the type in structures. */
+#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) /* C11+ */ \
+ && ! (defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \
+ && defined(__GNUC__)
+# define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
+#else
+# define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
+#endif
+
+/*!
+ * @brief The size of the internal XXH3 buffer.
+ *
+ * This is the optimal update size for incremental hashing.
+ *
+ * @see XXH3_64bits_update(), XXH3_128bits_update().
+ */
+#define XXH3_INTERNALBUFFER_SIZE 256
+
+/*!
+ * @internal
+ * @brief Default size of the secret buffer (and @ref XXH3_kSecret).
+ *
+ * This is the size used in @ref XXH3_kSecret and the seeded functions.
+ *
+ * Not to be confused with @ref XXH3_SECRET_SIZE_MIN.
+ */
+#define XXH3_SECRET_DEFAULT_SIZE 192
+
+/*!
+ * @internal
+ * @brief Structure for XXH3 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined.
+ * Otherwise it is an opaque type.
+ * Never use this definition in combination with a dynamic library.
+ * This allows fields to safely be changed in the future.
+ *
+ * @note ** This structure has a strict alignment requirement of 64 bytes!! **
+ * Do not allocate this with `malloc()` or `new`;
+ * it will not be sufficiently aligned.
+ * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation.
+ *
+ * Typedef'd to @ref XXH3_state_t.
+ * Never access the members of this struct directly.
+ *
+ * @see XXH3_INITSTATE() for stack initialization.
+ * @see XXH3_createState(), XXH3_freeState().
+ * @see XXH32_state_s, XXH64_state_s
+ */
+struct XXH3_state_s {
+ XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
+ /*!< The 8 accumulators. See @ref XXH32_state_s::v and @ref XXH64_state_s::v */
+ XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
+ /*!< Used to store a custom secret generated from a seed. */
+ XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
+ /*!< The internal buffer. @see XXH32_state_s::mem32 */
+ XXH32_hash_t bufferedSize;
+ /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */
+ XXH32_hash_t useSeed;
+ /*!< Reserved field. Needed for padding on 64-bit. */
+ size_t nbStripesSoFar;
+ /*!< Number of stripes processed. */
+ XXH64_hash_t totalLen;
+ /*!< Total length hashed. 64-bit even on 32-bit targets. */
+ size_t nbStripesPerBlock;
+ /*!< Number of stripes per block. */
+ size_t secretLimit;
+ /*!< Size of @ref customSecret or @ref extSecret */
+ XXH64_hash_t seed;
+ /*!< Seed for _withSeed variants. Must be zero otherwise, @see XXH3_INITSTATE() */
+ XXH64_hash_t reserved64;
+ /*!< Reserved field. */
+ const unsigned char* extSecret;
+ /*!< Reference to an external secret for the _withSecret variants, NULL
+ * for other variants.
*/ + /* note: there may be some padding at the end due to alignment on 64 bytes */ +}; /* typedef'd to XXH3_state_t */ + +#undef XXH_ALIGN_MEMBER + +/*! + * @brief Initializes a stack-allocated `XXH3_state_s`. + * + * When the @ref XXH3_state_t structure is merely emplaced on stack, + * it should be initialized with XXH3_INITSTATE() or a memset() + * in case its first reset uses XXH3_NNbits_reset_withSeed(). + * This init can be omitted if the first reset uses default or _withSecret mode. + * This operation isn't necessary when the state is created with XXH3_createState(). + * Note that this doesn't prepare the state for a streaming operation, + * it's still necessary to use XXH3_NNbits_reset*() afterwards. + */ +#define XXH3_INITSTATE(XXH3_state_ptr) \ + do { \ + XXH3_state_t* tmp_xxh3_state_ptr = (XXH3_state_ptr); \ + tmp_xxh3_state_ptr->seed = 0; \ + tmp_xxh3_state_ptr->extSecret = NULL; \ + } while(0) + + +/*! + * simple alias to pre-selected XXH3_128bits variant + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed); + + +/* === Experimental API === */ +/* Symbols defined below must be considered tied to a specific library version. */ + +/*! + * XXH3_generateSecret(): + * + * Derive a high-entropy secret from any user-defined content, named customSeed. + * The generated secret can be used in combination with `*_withSecret()` functions. + * The `_withSecret()` variants are useful to provide a higher level of protection + * than 64-bit seed, as it becomes much more difficult for an external actor to + * guess how to impact the calculation logic. + * + * The function accepts as input a custom seed of any length and any content, + * and derives from it a high-entropy secret of length @p secretSize into an + * already allocated buffer @p secretBuffer. + * + * The generated secret can then be used with any `*_withSecret()` variant. + * The functions @ref XXH3_128bits_withSecret(), @ref XXH3_64bits_withSecret(), + * @ref XXH3_128bits_reset_withSecret() and @ref XXH3_64bits_reset_withSecret() + * are part of this list. They all accept a `secret` parameter + * which must be large enough for implementation reasons (>= @ref XXH3_SECRET_SIZE_MIN) + * _and_ feature very high entropy (consist of random-looking bytes). + * These conditions can be a high bar to meet, so @ref XXH3_generateSecret() can + * be employed to ensure proper quality. + * + * @p customSeed can be anything. It can have any size, even small ones, + * and its content can be anything, even "poor entropy" sources such as a bunch + * of zeroes. The resulting `secret` will nonetheless provide all required qualities. + * + * @pre + * - @p secretSize must be >= @ref XXH3_SECRET_SIZE_MIN + * - When @p customSeedSize > 0, supplying NULL as customSeed is undefined behavior. + * + * Example code: + * @code{.c} + * #include <stdio.h> + * #include <stdlib.h> + * #include <string.h> + * #define XXH_STATIC_LINKING_ONLY // expose unstable API + * #include "xxhash.h" + * // Hashes argv[2] using the entropy from argv[1]. 
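+ * // argv[1] may have any length and any content (even low entropy);
+ * // the derived secret will still have the required quality.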
+ * int main(int argc, char* argv[])
+ * {
+ *     char secret[XXH3_SECRET_SIZE_MIN];
+ *     if (argc != 3) { return 1; }
+ *     XXH3_generateSecret(secret, sizeof(secret), argv[1], strlen(argv[1]));
+ *     XXH64_hash_t h = XXH3_64bits_withSecret(
+ *          argv[2], strlen(argv[2]),
+ *          secret, sizeof(secret)
+ *     );
+ *     printf("%016llx\n", (unsigned long long) h);
+ * }
+ * @endcode
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize);
+
+/*!
+ * @brief Generates the same secret as the _withSeed() variants.
+ *
+ * The generated secret can be used in combination with
+ * `*_withSecret()` and `*_withSecretandSeed()` variants.
+ *
+ * Example C++ `std::string` hash class:
+ * @code{.cpp}
+ * #include <string>
+ * #define XXH_STATIC_LINKING_ONLY // expose unstable API
+ * #include "xxhash.h"
+ * // Slow, seeds each time
+ * class HashSlow {
+ *     XXH64_hash_t seed;
+ * public:
+ *     HashSlow(XXH64_hash_t s) : seed{s} {}
+ *     size_t operator()(const std::string& x) const {
+ *         return size_t{XXH3_64bits_withSeed(x.c_str(), x.length(), seed)};
+ *     }
+ * };
+ * // Fast, caches the seeded secret for future uses.
+ * class HashFast {
+ *     unsigned char secret[XXH3_SECRET_DEFAULT_SIZE];
+ * public:
+ *     HashFast(XXH64_hash_t s) {
+ *         XXH3_generateSecret_fromSeed(secret, s);
+ *     }
+ *     size_t operator()(const std::string& x) const {
+ *         return size_t{
+ *             XXH3_64bits_withSecret(x.c_str(), x.length(), secret, sizeof(secret))
+ *         };
+ *     }
+ * };
+ * @endcode
+ * @param secretBuffer A writable buffer of @ref XXH3_SECRET_DEFAULT_SIZE bytes
+ * @param seed The 64-bit seed to derive the secret from.
+ */
+XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed);
+
+/*!
+ * These variants generate hash values using either
+ * @p seed for "short" keys (< XXH3_MIDSIZE_MAX = 240 bytes)
+ * or @p secret for "large" keys (>= XXH3_MIDSIZE_MAX).
+ *
+ * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`.
+ * `_withSeed()` has to generate the secret on the fly for "large" keys.
+ * It's fast, but the cost can be perceptible for "not so large" keys (< 1 KB).
+ * `_withSecret()` has to generate the masks on the fly for "small" keys,
+ * which requires more instructions than the `_withSeed()` variants.
+ * Therefore, the `_withSecretandSeed()` variant combines the best of both worlds.
+ *
+ * When @p secret has been generated by XXH3_generateSecret_fromSeed(),
+ * this variant produces *exactly* the same results as the `_withSeed()` variant,
+ * hence offering only a pure speed benefit on "large" input,
+ * by skipping the need to regenerate the secret for every large input.
+ *
+ * Another usage scenario is to hash the secret to a 64-bit hash value,
+ * for example with XXH3_64bits(), which then becomes the seed,
+ * and then employ both the seed and the secret in _withSecretandSeed().
+ * On top of speed, an added benefit is that each bit in the secret
+ * has a 50% chance to flip each bit of the output, via its impact on the seed.
+ *
+ * This is not guaranteed when using the secret directly in "small data" scenarios,
+ * because only portions of the secret are employed for small data.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t
+XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* data, size_t len,
+ XXH_NOESCAPE const void* secret, size_t secretSize,
+ XXH64_hash_t seed);
+/*! @copydoc XXH3_64bits_withSecretandSeed() */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t
+XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length,
+ XXH_NOESCAPE const void* secret, size_t secretSize,
+ XXH64_hash_t seed64);
+#ifndef XXH_NO_STREAM
+/*! @copydoc XXH3_64bits_withSecretandSeed() */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
+ XXH_NOESCAPE const void* secret, size_t secretSize,
+ XXH64_hash_t seed64);
+/*! @copydoc XXH3_64bits_withSecretandSeed() */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
+ XXH_NOESCAPE const void* secret, size_t secretSize,
+ XXH64_hash_t seed64);
+#endif /* !XXH_NO_STREAM */
+
+#endif /* !XXH_NO_XXH3 */
+#endif /* XXH_NO_LONG_LONG */
+#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
+# define XXH_IMPLEMENTATION
+#endif
+
+#endif /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */
+
+
+/* ======================================================================== */
+/* ======================================================================== */
+/* ======================================================================== */
+
+
+/*-**********************************************************************
+ * xxHash implementation
+ *-**********************************************************************
+ * xxHash's implementation used to be hosted inside xxhash.c.
+ *
+ * However, inlining requires the implementation to be visible to the
+ * compiler, hence it must be included alongside the header.
+ * Previously, the implementation was hosted inside xxhash.c,
+ * which was then #included when inlining was activated.
+ * This construction created issues with a few build and install systems,
+ * as it required xxhash.c to be stored in the /include directory.
+ *
+ * The xxHash implementation is now directly integrated within xxhash.h.
+ * As a consequence, xxhash.c is no longer needed in /include.
+ *
+ * xxhash.c is still available and is still useful.
+ * In a "normal" setup, when xxhash is not inlined,
+ * xxhash.h only exposes the prototypes and public symbols,
+ * while xxhash.c can be built into an object file xxhash.o
+ * which can then be linked into the final binary.
+ ************************************************************************/
+
+#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \
+ || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387)
+# define XXH_IMPLEM_13a8737387
+
+/* *************************************
+* Tuning parameters
+***************************************/
+
+/*!
+ * @defgroup tuning Tuning parameters
+ * @{
+ *
+ * Various macros to control xxHash's behavior.
+ */
+#ifdef XXH_DOXYGEN
+/*!
+ * @brief Define this to disable 64-bit code.
+ *
+ * Useful if only using the @ref XXH32_family and you have a strict C90 compiler.
+ */
+# define XXH_NO_LONG_LONG
+# undef XXH_NO_LONG_LONG /* don't actually */
+/*!
+ * @brief Controls how unaligned memory is accessed.
+ *
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is
+ * safe and portable.
+ *
+ * Unfortunately, on some target/compiler combinations, the generated assembly
+ * is sub-optimal.
+ *
+ * The switch below allows selection of a different access method
+ * in the search for improved performance.
+ *
+ * @par Possible options:
+ *
+ * - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy`
+ * @par
+ * Use `memcpy()`. Safe and portable. Note that most modern compilers will
+ * eliminate the function call and treat it as an unaligned access.
+ *
+ * - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((aligned(1)))`
+ * @par
+ * Depends on compiler extensions and is therefore not portable.
+ * This method is safe _if_ your compiler supports it,
+ * and *generally* as fast or faster than `memcpy`.
+ *
+ * - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast
+ * @par
+ * Casts directly and dereferences. This method doesn't depend on the
+ * compiler, but it violates the C standard as it directly dereferences an
+ * unaligned pointer. It can generate buggy code on targets which do not
+ * support unaligned memory accesses, but in some circumstances, it's the
+ * only known way to get the most performance.
+ *
+ * - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift
+ * @par
+ * Also portable. This can generate the best code on old compilers which don't
+ * inline small `memcpy()` calls, and it might also be faster on big-endian
+ * systems which lack a native byteswap instruction. However, some compilers
+ * will emit literal byteshifts even if the target supports unaligned access.
+ *
+ *
+ * @warning
+ * Methods 1 and 2 rely on implementation-defined behavior. Use these with
+ * care, as what works on one compiler/platform/optimization level may cause
+ * another to read garbage data or even crash.
+ *
+ * See https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details.
+ *
+ * Prefer these methods in priority order (0 > 3 > 1 > 2).
+ */
+# define XXH_FORCE_MEMORY_ACCESS 0
+
+/*!
+ * @def XXH_SIZE_OPT
+ * @brief Controls how much xxHash optimizes for size.
+ *
+ * xxHash, when compiled, tends to result in a rather large binary size. This
+ * is mostly due to heavy usage of forced inlining and constant folding of the
+ * @ref XXH3_family to increase performance.
+ *
+ * However, some developers prefer size over speed. This option can
+ * significantly reduce the size of the generated code. When using the `-Os`
+ * or `-Oz` options on GCC or Clang, this is defined to 1 by default,
+ * otherwise it is defined to 0.
+ *
+ * Most of these size optimizations can be controlled manually.
+ *
+ * This is a number from 0-2.
+ * - `XXH_SIZE_OPT` == 0: Default. xxHash makes no size optimizations. Speed
+ *   comes first.
+ * - `XXH_SIZE_OPT` == 1: Default for `-Os` and `-Oz`. xxHash is more
+ *   conservative and disables hacks that increase code size. It implies the
+ *   options @ref XXH_NO_INLINE_HINTS == 1, @ref XXH_FORCE_ALIGN_CHECK == 0,
+ *   and @ref XXH3_NEON_LANES == 8 if they are not already defined.
+ * - `XXH_SIZE_OPT` == 2: xxHash tries to make itself as small as possible.
+ *   Performance may cry. For example, the single shot functions just use the
+ *   streaming API.
+ */
+# define XXH_SIZE_OPT 0
+
+/*!
+ * @def XXH_FORCE_ALIGN_CHECK
+ * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32()
+ * and XXH64() only).
+ *
+ * This is an important performance trick for architectures without decent
+ * unaligned memory access performance.
+ *
+ * It checks for input alignment, and when conditions are met, uses a "fast
+ * path" employing direct 32-bit/64-bit reads, resulting in _dramatically
+ * faster_ read speed.
+ *
+ * The check costs one initial branch per hash, which is generally negligible,
+ * but not zero.
+ *
+ * Moreover, it's not useful to generate an additional code path if memory
+ * access uses the same instruction for both aligned and unaligned
+ * addresses (e.g. x86 and aarch64).
+ * + * In these cases, the alignment check can be removed by setting this macro to 0. + * Then the code will always use unaligned memory access. + * Align check is automatically disabled on x86, x64, ARM64, and some ARM chips + * which are platforms known to offer good unaligned memory accesses performance. + * + * It is also disabled by default when @ref XXH_SIZE_OPT >= 1. + * + * This option does not affect XXH3 (only XXH32 and XXH64). + */ +# define XXH_FORCE_ALIGN_CHECK 0 + +/*! + * @def XXH_NO_INLINE_HINTS + * @brief When non-zero, sets all functions to `static`. + * + * By default, xxHash tries to force the compiler to inline almost all internal + * functions. + * + * This can usually improve performance due to reduced jumping and improved + * constant folding, but significantly increases the size of the binary which + * might not be favorable. + * + * Additionally, sometimes the forced inlining can be detrimental to performance, + * depending on the architecture. + * + * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the + * compiler full control on whether to inline or not. + * + * When not optimizing (-O0), using `-fno-inline` with GCC or Clang, or if + * @ref XXH_SIZE_OPT >= 1, this will automatically be defined. + */ +# define XXH_NO_INLINE_HINTS 0 + +/*! + * @def XXH3_INLINE_SECRET + * @brief Determines whether to inline the XXH3 withSecret code. + * + * When the secret size is known, the compiler can improve the performance + * of XXH3_64bits_withSecret() and XXH3_128bits_withSecret(). + * + * However, if the secret size is not known, it doesn't have any benefit. This + * happens when xxHash is compiled into a global symbol. Therefore, if + * @ref XXH_INLINE_ALL is *not* defined, this will be defined to 0. + * + * Additionally, this defaults to 0 on GCC 12+, which has an issue with function pointers + * that are *sometimes* force inline on -Og, and it is impossible to automatically + * detect this optimization level. + */ +# define XXH3_INLINE_SECRET 0 + +/*! + * @def XXH32_ENDJMP + * @brief Whether to use a jump for `XXH32_finalize`. + * + * For performance, `XXH32_finalize` uses multiple branches in the finalizer. + * This is generally preferable for performance, + * but depending on exact architecture, a jmp may be preferable. + * + * This setting is only possibly making a difference for very small inputs. + */ +# define XXH32_ENDJMP 0 + +/*! + * @internal + * @brief Redefines old internal names. + * + * For compatibility with code that uses xxHash's internals before the names + * were changed to improve namespacing. There is no other reason to use this. + */ +# define XXH_OLD_NAMES +# undef XXH_OLD_NAMES /* don't actually use, it is ugly. */ + +/*! + * @def XXH_NO_STREAM + * @brief Disables the streaming API. + * + * When xxHash is not inlined and the streaming functions are not used, disabling + * the streaming functions can improve code size significantly, especially with + * the @ref XXH3_family which tends to make constant folded copies of itself. + */ +# define XXH_NO_STREAM +# undef XXH_NO_STREAM /* don't actually */ +#endif /* XXH_DOXYGEN */ +/*! + * @} + */ + +#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ + /* prefer __packed__ structures (method 1) for GCC + * < ARMv7 with unaligned access (e.g. Raspbian armhf) still uses byte shifting, so we use memcpy + * which for some reason does unaligned loads. 
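+ * (As noted above, the method can also be pinned from the compiler command
+ * line, e.g. a hypothetical build line: `cc -DXXH_FORCE_MEMORY_ACCESS=3 -c xxhash.c`.)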
 */
+# if defined(__GNUC__) && !(defined(__ARM_ARCH) && __ARM_ARCH < 7 && defined(__ARM_FEATURE_UNALIGNED))
+# define XXH_FORCE_MEMORY_ACCESS 1
+# endif
+#endif
+
+#ifndef XXH_SIZE_OPT
+ /* default to 1 for -Os or -Oz */
+# if (defined(__GNUC__) || defined(__clang__)) && defined(__OPTIMIZE_SIZE__)
+# define XXH_SIZE_OPT 1
+# else
+# define XXH_SIZE_OPT 0
+# endif
+#endif
+
+#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */
+ /* don't check on sizeopt, x86, aarch64, or arm when unaligned access is available */
+# if XXH_SIZE_OPT >= 1 || \
+ defined(__i386) || defined(__x86_64__) || defined(__aarch64__) || defined(__ARM_FEATURE_UNALIGNED) \
+ || defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM) /* visual */
+# define XXH_FORCE_ALIGN_CHECK 0
+# else
+# define XXH_FORCE_ALIGN_CHECK 1
+# endif
+#endif
+
+#ifndef XXH_NO_INLINE_HINTS
+# if XXH_SIZE_OPT >= 1 || defined(__NO_INLINE__) /* -O0, -fno-inline */
+# define XXH_NO_INLINE_HINTS 1
+# else
+# define XXH_NO_INLINE_HINTS 0
+# endif
+#endif
+
+#ifndef XXH3_INLINE_SECRET
+# if (defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 12) \
+ || !defined(XXH_INLINE_ALL)
+# define XXH3_INLINE_SECRET 0
+# else
+# define XXH3_INLINE_SECRET 1
+# endif
+#endif
+
+#ifndef XXH32_ENDJMP
+/* generally preferable for performance */
+# define XXH32_ENDJMP 0
+#endif
+
+/*!
+ * @defgroup impl Implementation
+ * @{
+ */
+
+
+/* *************************************
+* Includes & Memory related functions
+***************************************/
+#if defined(XXH_NO_STREAM)
+/* nothing */
+#elif defined(XXH_NO_STDLIB)
+
+/* When requesting to disable any mention of stdlib,
+ * the library loses the ability to invoke malloc / free.
+ * In practice, it means that functions like `XXH*_createState()`
+ * will always fail and return NULL.
+ * This flag is useful in situations where
+ * xxhash.h is integrated into a kernel, embedded, or otherwise limited
+ * environment without access to dynamic allocation.
+ */
+
+static XXH_CONSTF void* XXH_malloc(size_t s) { (void)s; return NULL; }
+static void XXH_free(void* p) { (void)p; }
+
+#else
+
+/*
+ * Modify the local functions below should you wish to use
+ * different memory routines for malloc() and free().
+ */
+#include <stdlib.h>
+
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than malloc().
+ */
+static XXH_MALLOCF void* XXH_malloc(size_t s) { return malloc(s); }
+
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than free().
+ */
+static void XXH_free(void* p) { free(p); }
+
+#endif /* XXH_NO_STDLIB */
+
+#include <string.h>
+
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than memcpy().
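+ *
+ * For instance (a sketch, not part of the library), a freestanding build
+ * might route it through a compiler builtin:
+ * @code{.c}
+ * static void* XXH_memcpy(void* dest, const void* src, size_t size)
+ * {
+ *     return __builtin_memcpy(dest, src, size); // GCC/Clang builtin
+ * }
+ * @endcode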
+ */ +static void* XXH_memcpy(void* dest, const void* src, size_t size) +{ + return memcpy(dest,src,size); +} + +#include <limits.h> /* ULLONG_MAX */ + + +/* ************************************* +* Compiler Specific Options +***************************************/ +#ifdef _MSC_VER /* Visual Studio warning fix */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +#endif + +#if XXH_NO_INLINE_HINTS /* disable inlining hints */ +# if defined(__GNUC__) || defined(__clang__) +# define XXH_FORCE_INLINE static __attribute__((unused)) +# else +# define XXH_FORCE_INLINE static +# endif +# define XXH_NO_INLINE static +/* enable inlining hints */ +#elif defined(__GNUC__) || defined(__clang__) +# define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused)) +# define XXH_NO_INLINE static __attribute__((noinline)) +#elif defined(_MSC_VER) /* Visual Studio */ +# define XXH_FORCE_INLINE static __forceinline +# define XXH_NO_INLINE static __declspec(noinline) +#elif defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* C99 */ +# define XXH_FORCE_INLINE static inline +# define XXH_NO_INLINE static +#else +# define XXH_FORCE_INLINE static +# define XXH_NO_INLINE static +#endif + +#if XXH3_INLINE_SECRET +# define XXH3_WITH_SECRET_INLINE XXH_FORCE_INLINE +#else +# define XXH3_WITH_SECRET_INLINE XXH_NO_INLINE +#endif + + +/* ************************************* +* Debug +***************************************/ +/*! + * @ingroup tuning + * @def XXH_DEBUGLEVEL + * @brief Sets the debugging level. + * + * XXH_DEBUGLEVEL is expected to be defined externally, typically via the + * compiler's command line options. The value must be a number. + */ +#ifndef XXH_DEBUGLEVEL +# ifdef DEBUGLEVEL /* backwards compat */ +# define XXH_DEBUGLEVEL DEBUGLEVEL +# else +# define XXH_DEBUGLEVEL 0 +# endif +#endif + +#if (XXH_DEBUGLEVEL>=1) +# include <assert.h> /* note: can still be disabled with NDEBUG */ +# define XXH_ASSERT(c) assert(c) +#else +# if defined(__INTEL_COMPILER) +# define XXH_ASSERT(c) XXH_ASSUME((unsigned char) (c)) +# else +# define XXH_ASSERT(c) XXH_ASSUME(c) +# endif +#endif + +/* note: use after variable declarations */ +#ifndef XXH_STATIC_ASSERT +# if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11 */ +# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { _Static_assert((c),m); } while(0) +# elif defined(__cplusplus) && (__cplusplus >= 201103L) /* C++11 */ +# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0) +# else +# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { struct xxh_sa { char x[(c) ? 1 : -1]; }; } while(0) +# endif +# define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c),#c) +#endif + +/*! + * @internal + * @def XXH_COMPILER_GUARD(var) + * @brief Used to prevent unwanted optimizations for @p var. + * + * It uses an empty GCC inline assembly statement with a register constraint + * which forces @p var into a general purpose register (eg eax, ebx, ecx + * on x86) and marks it as modified. + * + * This is used in a few places to avoid unwanted autovectorization (e.g. + * XXH32_round()). All vectorization we want is explicit via intrinsics, + * and _usually_ isn't wanted elsewhere. + * + * We also use it to prevent unwanted constant folding for AArch64 in + * XXH3_initCustomSecret_scalar(). 
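+ *
+ * Representative use, mirroring XXH32_round() further below:
+ * @code{.c}
+ * acc += input * XXH_PRIME32_2;
+ * acc  = XXH_rotl32(acc, 13);
+ * acc *= XXH_PRIME32_1;
+ * XXH_COMPILER_GUARD(acc); // keep acc in a GPR so the loop is not vectorized
+ * @endcode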
+ */ +#if defined(__GNUC__) || defined(__clang__) +# define XXH_COMPILER_GUARD(var) __asm__("" : "+r" (var)) +#else +# define XXH_COMPILER_GUARD(var) ((void)0) +#endif + +/* Specifically for NEON vectors which use the "w" constraint, on + * Clang. */ +#if defined(__clang__) && defined(__ARM_ARCH) && !defined(__wasm__) +# define XXH_COMPILER_GUARD_CLANG_NEON(var) __asm__("" : "+w" (var)) +#else +# define XXH_COMPILER_GUARD_CLANG_NEON(var) ((void)0) +#endif + +/* ************************************* +* Basic Types +***************************************/ +#if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include <stdint.h> + typedef uint8_t xxh_u8; +#else + typedef unsigned char xxh_u8; +#endif +typedef XXH32_hash_t xxh_u32; + +#ifdef XXH_OLD_NAMES +# warning "XXH_OLD_NAMES is planned to be removed starting v0.9. If the program depends on it, consider moving away from it by employing newer type names directly" +# define BYTE xxh_u8 +# define U8 xxh_u8 +# define U32 xxh_u32 +#endif + +/* *** Memory access *** */ + +/*! + * @internal + * @fn xxh_u32 XXH_read32(const void* ptr) + * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * + * @param ptr The pointer to read from. + * @return The 32-bit native endian integer from the bytes at @p ptr. + */ + +/*! + * @internal + * @fn xxh_u32 XXH_readLE32(const void* ptr) + * @brief Reads an unaligned 32-bit little endian integer from @p ptr. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * + * @param ptr The pointer to read from. + * @return The 32-bit little endian integer from the bytes at @p ptr. + */ + +/*! + * @internal + * @fn xxh_u32 XXH_readBE32(const void* ptr) + * @brief Reads an unaligned 32-bit big endian integer from @p ptr. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * + * @param ptr The pointer to read from. + * @return The 32-bit big endian integer from the bytes at @p ptr. + */ + +/*! + * @internal + * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align) + * @brief Like @ref XXH_readLE32(), but has an option for aligned reads. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is + * always @ref XXH_alignment::XXH_unaligned. + * + * @param ptr The pointer to read from. + * @param align Whether @p ptr is aligned. + * @pre + * If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte + * aligned. + * @return The 32-bit little endian integer from the bytes at @p ptr. + */ + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) +/* + * Manual byteshift. Best for old compilers which don't inline memcpy. + * We actually directly use XXH_readLE32 and XXH_readBE32. + */ +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* + * Force direct memory access. Only works on CPU which support unaligned memory + * access in hardware. + */ +static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* + * __attribute__((aligned(1))) is supported by gcc and clang. Originally the + * documentation claimed that it only increased the alignment, but actually it + * can decrease it on gcc, clang, and icc: + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502, + * https://gcc.godbolt.org/z/xYez1j67Y. 
+ */ +#ifdef XXH_OLD_NAMES +typedef union { xxh_u32 u32; } __attribute__((packed)) unalign; +#endif +static xxh_u32 XXH_read32(const void* ptr) +{ + typedef __attribute__((aligned(1))) xxh_u32 xxh_unalign32; + return *((const xxh_unalign32*)ptr); +} + +#else + +/* + * Portable and safe solution. Generally efficient. + * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html + */ +static xxh_u32 XXH_read32(const void* memPtr) +{ + xxh_u32 val; + XXH_memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + + +/* *** Endianness *** */ + +/*! + * @ingroup tuning + * @def XXH_CPU_LITTLE_ENDIAN + * @brief Whether the target is little endian. + * + * Defined to 1 if the target is little endian, or 0 if it is big endian. + * It can be defined externally, for example on the compiler command line. + * + * If it is not defined, + * a runtime check (which is usually constant folded) is used instead. + * + * @note + * This is not necessarily defined to an integer constant. + * + * @see XXH_isLittleEndian() for the runtime check. + */ +#ifndef XXH_CPU_LITTLE_ENDIAN +/* + * Try to detect endianness automatically, to avoid the nonstandard behavior + * in `XXH_isLittleEndian()` + */ +# if defined(_WIN32) /* Windows is always little endian */ \ + || defined(__LITTLE_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 1 +# elif defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 0 +# else +/*! + * @internal + * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN. + * + * Most compilers will constant fold this. + */ +static int XXH_isLittleEndian(void) +{ + /* + * Portable and well-defined behavior. + * Don't use static: it is detrimental to performance. + */ + const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 }; + return one.c[0]; +} +# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian() +# endif +#endif + + + + +/* **************************************** +* Compiler-specific Functions and Macros +******************************************/ +#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +#ifdef __has_builtin +# define XXH_HAS_BUILTIN(x) __has_builtin(x) +#else +# define XXH_HAS_BUILTIN(x) 0 +#endif + + + +/* + * C23 and future versions have standard "unreachable()". + * Once it has been implemented reliably we can add it as an + * additional case: + * + * ``` + * #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN) + * # include <stddef.h> + * # ifdef unreachable + * # define XXH_UNREACHABLE() unreachable() + * # endif + * #endif + * ``` + * + * Note C++23 also has std::unreachable() which can be detected + * as follows: + * ``` + * #if defined(__cpp_lib_unreachable) && (__cpp_lib_unreachable >= 202202L) + * # include <utility> + * # define XXH_UNREACHABLE() std::unreachable() + * #endif + * ``` + * NB: `__cpp_lib_unreachable` is defined in the `<version>` header. + * We don't use that as including `<utility>` in `extern "C"` blocks + * doesn't work on GCC12 + */ + +#if XXH_HAS_BUILTIN(__builtin_unreachable) +# define XXH_UNREACHABLE() __builtin_unreachable() + +#elif defined(_MSC_VER) +# define XXH_UNREACHABLE() __assume(0) + +#else +# define XXH_UNREACHABLE() +#endif + +#if XXH_HAS_BUILTIN(__builtin_assume) +# define XXH_ASSUME(c) __builtin_assume(c) +#else +# define XXH_ASSUME(c) if (!(c)) { XXH_UNREACHABLE(); } +#endif + +/*! 
+ * @internal + * @def XXH_rotl32(x,r) + * @brief 32-bit rotate left. + * + * @param x The 32-bit integer to be rotated. + * @param r The number of bits to rotate. + * @pre + * @p r > 0 && @p r < 32 + * @note + * @p x and @p r may be evaluated multiple times. + * @return The rotated result. + */ +#if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \ + && XXH_HAS_BUILTIN(__builtin_rotateleft64) +# define XXH_rotl32 __builtin_rotateleft32 +# define XXH_rotl64 __builtin_rotateleft64 +/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */ +#elif defined(_MSC_VER) +# define XXH_rotl32(x,r) _rotl(x,r) +# define XXH_rotl64(x,r) _rotl64(x,r) +#else +# define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) +# define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r)))) +#endif + +/*! + * @internal + * @fn xxh_u32 XXH_swap32(xxh_u32 x) + * @brief A 32-bit byteswap. + * + * @param x The 32-bit integer to byteswap. + * @return @p x, byteswapped. + */ +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap32 _byteswap_ulong +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +#else +static xxh_u32 XXH_swap32 (xxh_u32 x) +{ + return ((x << 24) & 0xff000000 ) | + ((x << 8) & 0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff ); +} +#endif + + +/* *************************** +* Memory reads +*****************************/ + +/*! + * @internal + * @brief Enum to indicate whether a pointer is aligned. + */ +typedef enum { + XXH_aligned, /*!< Aligned */ + XXH_unaligned /*!< Possibly unaligned */ +} XXH_alignment; + +/* + * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. + * + * This is ideal for older compilers which don't inline memcpy. + */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u32)bytePtr[1] << 8) + | ((xxh_u32)bytePtr[2] << 16) + | ((xxh_u32)bytePtr[3] << 24); +} + +XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[3] + | ((xxh_u32)bytePtr[2] << 8) + | ((xxh_u32)bytePtr[1] << 16) + | ((xxh_u32)bytePtr[0] << 24); +} + +#else +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); +} + +static xxh_u32 XXH_readBE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u32 +XXH_readLE32_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) { + return XXH_readLE32(ptr); + } else { + return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr); + } +} + + +/* ************************************* +* Misc +***************************************/ +/*! @ingroup public */ +XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; } + + +/* ******************************************************************* +* 32-bit hash functions +*********************************************************************/ +/*! + * @} + * @defgroup XXH32_impl XXH32 implementation + * @ingroup impl + * + * Details on the XXH32 implementation. 
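+ *
+ * In outline: inputs of 16 bytes or more are consumed four lanes at a time
+ * by XXH32_round(), the lanes are merged with rotations, the total length
+ * and any trailing bytes are folded in by XXH32_finalize(), and
+ * XXH32_avalanche() performs the final bit mixing.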
+ * @{ + */ + /* #define instead of static const, to be used as initializers */ +#define XXH_PRIME32_1 0x9E3779B1U /*!< 0b10011110001101110111100110110001 */ +#define XXH_PRIME32_2 0x85EBCA77U /*!< 0b10000101111010111100101001110111 */ +#define XXH_PRIME32_3 0xC2B2AE3DU /*!< 0b11000010101100101010111000111101 */ +#define XXH_PRIME32_4 0x27D4EB2FU /*!< 0b00100111110101001110101100101111 */ +#define XXH_PRIME32_5 0x165667B1U /*!< 0b00010110010101100110011110110001 */ + +#ifdef XXH_OLD_NAMES +# define PRIME32_1 XXH_PRIME32_1 +# define PRIME32_2 XXH_PRIME32_2 +# define PRIME32_3 XXH_PRIME32_3 +# define PRIME32_4 XXH_PRIME32_4 +# define PRIME32_5 XXH_PRIME32_5 +#endif + +/*! + * @internal + * @brief Normal stripe processing routine. + * + * This shuffles the bits so that any bit from @p input impacts several bits in + * @p acc. + * + * @param acc The accumulator lane. + * @param input The stripe of input to mix. + * @return The mixed accumulator lane. + */ +static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) +{ + acc += input * XXH_PRIME32_2; + acc = XXH_rotl32(acc, 13); + acc *= XXH_PRIME32_1; +#if (defined(__SSE4_1__) || defined(__aarch64__) || defined(__wasm_simd128__)) && !defined(XXH_ENABLE_AUTOVECTORIZE) + /* + * UGLY HACK: + * A compiler fence is the only thing that prevents GCC and Clang from + * autovectorizing the XXH32 loop (pragmas and attributes don't work for some + * reason) without globally disabling SSE4.1. + * + * The reason we want to avoid vectorization is because despite working on + * 4 integers at a time, there are multiple factors slowing XXH32 down on + * SSE4: + * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on + * newer chips!) making it slightly slower to multiply four integers at + * once compared to four integers independently. Even when pmulld was + * fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE + * just to multiply unless doing a long operation. + * + * - Four instructions are required to rotate, + * movqda tmp, v // not required with VEX encoding + * pslld tmp, 13 // tmp <<= 13 + * psrld v, 19 // x >>= 19 + * por v, tmp // x |= tmp + * compared to one for scalar: + * roll v, 13 // reliably fast across the board + * shldl v, v, 13 // Sandy Bridge and later prefer this for some reason + * + * - Instruction level parallelism is actually more beneficial here because + * the SIMD actually serializes this operation: While v1 is rotating, v2 + * can load data, while v3 can multiply. SSE forces them to operate + * together. + * + * This is also enabled on AArch64, as Clang is *very aggressive* in vectorizing + * the loop. NEON is only faster on the A53, and with the newer cores, it is less + * than half the speed. + * + * Additionally, this is used on WASM SIMD128 because it JITs to the same + * SIMD instructions and has the same issue. + */ + XXH_COMPILER_GUARD(acc); +#endif + return acc; +} + +/*! + * @internal + * @brief Mixes all bits to finalize the hash. + * + * The final mix ensures that all input bits have a chance to impact any bit in + * the output digest, resulting in an unbiased distribution. + * + * @param hash The hash to avalanche. + * @return The avalanched hash. + */ +static xxh_u32 XXH32_avalanche(xxh_u32 hash) +{ + hash ^= hash >> 15; + hash *= XXH_PRIME32_2; + hash ^= hash >> 13; + hash *= XXH_PRIME32_3; + hash ^= hash >> 16; + return hash; +} + +#define XXH_get32bits(p) XXH_readLE32_align(p, align) + +/*! + * @internal + * @brief Processes the last 0-15 bytes of @p ptr. 
+ * + * There may be up to 15 bytes remaining to consume from the input. + * This final stage will digest them to ensure that all input bytes are present + * in the final mix. + * + * @param hash The hash to finalize. + * @param ptr The pointer to the remaining input. + * @param len The remaining length, modulo 16. + * @param align Whether @p ptr is aligned. + * @return The finalized hash. + * @see XXH64_finalize(). + */ +static XXH_PUREF xxh_u32 +XXH32_finalize(xxh_u32 hash, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ +#define XXH_PROCESS1 do { \ + hash += (*ptr++) * XXH_PRIME32_5; \ + hash = XXH_rotl32(hash, 11) * XXH_PRIME32_1; \ +} while (0) + +#define XXH_PROCESS4 do { \ + hash += XXH_get32bits(ptr) * XXH_PRIME32_3; \ + ptr += 4; \ + hash = XXH_rotl32(hash, 17) * XXH_PRIME32_4; \ +} while (0) + + if (ptr==NULL) XXH_ASSERT(len == 0); + + /* Compact rerolled version; generally faster */ + if (!XXH32_ENDJMP) { + len &= 15; + while (len >= 4) { + XXH_PROCESS4; + len -= 4; + } + while (len > 0) { + XXH_PROCESS1; + --len; + } + return XXH32_avalanche(hash); + } else { + switch(len&15) /* or switch(bEnd - p) */ { + case 12: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 8: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 4: XXH_PROCESS4; + return XXH32_avalanche(hash); + + case 13: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 9: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 5: XXH_PROCESS4; + XXH_PROCESS1; + return XXH32_avalanche(hash); + + case 14: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 10: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 6: XXH_PROCESS4; + XXH_PROCESS1; + XXH_PROCESS1; + return XXH32_avalanche(hash); + + case 15: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 11: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 7: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 3: XXH_PROCESS1; + XXH_FALLTHROUGH; /* fallthrough */ + case 2: XXH_PROCESS1; + XXH_FALLTHROUGH; /* fallthrough */ + case 1: XXH_PROCESS1; + XXH_FALLTHROUGH; /* fallthrough */ + case 0: return XXH32_avalanche(hash); + } + XXH_ASSERT(0); + return hash; /* reaching this point is deemed impossible */ + } +} + +#ifdef XXH_OLD_NAMES +# define PROCESS1 XXH_PROCESS1 +# define PROCESS4 XXH_PROCESS4 +#else +# undef XXH_PROCESS1 +# undef XXH_PROCESS4 +#endif + +/*! + * @internal + * @brief The implementation for @ref XXH32(). + * + * @param input , len , seed Directly passed from @ref XXH32(). + * @param align Whether @p input is aligned. + * @return The calculated hash. + */ +XXH_FORCE_INLINE XXH_PUREF xxh_u32 +XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align) +{ + xxh_u32 h32; + + if (input==NULL) XXH_ASSERT(len == 0); + + if (len>=16) { + const xxh_u8* const bEnd = input + len; + const xxh_u8* const limit = bEnd - 15; + xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2; + xxh_u32 v2 = seed + XXH_PRIME32_2; + xxh_u32 v3 = seed + 0; + xxh_u32 v4 = seed - XXH_PRIME32_1; + + do { + v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4; + v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4; + v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4; + v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4; + } while (input < limit); + + h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); + } else { + h32 = seed + XXH_PRIME32_5; + } + + h32 += (xxh_u32)len; + + return XXH32_finalize(h32, input, len&15, align); +} + +/*! 
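+ * A minimal one-shot usage sketch (illustrative only; the buffer here is
+ * hypothetical):
+ * @code{.c}
+ *   const char buf[] = "hello";
+ *   XXH32_hash_t const h = XXH32(buf, sizeof(buf) - 1, 0);  // seed = 0
+ * @endcode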
@ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed) +{ +#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH32_state_t state; + XXH32_reset(&state, seed); + XXH32_update(&state, (const xxh_u8*)input, len); + return XXH32_digest(&state); +#else + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); +#endif +} + + + +/******* Hash streaming *******/ +#ifndef XXH_NO_STREAM +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) +{ + return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); +} +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) +{ + XXH_memcpy(dstState, srcState, sizeof(*dstState)); +} + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed) +{ + XXH_ASSERT(statePtr != NULL); + memset(statePtr, 0, sizeof(*statePtr)); + statePtr->v[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2; + statePtr->v[1] = seed + XXH_PRIME32_2; + statePtr->v[2] = seed + 0; + statePtr->v[3] = seed - XXH_PRIME32_1; + return XXH_OK; +} + + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH_errorcode +XXH32_update(XXH32_state_t* state, const void* input, size_t len) +{ + if (input==NULL) { + XXH_ASSERT(len == 0); + return XXH_OK; + } + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len_32 += (XXH32_hash_t)len; + state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16)); + + if (state->memsize + len < 16) { /* fill in tmp buffer */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len); + state->memsize += (XXH32_hash_t)len; + return XXH_OK; + } + + if (state->memsize) { /* some data left from previous update */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize); + { const xxh_u32* p32 = state->mem32; + state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p32)); p32++; + state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p32)); p32++; + state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p32)); p32++; + state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p32)); + } + p += 16-state->memsize; + state->memsize = 0; + } + + if (p <= bEnd-16) { + const xxh_u8* const limit = bEnd - 16; + + do { + state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p)); p+=4; + state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p)); p+=4; + state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p)); p+=4; + state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p)); p+=4; + } while (p<=limit); + + } + + if (p < bEnd) { + XXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +/*! 
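+ * A minimal streaming sketch (illustrative only; chunk1/chunk2 and their
+ * lengths are hypothetical, and error handling is omitted):
+ * @code{.c}
+ *   XXH32_state_t* const st = XXH32_createState();
+ *   XXH32_reset(st, 0);
+ *   XXH32_update(st, chunk1, chunk1_len);
+ *   XXH32_update(st, chunk2, chunk2_len);
+ *   XXH32_hash_t const h = XXH32_digest(st);
+ *   XXH32_freeState(st);
+ * @endcode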
@ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state) +{ + xxh_u32 h32; + + if (state->large_len) { + h32 = XXH_rotl32(state->v[0], 1) + + XXH_rotl32(state->v[1], 7) + + XXH_rotl32(state->v[2], 12) + + XXH_rotl32(state->v[3], 18); + } else { + h32 = state->v[2] /* == seed */ + XXH_PRIME32_5; + } + + h32 += state->total_len_32; + + return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned); +} +#endif /* !XXH_NO_STREAM */ + +/******* Canonical representation *******/ + +/*! + * @ingroup XXH32_family + * The default return values from XXH functions are unsigned 32 and 64 bit + * integers. + * + * The canonical representation uses big endian convention, the same convention + * as human-readable numbers (large digits first). + * + * This way, hash values can be written into a file or buffer, remaining + * comparable across different systems. + * + * The following functions allow transformation of hash values to and from their + * canonical format. + */ +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); + XXH_memcpy(dst, &hash, sizeof(*dst)); +} +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src) +{ + return XXH_readBE32(src); +} + + +#ifndef XXH_NO_LONG_LONG + +/* ******************************************************************* +* 64-bit hash functions +*********************************************************************/ +/*! + * @} + * @ingroup impl + * @{ + */ +/******* Memory access *******/ + +typedef XXH64_hash_t xxh_u64; + +#ifdef XXH_OLD_NAMES +# define U64 xxh_u64 +#endif + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) +/* + * Manual byteshift. Best for old compilers which don't inline memcpy. + * We actually directly use XXH_readLE64 and XXH_readBE64. + */ +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ +static xxh_u64 XXH_read64(const void* memPtr) +{ + return *(const xxh_u64*) memPtr; +} + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* + * __attribute__((aligned(1))) is supported by gcc and clang. Originally the + * documentation claimed that it only increased the alignment, but actually it + * can decrease it on gcc, clang, and icc: + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502, + * https://gcc.godbolt.org/z/xYez1j67Y. + */ +#ifdef XXH_OLD_NAMES +typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64; +#endif +static xxh_u64 XXH_read64(const void* ptr) +{ + typedef __attribute__((aligned(1))) xxh_u64 xxh_unalign64; + return *((const xxh_unalign64*)ptr); +} + +#else + +/* + * Portable and safe solution. Generally efficient. 
+ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
+ */
+static xxh_u64 XXH_read64(const void* memPtr)
+{
+ xxh_u64 val;
+ XXH_memcpy(&val, memPtr, sizeof(val));
+ return val;
+}
+
+#endif /* XXH_FORCE_MEMORY_ACCESS */
+
+#if defined(_MSC_VER) /* Visual Studio */
+# define XXH_swap64 _byteswap_uint64
+#elif XXH_GCC_VERSION >= 403
+# define XXH_swap64 __builtin_bswap64
+#else
+static xxh_u64 XXH_swap64(xxh_u64 x)
+{
+ return ((x << 56) & 0xff00000000000000ULL) |
+ ((x << 40) & 0x00ff000000000000ULL) |
+ ((x << 24) & 0x0000ff0000000000ULL) |
+ ((x << 8) & 0x000000ff00000000ULL) |
+ ((x >> 8) & 0x00000000ff000000ULL) |
+ ((x >> 24) & 0x0000000000ff0000ULL) |
+ ((x >> 40) & 0x000000000000ff00ULL) |
+ ((x >> 56) & 0x00000000000000ffULL);
+}
+#endif
+
+
+/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+
+XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr)
+{
+ const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+ return bytePtr[0]
+ | ((xxh_u64)bytePtr[1] << 8)
+ | ((xxh_u64)bytePtr[2] << 16)
+ | ((xxh_u64)bytePtr[3] << 24)
+ | ((xxh_u64)bytePtr[4] << 32)
+ | ((xxh_u64)bytePtr[5] << 40)
+ | ((xxh_u64)bytePtr[6] << 48)
+ | ((xxh_u64)bytePtr[7] << 56);
+}
+
+XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr)
+{
+ const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+ return bytePtr[7]
+ | ((xxh_u64)bytePtr[6] << 8)
+ | ((xxh_u64)bytePtr[5] << 16)
+ | ((xxh_u64)bytePtr[4] << 24)
+ | ((xxh_u64)bytePtr[3] << 32)
+ | ((xxh_u64)bytePtr[2] << 40)
+ | ((xxh_u64)bytePtr[1] << 48)
+ | ((xxh_u64)bytePtr[0] << 56);
+}
+
+#else
+XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr)
+{
+ return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
+}
+
+static xxh_u64 XXH_readBE64(const void* ptr)
+{
+ return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);
+}
+#endif
+
+XXH_FORCE_INLINE xxh_u64
+XXH_readLE64_align(const void* ptr, XXH_alignment align)
+{
+ if (align==XXH_unaligned)
+ return XXH_readLE64(ptr);
+ else
+ return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr);
+}
+
+
+/******* xxh64 *******/
+/*!
+ * @}
+ * @defgroup XXH64_impl XXH64 implementation
+ * @ingroup impl
+ *
+ * Details on the XXH64 implementation.
+ * @{
+ */
+/* #define rather than static const, to be used as initializers */
+#define XXH_PRIME64_1 0x9E3779B185EBCA87ULL /*!< 0b1001111000110111011110011011000110000101111010111100101010000111 */
+#define XXH_PRIME64_2 0xC2B2AE3D27D4EB4FULL /*!< 0b1100001010110010101011100011110100100111110101001110101101001111 */
+#define XXH_PRIME64_3 0x165667B19E3779F9ULL /*!< 0b0001011001010110011001111011000110011110001101110111100111111001 */
+#define XXH_PRIME64_4 0x85EBCA77C2B2AE63ULL /*!< 0b1000010111101011110010100111011111000010101100101010111001100011 */
+#define XXH_PRIME64_5 0x27D4EB2F165667C5ULL /*!< 0b0010011111010100111010110010111100010110010101100110011111000101 */
+
+#ifdef XXH_OLD_NAMES
+# define PRIME64_1 XXH_PRIME64_1
+# define PRIME64_2 XXH_PRIME64_2
+# define PRIME64_3 XXH_PRIME64_3
+# define PRIME64_4 XXH_PRIME64_4
+# define PRIME64_5 XXH_PRIME64_5
+#endif
+
+/*!
@copydoc XXH32_round */ +static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) +{ + acc += input * XXH_PRIME64_2; + acc = XXH_rotl64(acc, 31); + acc *= XXH_PRIME64_1; + return acc; +} + +static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val) +{ + val = XXH64_round(0, val); + acc ^= val; + acc = acc * XXH_PRIME64_1 + XXH_PRIME64_4; + return acc; +} + +/*! @copydoc XXH32_avalanche */ +static xxh_u64 XXH64_avalanche(xxh_u64 hash) +{ + hash ^= hash >> 33; + hash *= XXH_PRIME64_2; + hash ^= hash >> 29; + hash *= XXH_PRIME64_3; + hash ^= hash >> 32; + return hash; +} + + +#define XXH_get64bits(p) XXH_readLE64_align(p, align) + +/*! + * @internal + * @brief Processes the last 0-31 bytes of @p ptr. + * + * There may be up to 31 bytes remaining to consume from the input. + * This final stage will digest them to ensure that all input bytes are present + * in the final mix. + * + * @param hash The hash to finalize. + * @param ptr The pointer to the remaining input. + * @param len The remaining length, modulo 32. + * @param align Whether @p ptr is aligned. + * @return The finalized hash + * @see XXH32_finalize(). + */ +static XXH_PUREF xxh_u64 +XXH64_finalize(xxh_u64 hash, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ + if (ptr==NULL) XXH_ASSERT(len == 0); + len &= 31; + while (len >= 8) { + xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); + ptr += 8; + hash ^= k1; + hash = XXH_rotl64(hash,27) * XXH_PRIME64_1 + XXH_PRIME64_4; + len -= 8; + } + if (len >= 4) { + hash ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1; + ptr += 4; + hash = XXH_rotl64(hash, 23) * XXH_PRIME64_2 + XXH_PRIME64_3; + len -= 4; + } + while (len > 0) { + hash ^= (*ptr++) * XXH_PRIME64_5; + hash = XXH_rotl64(hash, 11) * XXH_PRIME64_1; + --len; + } + return XXH64_avalanche(hash); +} + +#ifdef XXH_OLD_NAMES +# define PROCESS1_64 XXH_PROCESS1_64 +# define PROCESS4_64 XXH_PROCESS4_64 +# define PROCESS8_64 XXH_PROCESS8_64 +#else +# undef XXH_PROCESS1_64 +# undef XXH_PROCESS4_64 +# undef XXH_PROCESS8_64 +#endif + +/*! + * @internal + * @brief The implementation for @ref XXH64(). + * + * @param input , len , seed Directly passed from @ref XXH64(). + * @param align Whether @p input is aligned. + * @return The calculated hash. + */ +XXH_FORCE_INLINE XXH_PUREF xxh_u64 +XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align) +{ + xxh_u64 h64; + if (input==NULL) XXH_ASSERT(len == 0); + + if (len>=32) { + const xxh_u8* const bEnd = input + len; + const xxh_u8* const limit = bEnd - 31; + xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2; + xxh_u64 v2 = seed + XXH_PRIME64_2; + xxh_u64 v3 = seed + 0; + xxh_u64 v4 = seed - XXH_PRIME64_1; + + do { + v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8; + v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8; + v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8; + v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8; + } while (input<limit); + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + + } else { + h64 = seed + XXH_PRIME64_5; + } + + h64 += (xxh_u64) len; + + return XXH64_finalize(h64, input, len, align); +} + + +/*! 
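+ * A minimal one-shot usage sketch (illustrative only; buf/bufSize are
+ * hypothetical):
+ * @code{.c}
+ *   XXH64_hash_t const h = XXH64(buf, bufSize, 0);  // seed = 0
+ * @endcode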
@ingroup XXH64_family */ +XXH_PUBLIC_API XXH64_hash_t XXH64 (XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed) +{ +#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH64_state_t state; + XXH64_reset(&state, seed); + XXH64_update(&state, (const xxh_u8*)input, len); + return XXH64_digest(&state); +#else + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */ + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); + +#endif +} + +/******* Hash Streaming *******/ +#ifndef XXH_NO_STREAM +/*! @ingroup XXH64_family*/ +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) +{ + return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); +} +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dstState, const XXH64_state_t* srcState) +{ + XXH_memcpy(dstState, srcState, sizeof(*dstState)); +} + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed) +{ + XXH_ASSERT(statePtr != NULL); + memset(statePtr, 0, sizeof(*statePtr)); + statePtr->v[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2; + statePtr->v[1] = seed + XXH_PRIME64_2; + statePtr->v[2] = seed + 0; + statePtr->v[3] = seed - XXH_PRIME64_1; + return XXH_OK; +} + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH_errorcode +XXH64_update (XXH_NOESCAPE XXH64_state_t* state, XXH_NOESCAPE const void* input, size_t len) +{ + if (input==NULL) { + XXH_ASSERT(len == 0); + return XXH_OK; + } + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len += len; + + if (state->memsize + len < 32) { /* fill in tmp buffer */ + XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len); + state->memsize += (xxh_u32)len; + return XXH_OK; + } + + if (state->memsize) { /* tmp buffer is full */ + XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize); + state->v[0] = XXH64_round(state->v[0], XXH_readLE64(state->mem64+0)); + state->v[1] = XXH64_round(state->v[1], XXH_readLE64(state->mem64+1)); + state->v[2] = XXH64_round(state->v[2], XXH_readLE64(state->mem64+2)); + state->v[3] = XXH64_round(state->v[3], XXH_readLE64(state->mem64+3)); + p += 32 - state->memsize; + state->memsize = 0; + } + + if (p+32 <= bEnd) { + const xxh_u8* const limit = bEnd - 32; + + do { + state->v[0] = XXH64_round(state->v[0], XXH_readLE64(p)); p+=8; + state->v[1] = XXH64_round(state->v[1], XXH_readLE64(p)); p+=8; + state->v[2] = XXH64_round(state->v[2], XXH_readLE64(p)); p+=8; + state->v[3] = XXH64_round(state->v[3], XXH_readLE64(p)); p+=8; + } while (p<=limit); + + } + + if (p < bEnd) { + XXH_memcpy(state->mem64, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +/*! 
@ingroup XXH64_family */ +XXH_PUBLIC_API XXH64_hash_t XXH64_digest(XXH_NOESCAPE const XXH64_state_t* state) +{ + xxh_u64 h64; + + if (state->total_len >= 32) { + h64 = XXH_rotl64(state->v[0], 1) + XXH_rotl64(state->v[1], 7) + XXH_rotl64(state->v[2], 12) + XXH_rotl64(state->v[3], 18); + h64 = XXH64_mergeRound(h64, state->v[0]); + h64 = XXH64_mergeRound(h64, state->v[1]); + h64 = XXH64_mergeRound(h64, state->v[2]); + h64 = XXH64_mergeRound(h64, state->v[3]); + } else { + h64 = state->v[2] /*seed*/ + XXH_PRIME64_5; + } + + h64 += (xxh_u64) state->total_len; + + return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned); +} +#endif /* !XXH_NO_STREAM */ + +/******* Canonical representation *******/ + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); + XXH_memcpy(dst, &hash, sizeof(*dst)); +} + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src) +{ + return XXH_readBE64(src); +} + +#ifndef XXH_NO_XXH3 + +/* ********************************************************************* +* XXH3 +* New generation hash designed for speed on small keys and vectorization +************************************************************************ */ +/*! + * @} + * @defgroup XXH3_impl XXH3 implementation + * @ingroup impl + * @{ + */ + +/* === Compiler specifics === */ + +#if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */ +# define XXH_RESTRICT /* disable */ +#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ +# define XXH_RESTRICT restrict +#elif (defined (__GNUC__) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \ + || (defined (__clang__)) \ + || (defined (_MSC_VER) && (_MSC_VER >= 1400)) \ + || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300)) +/* + * There are a LOT more compilers that recognize __restrict but this + * covers the major ones. + */ +# define XXH_RESTRICT __restrict +#else +# define XXH_RESTRICT /* disable */ +#endif + +#if (defined(__GNUC__) && (__GNUC__ >= 3)) \ + || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \ + || defined(__clang__) +# define XXH_likely(x) __builtin_expect(x, 1) +# define XXH_unlikely(x) __builtin_expect(x, 0) +#else +# define XXH_likely(x) (x) +# define XXH_unlikely(x) (x) +#endif + +#ifndef XXH_HAS_INCLUDE +# ifdef __has_include +# define XXH_HAS_INCLUDE(x) __has_include(x) +# else +# define XXH_HAS_INCLUDE(x) 0 +# endif +#endif + +#if defined(__GNUC__) || defined(__clang__) +# if defined(__ARM_FEATURE_SVE) +# include <arm_sve.h> +# endif +# if defined(__ARM_NEON__) || defined(__ARM_NEON) \ + || (defined(_M_ARM) && _M_ARM >= 7) \ + || defined(_M_ARM64) || defined(_M_ARM64EC) \ + || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE(<arm_neon.h>)) /* WASM SIMD128 via SIMDe */ +# define inline __inline__ /* circumvent a clang bug */ +# include <arm_neon.h> +# undef inline +# elif defined(__AVX2__) +# include <immintrin.h> +# elif defined(__SSE2__) +# include <emmintrin.h> +# endif +#endif + +#if defined(_MSC_VER) +# include <intrin.h> +#endif + +/* + * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while + * remaining a true 64-bit/128-bit hash function. 
+ * + * This is done by prioritizing a subset of 64-bit operations that can be + * emulated without too many steps on the average 32-bit machine. + * + * For example, these two lines seem similar, and run equally fast on 64-bit: + * + * xxh_u64 x; + * x ^= (x >> 47); // good + * x ^= (x >> 13); // bad + * + * However, to a 32-bit machine, there is a major difference. + * + * x ^= (x >> 47) looks like this: + * + * x.lo ^= (x.hi >> (47 - 32)); + * + * while x ^= (x >> 13) looks like this: + * + * // note: funnel shifts are not usually cheap. + * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13)); + * x.hi ^= (x.hi >> 13); + * + * The first one is significantly faster than the second, simply because the + * shift is larger than 32. This means: + * - All the bits we need are in the upper 32 bits, so we can ignore the lower + * 32 bits in the shift. + * - The shift result will always fit in the lower 32 bits, and therefore, + * we can ignore the upper 32 bits in the xor. + * + * Thanks to this optimization, XXH3 only requires these features to be efficient: + * + * - Usable unaligned access + * - A 32-bit or 64-bit ALU + * - If 32-bit, a decent ADC instruction + * - A 32 or 64-bit multiply with a 64-bit result + * - For the 128-bit variant, a decent byteswap helps short inputs. + * + * The first two are already required by XXH32, and almost all 32-bit and 64-bit + * platforms which can run XXH32 can run XXH3 efficiently. + * + * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one + * notable exception. + * + * First of all, Thumb-1 lacks support for the UMULL instruction which + * performs the important long multiply. This means numerous __aeabi_lmul + * calls. + * + * Second of all, the 8 functional registers are just not enough. + * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need + * Lo registers, and this shuffling results in thousands more MOVs than A32. + * + * A32 and T32 don't have this limitation. They can access all 14 registers, + * do a 32->64 multiply with UMULL, and the flexible operand allowing free + * shifts is helpful, too. + * + * Therefore, we do a quick sanity check. + * + * If compiling Thumb-1 for a target which supports ARM instructions, we will + * emit a warning, as it is not a "sane" platform to compile for. + * + * Usually, if this happens, it is because of an accident and you probably need + * to specify -march, as you likely meant to compile for a newer architecture. + * + * Credit: large sections of the vectorial and asm source code paths + * have been contributed by @easyaspi314 + */ +#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM) +# warning "XXH3 is highly inefficient without ARM or Thumb-2." +#endif + +/* ========================================== + * Vectorization detection + * ========================================== */ + +#ifdef XXH_DOXYGEN +/*! + * @ingroup tuning + * @brief Overrides the vectorization implementation chosen for XXH3. + * + * Can be defined to 0 to disable SIMD or any of the values mentioned in + * @ref XXH_VECTOR_TYPE. + * + * If this is not defined, it uses predefined macros to determine the best + * implementation. + */ +# define XXH_VECTOR XXH_SCALAR +/*! + * @ingroup tuning + * @brief Possible values for @ref XXH_VECTOR. + * + * Note that these are actually implemented as macros. + * + * If this is not defined, it is detected automatically. + * internal macro XXH_X86DISPATCH overrides this. 
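+ *
+ * For example, a build can force the portable scalar path by defining
+ * @ref XXH_VECTOR before including this header (a sketch; 0 == XXH_SCALAR):
+ * @code{.c}
+ *   #define XXH_VECTOR 0   // or -DXXH_VECTOR=0 on the compiler command line
+ * @endcode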
+ */ +enum XXH_VECTOR_TYPE /* fake enum */ { + XXH_SCALAR = 0, /*!< Portable scalar version */ + XXH_SSE2 = 1, /*!< + * SSE2 for Pentium 4, Opteron, all x86_64. + * + * @note SSE2 is also guaranteed on Windows 10, macOS, and + * Android x86. + */ + XXH_AVX2 = 2, /*!< AVX2 for Haswell and Bulldozer */ + XXH_AVX512 = 3, /*!< AVX512 for Skylake and Icelake */ + XXH_NEON = 4, /*!< + * NEON for most ARMv7-A, all AArch64, and WASM SIMD128 + * via the SIMDeverywhere polyfill provided with the + * Emscripten SDK. + */ + XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */ + XXH_SVE = 6, /*!< SVE for some ARMv8-A and ARMv9-A */ +}; +/*! + * @ingroup tuning + * @brief Selects the minimum alignment for XXH3's accumulators. + * + * When using SIMD, this should match the alignment required for said vector + * type, so, for example, 32 for AVX2. + * + * Default: Auto detected. + */ +# define XXH_ACC_ALIGN 8 +#endif + +/* Actual definition */ +#ifndef XXH_DOXYGEN +# define XXH_SCALAR 0 +# define XXH_SSE2 1 +# define XXH_AVX2 2 +# define XXH_AVX512 3 +# define XXH_NEON 4 +# define XXH_VSX 5 +# define XXH_SVE 6 +#endif + +#ifndef XXH_VECTOR /* can be defined on command line */ +# if defined(__ARM_FEATURE_SVE) +# define XXH_VECTOR XXH_SVE +# elif ( \ + defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \ + || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \ + || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE(<arm_neon.h>)) /* wasm simd128 via SIMDe */ \ + ) && ( \ + defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */ \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \ + ) +# define XXH_VECTOR XXH_NEON +# elif defined(__AVX512F__) +# define XXH_VECTOR XXH_AVX512 +# elif defined(__AVX2__) +# define XXH_VECTOR XXH_AVX2 +# elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) +# define XXH_VECTOR XXH_SSE2 +# elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \ + || (defined(__s390x__) && defined(__VEC__)) \ + && defined(__GNUC__) /* TODO: IBM XL */ +# define XXH_VECTOR XXH_VSX +# else +# define XXH_VECTOR XXH_SCALAR +# endif +#endif + +/* __ARM_FEATURE_SVE is only supported by GCC & Clang. */ +#if (XXH_VECTOR == XXH_SVE) && !defined(__ARM_FEATURE_SVE) +# ifdef _MSC_VER +# pragma warning(once : 4606) +# else +# warning "__ARM_FEATURE_SVE isn't supported. Use SCALAR instead." +# endif +# undef XXH_VECTOR +# define XXH_VECTOR XXH_SCALAR +#endif + +/* + * Controls the alignment of the accumulator, + * for compatibility with aligned vector loads, which are usually faster. 
+ */ +#ifndef XXH_ACC_ALIGN +# if defined(XXH_X86DISPATCH) +# define XXH_ACC_ALIGN 64 /* for compatibility with avx512 */ +# elif XXH_VECTOR == XXH_SCALAR /* scalar */ +# define XXH_ACC_ALIGN 8 +# elif XXH_VECTOR == XXH_SSE2 /* sse2 */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX2 /* avx2 */ +# define XXH_ACC_ALIGN 32 +# elif XXH_VECTOR == XXH_NEON /* neon */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_VSX /* vsx */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX512 /* avx512 */ +# define XXH_ACC_ALIGN 64 +# elif XXH_VECTOR == XXH_SVE /* sve */ +# define XXH_ACC_ALIGN 64 +# endif +#endif + +#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \ + || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512 +# define XXH_SEC_ALIGN XXH_ACC_ALIGN +#elif XXH_VECTOR == XXH_SVE +# define XXH_SEC_ALIGN XXH_ACC_ALIGN +#else +# define XXH_SEC_ALIGN 8 +#endif + +#if defined(__GNUC__) || defined(__clang__) +# define XXH_ALIASING __attribute__((may_alias)) +#else +# define XXH_ALIASING /* nothing */ +#endif + +/* + * UGLY HACK: + * GCC usually generates the best code with -O3 for xxHash. + * + * However, when targeting AVX2, it is overzealous in its unrolling resulting + * in code roughly 3/4 the speed of Clang. + * + * There are other issues, such as GCC splitting _mm256_loadu_si256 into + * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which + * only applies to Sandy and Ivy Bridge... which don't even support AVX2. + * + * That is why when compiling the AVX2 version, it is recommended to use either + * -O2 -mavx2 -march=haswell + * or + * -O2 -mavx2 -mno-avx256-split-unaligned-load + * for decent performance, or to use Clang instead. + * + * Fortunately, we can control the first one with a pragma that forces GCC into + * -O2, but the other one we can't control without "failed to inline always + * inline function due to target mismatch" warnings. + */ +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */ +# pragma GCC push_options +# pragma GCC optimize("-O2") +#endif + +#if XXH_VECTOR == XXH_NEON + +/* + * UGLY HACK: While AArch64 GCC on Linux does not seem to care, on macOS, GCC -O3 + * optimizes out the entire hashLong loop because of the aliasing violation. + * + * However, GCC is also inefficient at load-store optimization with vld1q/vst1q, + * so the only option is to mark it as aliasing. + */ +typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING; + +/*! + * @internal + * @brief `vld1q_u64` but faster and alignment-safe. + * + * On AArch64, unaligned access is always safe, but on ARMv7-a, it is only + * *conditionally* safe (`vld1` has an alignment bit like `movdq[ua]` in x86). + * + * GCC for AArch64 sees `vld1q_u8` as an intrinsic instead of a load, so it + * prohibits load-store optimizations. Therefore, a direct dereference is used. + * + * Otherwise, `vld1q_u8` is used with `vreinterpretq_u8_u64` to do a safe + * unaligned load. + */ +#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) +XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) /* silence -Wcast-align */ +{ + return *(xxh_aliasing_uint64x2_t const *)ptr; +} +#else +XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) +{ + return vreinterpretq_u64_u8(vld1q_u8((uint8_t const*)ptr)); +} +#endif + +/*! + * @internal + * @brief `vmlal_u32` on low and high halves of a vector. 
+ *
+ * This is a workaround for AArch64 GCC < 11, which implemented arm_neon.h with
+ * inline assembly and was therefore incapable of merging the `vget_{low, high}_u32`
+ * with `vmlal_u32`.
+ */
+#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 11
+XXH_FORCE_INLINE uint64x2_t
+XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+{
+ /* Inline assembly is the only way */
+ __asm__("umlal %0.2d, %1.2s, %2.2s" : "+w" (acc) : "w" (lhs), "w" (rhs));
+ return acc;
+}
+XXH_FORCE_INLINE uint64x2_t
+XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+{
+ /* This intrinsic works as expected */
+ return vmlal_high_u32(acc, lhs, rhs);
+}
+#else
+/* Portable intrinsic versions */
+XXH_FORCE_INLINE uint64x2_t
+XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+{
+ return vmlal_u32(acc, vget_low_u32(lhs), vget_low_u32(rhs));
+}
+/*! @copydoc XXH_vmlal_low_u32
+ * Assume the compiler converts this to vmlal_high_u32 on aarch64 */
+XXH_FORCE_INLINE uint64x2_t
+XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+{
+ return vmlal_u32(acc, vget_high_u32(lhs), vget_high_u32(rhs));
+}
+#endif
+
+/*!
+ * @ingroup tuning
+ * @brief Controls the NEON to scalar ratio for XXH3
+ *
+ * This can be set to 2, 4, 6, or 8.
+ *
+ * ARM Cortex CPUs are _very_ sensitive to how their pipelines are used.
+ *
+ * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but only 2 of those
+ * can be NEON. If you are only using NEON instructions, you are only using 2/3 of the CPU
+ * bandwidth.
+ *
+ * This is even more noticeable on the more advanced cores like the Cortex-A76 which
+ * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once.
+ *
+ * Therefore, to make the most out of the pipeline, it is beneficial to run 6 NEON lanes
+ * and 2 scalar lanes, which is chosen by default.
+ *
+ * This does not apply to Apple processors or 32-bit processors, which run better with
+ * full NEON. These will default to 8. Additionally, size-optimized builds run 8 lanes.
+ *
+ * This change benefits CPUs with large micro-op buffers without negatively affecting
+ * most other CPUs:
+ *
+ * | Chipset | Dispatch type | NEON only | 6:2 hybrid | Diff. |
+ * |:----------------------|:--------------------|----------:|-----------:|------:|
+ * | Snapdragon 730 (A76) | 2 NEON/8 micro-ops | 8.8 GB/s | 10.1 GB/s | ~16% |
+ * | Snapdragon 835 (A73) | 2 NEON/3 micro-ops | 5.1 GB/s | 5.3 GB/s | ~5% |
+ * | Marvell PXA1928 (A53) | In-order dual-issue | 1.9 GB/s | 1.9 GB/s | 0% |
+ * | Apple M1 | 4 NEON/8 micro-ops | 37.3 GB/s | 36.1 GB/s | ~-3% |
+ *
+ * It also seems to fix some bad codegen on GCC, making it almost as fast as clang.
+ *
+ * When using WASM SIMD128, if this is 2 or 6, SIMDe will scalarize 2 of the lanes,
+ * meaning it effectively becomes a slower 4.
+ *
+ * @see XXH3_accumulate_512_neon()
+ */
+# ifndef XXH3_NEON_LANES
+# if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \
+ && !defined(__APPLE__) && XXH_SIZE_OPT <= 0
+# define XXH3_NEON_LANES 6
+# else
+# define XXH3_NEON_LANES XXH_ACC_NB
+# endif
+# endif
+#endif /* XXH_VECTOR == XXH_NEON */
+
+/*
+ * VSX and Z Vector helpers.
+ *
+ * This is very messy, and any pull requests to clean this up are welcome.
+ *
+ * There are a lot of problems with supporting VSX and s390x, due to
+ * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
+ */
+#if XXH_VECTOR == XXH_VSX
+/* Annoyingly, these headers _may_ define three macros: `bool`, `vector`,
+ * and `pixel`. This is a problem for obvious reasons.
+ *
+ * These keywords are unnecessary; the spec literally says they are
+ * equivalent to `__bool`, `__vector`, and `__pixel` and may be undef'd
+ * after including the header.
+ *
+ * We use pragma push_macro/pop_macro to keep the namespace clean. */
+# pragma push_macro("bool")
+# pragma push_macro("vector")
+# pragma push_macro("pixel")
+/* silence potential macro redefined warnings */
+# undef bool
+# undef vector
+# undef pixel
+
+# if defined(__s390x__)
+# include <s390intrin.h>
+# else
+# include <altivec.h>
+# endif
+
+/* Restore the original macro values, if applicable. */
+# pragma pop_macro("pixel")
+# pragma pop_macro("vector")
+# pragma pop_macro("bool")
+
+typedef __vector unsigned long long xxh_u64x2;
+typedef __vector unsigned char xxh_u8x16;
+typedef __vector unsigned xxh_u32x4;
+
+/*
+ * UGLY HACK: Similar to aarch64 macOS GCC, s390x GCC has the same aliasing issue.
+ */
+typedef xxh_u64x2 xxh_aliasing_u64x2 XXH_ALIASING;
+
+# ifndef XXH_VSX_BE
+# if defined(__BIG_ENDIAN__) \
+ || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+# define XXH_VSX_BE 1
+# elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
+# warning "-maltivec=be is not recommended. Please use native endianness."
+# define XXH_VSX_BE 1
+# else
+# define XXH_VSX_BE 0
+# endif
+# endif /* !defined(XXH_VSX_BE) */
+
+# if XXH_VSX_BE
+# if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__))
+# define XXH_vec_revb vec_revb
+# else
+/*!
+ * A polyfill for POWER9's vec_revb().
+ */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val)
+{
+ xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
+ 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
+ return vec_perm(val, val, vByteSwap);
+}
+# endif
+# endif /* XXH_VSX_BE */
+
+/*!
+ * Performs an unaligned vector load and byte swaps it on big endian.
+ */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr)
+{
+ xxh_u64x2 ret;
+ XXH_memcpy(&ret, ptr, sizeof(xxh_u64x2));
+# if XXH_VSX_BE
+ ret = XXH_vec_revb(ret);
+# endif
+ return ret;
+}
+
+/*
+ * vec_mulo and vec_mule are very problematic intrinsics on PowerPC
+ *
+ * These intrinsics weren't added until GCC 8, despite existing for a while,
+ * and they are endian-dependent. Also, their meanings swap depending on the version.
+ * */
+# if defined(__s390x__)
+ /* s390x is always big endian, no issue on this platform */
+# define XXH_vec_mulo vec_mulo
+# define XXH_vec_mule vec_mule
+# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) && !defined(__ibmxl__)
+/* Clang has a better way to control this: we can just use the builtin, which doesn't swap. */
+ /* The IBM XL Compiler (which defines __clang__) only implements the vec_* operations */
+# define XXH_vec_mulo __builtin_altivec_vmulouw
+# define XXH_vec_mule __builtin_altivec_vmuleuw
+# else
+/* gcc needs inline assembly */
+/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h.
*/
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b)
+{
+ xxh_u64x2 result;
+ __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
+ return result;
+}
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b)
+{
+ xxh_u64x2 result;
+ __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
+ return result;
+}
+# endif /* XXH_vec_mulo, XXH_vec_mule */
+#endif /* XXH_VECTOR == XXH_VSX */
+
+#if XXH_VECTOR == XXH_SVE
+#define ACCRND(acc, offset) \
+do { \
+ svuint64_t input_vec = svld1_u64(mask, xinput + offset); \
+ svuint64_t secret_vec = svld1_u64(mask, xsecret + offset); \
+ svuint64_t mixed = sveor_u64_x(mask, secret_vec, input_vec); \
+ svuint64_t swapped = svtbl_u64(input_vec, kSwap); \
+ svuint64_t mixed_lo = svextw_u64_x(mask, mixed); \
+ svuint64_t mixed_hi = svlsr_n_u64_x(mask, mixed, 32); \
+ svuint64_t mul = svmad_u64_x(mask, mixed_lo, mixed_hi, swapped); \
+ acc = svadd_u64_x(mask, acc, mul); \
+} while (0)
+#endif /* XXH_VECTOR == XXH_SVE */
+
+/* prefetch
+ * can be disabled by declaring the XXH_NO_PREFETCH build macro */
+#if defined(XXH_NO_PREFETCH)
+# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
+#else
+# if XXH_SIZE_OPT >= 1
+# define XXH_PREFETCH(ptr) (void)(ptr)
+# elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */
+# include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+# define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
+# define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
+# else
+# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
+# endif
+#endif /* XXH_NO_PREFETCH */
+
+
+/* ==========================================
+ * XXH3 default settings
+ * ========================================== */
+
+#define XXH_SECRET_DEFAULT_SIZE 192 /* must be >= XXH3_SECRET_SIZE_MIN */
+
+#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
+# error "default keyset is not large enough"
+#endif
+
+/*! Pseudorandom secret taken directly from FARSH.
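+ * A caller may also supply a custom secret of at least XXH3_SECRET_SIZE_MIN
+ * bytes; a hedged sketch using the public XXH3_generateSecret() helper
+ * (the seed material and its length here are hypothetical):
+ * @code{.c}
+ *   unsigned char secret[XXH_SECRET_DEFAULT_SIZE];
+ *   XXH3_generateSecret(secret, sizeof(secret), seedMaterial, seedMaterialSize);
+ * @endcode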
*/ +XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = { + 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, + 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, + 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, + 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, + 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, + 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, + 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, + 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, + 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, + 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, + 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, + 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, +}; + +static const xxh_u64 PRIME_MX1 = 0x165667919E3779F9ULL; /*!< 0b0001011001010110011001111001000110011110001101110111100111111001 */ +static const xxh_u64 PRIME_MX2 = 0x9FB21C651E98DF25ULL; /*!< 0b1001111110110010000111000110010100011110100110001101111100100101 */ + +#ifdef XXH_OLD_NAMES +# define kSecret XXH3_kSecret +#endif + +#ifdef XXH_DOXYGEN +/*! + * @brief Calculates a 32-bit to 64-bit long multiply. + * + * Implemented as a macro. + * + * Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it doesn't + * need to (but it shouldn't need to anyways, it is about 7 instructions to do + * a 64x64 multiply...). Since we know that this will _always_ emit `MULL`, we + * use that instead of the normal method. + * + * If you are compiling for platforms like Thumb-1 and don't have a better option, + * you may also want to write your own long multiply routine here. + * + * @param x, y Numbers to be multiplied + * @return 64-bit product of the low 32 bits of @p x and @p y. + */ +XXH_FORCE_INLINE xxh_u64 +XXH_mult32to64(xxh_u64 x, xxh_u64 y) +{ + return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF); +} +#elif defined(_MSC_VER) && defined(_M_IX86) +# define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y)) +#else +/* + * Downcast + upcast is usually better than masking on older compilers like + * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers. + * + * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands + * and perform a full 64x64 multiply -- entirely redundant on 32-bit. + */ +# define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y)) +#endif + +/*! + * @brief Calculates a 64->128-bit long multiply. + * + * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar + * version. + * + * @param lhs , rhs The 64-bit integers to be multiplied + * @return The 128-bit result represented in an @ref XXH128_hash_t. + */ +static XXH128_hash_t +XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) +{ + /* + * GCC/Clang __uint128_t method. + * + * On most 64-bit targets, GCC and Clang define a __uint128_t type. + * This is usually the best way as it usually uses a native long 64-bit + * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. + * + * Usually. 
+ *
+ * On WASM (a 32-bit platform), Clang and Emscripten define this type
+ * despite not having the arithmetic for it. This results in a laggy
+ * compiler builtin call which calculates a full 128-bit multiply.
+ * In that case it is best to use the portable one.
+ * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
+ */
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) \
+ && defined(__SIZEOF_INT128__) \
+ || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
+
+ __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
+ XXH128_hash_t r128;
+ r128.low64 = (xxh_u64)(product);
+ r128.high64 = (xxh_u64)(product >> 64);
+ return r128;
+
+ /*
+ * MSVC for x64's _umul128 method.
+ *
+ * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
+ *
+ * This compiles to a single-operand MUL on x64.
+ */
+#elif (defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC)
+
+#ifndef _MSC_VER
+# pragma intrinsic(_umul128)
+#endif
+ xxh_u64 product_high;
+ xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
+ XXH128_hash_t r128;
+ r128.low64 = product_low;
+ r128.high64 = product_high;
+ return r128;
+
+ /*
+ * MSVC for ARM64's __umulh method.
+ *
+ * This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method.
+ */
+#elif defined(_M_ARM64) || defined(_M_ARM64EC)
+
+#ifndef _MSC_VER
+# pragma intrinsic(__umulh)
+#endif
+ XXH128_hash_t r128;
+ r128.low64 = lhs * rhs;
+ r128.high64 = __umulh(lhs, rhs);
+ return r128;
+
+#else
+ /*
+ * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
+ *
+ * This is a fast and simple grade school multiply, which is shown below
+ * with base 10 arithmetic instead of base 0x100000000.
+ *
+ * 9 3 // D2 lhs = 93
+ * x 7 5 // D2 rhs = 75
+ * ----------
+ * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15
+ * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45
+ * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21
+ * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63
+ * ---------
+ * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27
+ * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67
+ * ---------
+ * 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975
+ *
+ * The reasons for adding the products like this are:
+ * 1. It avoids manual carry tracking. Just like how
+ * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.
+ * This avoids a lot of complexity.
+ *
+ * 2. It hints for, and on Clang, compiles to, the powerful UMAAL
+ * instruction available in ARM's Digital Signal Processing extension
+ * in 32-bit ARMv6 and later, which is shown below:
+ *
+ * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
+ * {
+ * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;
+ * *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
+ * *RdHi = (xxh_u32)(product >> 32);
+ * }
+ *
+ * This instruction was designed for efficient long multiplication, and
+ * allows this to be calculated in only 4 instructions at speeds
+ * comparable to some 64-bit ALUs.
+ *
+ * 3. It isn't terrible on other platforms. Usually this will be a couple
+ * of 32-bit ADD/ADCs.
+ */
+
+ /* First calculate all of the cross products. */
+ xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
+ xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF);
+ xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
+ xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32);
+
+ /* Now add the products together.
These will never overflow. */
+ xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
+ xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
+ xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
+
+ XXH128_hash_t r128;
+ r128.low64 = lower;
+ r128.high64 = upper;
+ return r128;
+#endif
+}
+
+/*!
+ * @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it.
+ *
+ * The reason for the separate function is to prevent passing too many structs
+ * around by value. This will hopefully inline the multiply, but we don't force it.
+ *
+ * @param lhs , rhs The 64-bit integers to multiply
+ * @return The low 64 bits of the product XOR'd by the high 64 bits.
+ * @see XXH_mult64to128()
+ */
+static xxh_u64
+XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
+{
+ XXH128_hash_t product = XXH_mult64to128(lhs, rhs);
+ return product.low64 ^ product.high64;
+}
+
+/*! Seems to produce slightly better code on GCC for some reason. */
+XXH_FORCE_INLINE XXH_CONSTF xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift)
+{
+ XXH_ASSERT(0 <= shift && shift < 64);
+ return v64 ^ (v64 >> shift);
+}
+
+/*
+ * This is a fast avalanche stage,
+ * suitable when input bits are already partially mixed.
+ */
+static XXH64_hash_t XXH3_avalanche(xxh_u64 h64)
+{
+ h64 = XXH_xorshift64(h64, 37);
+ h64 *= PRIME_MX1;
+ h64 = XXH_xorshift64(h64, 32);
+ return h64;
+}
+
+/*
+ * This is a stronger avalanche,
+ * inspired by Pelle Evensen's rrmxmx,
+ * preferable when input has not been previously mixed.
+ */
+static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len)
+{
+ /* this mix is inspired by Pelle Evensen's rrmxmx */
+ h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24);
+ h64 *= PRIME_MX2;
+ h64 ^= (h64 >> 35) + len;
+ h64 *= PRIME_MX2;
+ return XXH_xorshift64(h64, 28);
+}
+
+
+/* ==========================================
+ * Short keys
+ * ==========================================
+ * One of the shortcomings of XXH32 and XXH64 was that their performance was
+ * sub-optimal on short lengths. They used an iterative algorithm which strongly
+ * favored lengths that were a multiple of 4 or 8.
+ *
+ * Instead of iterating over individual inputs, we use a set of single-shot
+ * functions which piece together a range of lengths and operate in constant time.
+ *
+ * Additionally, the number of multiplies has been significantly reduced. This
+ * reduces latency, especially when emulating 64-bit multiplies on 32-bit.
+ *
+ * Depending on the platform, this may or may not be faster than XXH32, but it
+ * is almost guaranteed to be faster than XXH64.
+ */
+
+/*
+ * At very short lengths, there isn't enough input to fully hide secrets, or use
+ * the entire secret.
+ *
+ * There is also only a limited amount of mixing we can do before significantly
+ * impacting performance.
+ *
+ * Therefore, we use different sections of the secret and always mix two secret
+ * samples with an XOR. This should have no effect on performance on the
+ * seedless or withSeed variants because everything _should_ be constant folded
+ * by modern compilers.
+ *
+ * The XOR mixing hides individual parts of the secret and increases entropy.
+ *
+ * This adds an extra layer of strength for custom secrets.
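+ *
+ * These short paths are reached through the public one-shot calls; a minimal
+ * sketch (the literal key is hypothetical):
+ *
+ *   XXH64_hash_t const h = XXH3_64bits("key", 3);  // takes the 1-3 byte path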
+ */ +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combined = { input[0], 0x01, input[0], input[0] } + * len = 2: combined = { input[1], 0x02, input[0], input[1] } + * len = 3: combined = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const keyed = (xxh_u64)combined ^ bitflip; + return XXH64_avalanche(keyed); + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len <= 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input1 = XXH_readLE32(input); + xxh_u32 const input2 = XXH_readLE32(input + len - 4); + xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed; + xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32); + xxh_u64 const keyed = input64 ^ bitflip; + return XXH3_rrmxmx(keyed, len); + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(9 <= len && len <= 16); + { xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed; + xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed; + xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1; + xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2; + xxh_u64 const acc = len + + XXH_swap64(input_lo) + input_hi + + XXH3_mul128_fold64(input_lo, input_hi); + return XXH3_avalanche(acc); + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(len <= 16); + { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed); + if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed); + if (len) return XXH3_len_1to3_64b(input, len, secret, seed); + return XXH64_avalanche(seed ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64))); + } +} + +/* + * DISCLAIMER: There are known *seed-dependent* multicollisions here due to + * multiplication by zero, affecting hashes of lengths 17 to 240. + * + * However, they are very unlikely. + * + * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all + * unseeded non-cryptographic hashes, it does not attempt to defend itself + * against specially crafted inputs, only random inputs. + * + * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes + * cancelling out the secret is taken an arbitrary number of times (addressed + * in XXH3_accumulate_512), this collision is very unlikely with random inputs + * and/or proper seeding: + * + * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a + * function that is only called up to 16 times per hash with up to 240 bytes of + * input. 
+ *
+ * This is not too bad for a non-cryptographic hash function, especially with
+ * only 64-bit outputs.
+ *
+ * The 128-bit variant (which trades some speed for strength) is NOT affected
+ * by this, although it is always a good idea to use a proper seed if you care
+ * about strength.
+ */
+XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input,
+ const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64)
+{
+#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+ && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \
+ && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */
+ /*
+ * UGLY HACK:
+ * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in
+ * slower code.
+ *
+ * By forcing seed64 into a register, we disrupt the cost model and
+ * cause it to scalarize. See `XXH32_round()`.
+ *
+ * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600,
+ * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on
+ * GCC 9.2, despite both emitting scalar code.
+ *
+ * GCC generates much better scalar code than Clang for the rest of XXH3,
+ * which is why finding a more optimal codepath is of interest.
+ */
+ XXH_COMPILER_GUARD(seed64);
+#endif
+ { xxh_u64 const input_lo = XXH_readLE64(input);
+ xxh_u64 const input_hi = XXH_readLE64(input+8);
+ return XXH3_mul128_fold64(
+ input_lo ^ (XXH_readLE64(secret) + seed64),
+ input_hi ^ (XXH_readLE64(secret+8) - seed64)
+ );
+ }
+}
+
+/* For mid-range keys, XXH3 uses a Mum-hash variant. */
+XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
+ const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+ XXH64_hash_t seed)
+{
+ XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+ XXH_ASSERT(16 < len && len <= 128);
+
+ { xxh_u64 acc = len * XXH_PRIME64_1;
+#if XXH_SIZE_OPT >= 1
+ /* Smaller and cleaner, but slightly slower.
 */
+        unsigned int i = (unsigned int)(len - 1) / 32;
+        do {
+            acc += XXH3_mix16B(input+16 * i, secret+32*i, seed);
+            acc += XXH3_mix16B(input+len-16*(i+1), secret+32*i+16, seed);
+        } while (i-- != 0);
+#else
+        if (len > 32) {
+            if (len > 64) {
+                if (len > 96) {
+                    acc += XXH3_mix16B(input+48, secret+96, seed);
+                    acc += XXH3_mix16B(input+len-64, secret+112, seed);
+                }
+                acc += XXH3_mix16B(input+32, secret+64, seed);
+                acc += XXH3_mix16B(input+len-48, secret+80, seed);
+            }
+            acc += XXH3_mix16B(input+16, secret+32, seed);
+            acc += XXH3_mix16B(input+len-32, secret+48, seed);
+        }
+        acc += XXH3_mix16B(input+0, secret+0, seed);
+        acc += XXH3_mix16B(input+len-16, secret+16, seed);
+#endif
+        return XXH3_avalanche(acc);
+    }
+}
+
+#define XXH3_MIDSIZE_MAX 240
+
+XXH_NO_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                      XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+
+    #define XXH3_MIDSIZE_STARTOFFSET 3
+    #define XXH3_MIDSIZE_LASTOFFSET  17
+
+    { xxh_u64 acc = len * XXH_PRIME64_1;
+        xxh_u64 acc_end;
+        unsigned int const nbRounds = (unsigned int)len / 16;
+        unsigned int i;
+        XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+        for (i=0; i<8; i++) {
+            acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed);
+        }
+        /* last bytes */
+        acc_end = XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
+        XXH_ASSERT(nbRounds >= 8);
+        acc = XXH3_avalanche(acc);
+#if defined(__clang__)                                /* Clang */ \
+    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
+    && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
+        /*
+         * UGLY HACK:
+         * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.
+         * Everywhere else, it uses scalar code.
+         *
+         * For 64->128-bit multiplies, even if the NEON was 100% optimal, it
+         * would still be slower than UMAAL (see XXH_mult64to128).
+         *
+         * Unfortunately, Clang doesn't handle the long multiplies properly and
+         * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
+         * scalarized into an ugly mess of VMOV.32 instructions.
+         *
+         * This mess is difficult to avoid without turning autovectorization
+         * off completely, but such cases are usually relatively minor and/or not
+         * worth it to fix.
+         *
+         * This loop is the easiest to fix, as unlike XXH32, this pragma
+         * _actually works_ because it is a loop vectorization instead of an
+         * SLP vectorization.
+         */
+        #pragma clang loop vectorize(disable)
+#endif
+        for (i=8 ; i < nbRounds; i++) {
+            /*
+             * Prevents clang from unrolling the acc loop and interleaving with this one.
+             */
+            XXH_COMPILER_GUARD(acc);
+            acc_end += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
+        }
+        return XXH3_avalanche(acc + acc_end);
+    }
+}
+
+
+/* =======     Long Keys     ======= */
+
+#define XXH_STRIPE_LEN 64
+#define XXH_SECRET_CONSUME_RATE 8   /* nb of secret bytes consumed at each accumulation */
+#define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))
+
+#ifdef XXH_OLD_NAMES
+#  define STRIPE_LEN XXH_STRIPE_LEN
+#  define ACC_NB XXH_ACC_NB
+#endif
+
+#ifndef XXH_PREFETCH_DIST
+#  ifdef __clang__
+#    define XXH_PREFETCH_DIST 320
+#  else
+#    if (XXH_VECTOR == XXH_AVX512)
+#      define XXH_PREFETCH_DIST 512
+#    else
+#      define XXH_PREFETCH_DIST 384
+#    endif
+#  endif /* __clang__ */
+#endif /* XXH_PREFETCH_DIST */
+
+/*
+ * These macros are to generate an XXH3_accumulate() function.
+ * The two arguments select the name suffix and target attribute. + * + * The name of this symbol is XXH3_accumulate_<name>() and it calls + * XXH3_accumulate_512_<name>(). + * + * It may be useful to hand implement this function if the compiler fails to + * optimize the inline function. + */ +#define XXH3_ACCUMULATE_TEMPLATE(name) \ +void \ +XXH3_accumulate_##name(xxh_u64* XXH_RESTRICT acc, \ + const xxh_u8* XXH_RESTRICT input, \ + const xxh_u8* XXH_RESTRICT secret, \ + size_t nbStripes) \ +{ \ + size_t n; \ + for (n = 0; n < nbStripes; n++ ) { \ + const xxh_u8* const in = input + n*XXH_STRIPE_LEN; \ + XXH_PREFETCH(in + XXH_PREFETCH_DIST); \ + XXH3_accumulate_512_##name( \ + acc, \ + in, \ + secret + n*XXH_SECRET_CONSUME_RATE); \ + } \ +} + + +XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64) +{ + if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64); + XXH_memcpy(dst, &v64, sizeof(v64)); +} + +/* Several intrinsic functions below are supposed to accept __int64 as argument, + * as documented in https://software.intel.com/sites/landingpage/IntrinsicsGuide/ . + * However, several environments do not define __int64 type, + * requiring a workaround. + */ +#if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) + typedef int64_t xxh_i64; +#else + /* the following type must have a width of 64-bit */ + typedef long long xxh_i64; +#endif + + +/* + * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized. + * + * It is a hardened version of UMAC, based off of FARSH's implementation. + * + * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD + * implementations, and it is ridiculously fast. + * + * We harden it by mixing the original input to the accumulators as well as the product. + * + * This means that in the (relatively likely) case of a multiply by zero, the + * original input is preserved. + * + * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve + * cross-pollination, as otherwise the upper and lower halves would be + * essentially independent. + * + * This doesn't matter on 64-bit hashes since they all get merged together in + * the end, so we skip the extra step. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. 
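+ *
+ * In scalar form, each 8-byte lane of a stripe is accumulated as follows
+ * (a sketch of the same logic as XXH3_scalarRound() further below):
+ *
+ *     data_key     = data ^ key;
+ *     acc[lane^1] += data;                    // preserve the original input
+ *     acc[lane]   += (data_key & 0xFFFFFFFF)  // 32x32->64 multiply
+ *                  * (data_key >> 32);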
+ */ + +#if (XXH_VECTOR == XXH_AVX512) \ + || (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0) + +#ifndef XXH_TARGET_AVX512 +# define XXH_TARGET_AVX512 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + __m512i* const xacc = (__m512i *) acc; + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); + + { + /* data_vec = input[0]; */ + __m512i const data_vec = _mm512_loadu_si512 (input); + /* key_vec = secret[0]; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + /* data_key = data_vec ^ key_vec; */ + __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m512i const data_key_lo = _mm512_srli_epi64 (data_key, 32); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo); + /* xacc[0] += swap(data_vec); */ + __m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2)); + __m512i const sum = _mm512_add_epi64(*xacc, data_swap); + /* xacc[0] += product; */ + *xacc = _mm512_add_epi64(product, sum); + } +} +XXH_FORCE_INLINE XXH_TARGET_AVX512 XXH3_ACCUMULATE_TEMPLATE(avx512) + +/* + * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing. + * + * Multiplication isn't perfect, as explained by Google in HighwayHash: + * + * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to + * // varying degrees. In descending order of goodness, bytes + * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. + * // As expected, the upper and lower bytes are much worse. + * + * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291 + * + * Since our algorithm uses a pseudorandom secret to add some variance into the + * mix, we don't need to (or want to) mix as often or as much as HighwayHash does. + * + * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid + * extraction. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. 
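+ *
+ * Per 64-bit accumulator lane, the scramble is equivalent to
+ * (see XXH3_scalarScrambleRound() further below):
+ *
+ *     acc ^= acc >> 47;      // xorshift
+ *     acc ^= secret_word;    // mask with the secret
+ *     acc *= XXH_PRIME32_1;  // spread with a 32-bit prime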
+ */ + +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); + { __m512i* const xacc = (__m512i*) acc; + const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1); + + /* xacc[0] ^= (xacc[0] >> 47) */ + __m512i const acc_vec = *xacc; + __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47); + /* xacc[0] ^= secret; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + __m512i const data_key = _mm512_ternarylogic_epi32(key_vec, acc_vec, shifted, 0x96 /* key_vec ^ acc_vec ^ shifted */); + + /* xacc[0] *= XXH_PRIME32_1; */ + __m512i const data_key_hi = _mm512_srli_epi64 (data_key, 32); + __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32); + __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32); + *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32)); + } +} + +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0); + XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64); + XXH_ASSERT(((size_t)customSecret & 63) == 0); + (void)(&XXH_writeLE64); + { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i); + __m512i const seed_pos = _mm512_set1_epi64((xxh_i64)seed64); + __m512i const seed = _mm512_mask_sub_epi64(seed_pos, 0xAA, _mm512_set1_epi8(0), seed_pos); + + const __m512i* const src = (const __m512i*) ((const void*) XXH3_kSecret); + __m512i* const dest = ( __m512i*) customSecret; + int i; + XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */ + XXH_ASSERT(((size_t)dest & 63) == 0); + for (i=0; i < nbRounds; ++i) { + dest[i] = _mm512_add_epi64(_mm512_load_si512(src + i), seed); + } } +} + +#endif + +#if (XXH_VECTOR == XXH_AVX2) \ + || (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0) + +#ifndef XXH_TARGET_AVX2 +# define XXH_TARGET_AVX2 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_AVX2 void +XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 31) == 0); + { __m256i* const xacc = (__m256i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ + const __m256i* const xinput = (const __m256i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. 
 */
+        const __m256i* const xsecret = (const __m256i *) secret;
+
+        size_t i;
+        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {
+            /* data_vec    = xinput[i]; */
+            __m256i const data_vec    = _mm256_loadu_si256    (xinput+i);
+            /* key_vec     = xsecret[i]; */
+            __m256i const key_vec     = _mm256_loadu_si256   (xsecret+i);
+            /* data_key    = data_vec ^ key_vec; */
+            __m256i const data_key    = _mm256_xor_si256     (data_vec, key_vec);
+            /* data_key_lo = data_key >> 32; */
+            __m256i const data_key_lo = _mm256_srli_epi64 (data_key, 32);
+            /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
+            __m256i const product     = _mm256_mul_epu32     (data_key, data_key_lo);
+            /* xacc[i] += swap(data_vec); */
+            __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
+            __m256i const sum       = _mm256_add_epi64(xacc[i], data_swap);
+            /* xacc[i] += product; */
+            xacc[i] = _mm256_add_epi64(product, sum);
+    }   }
+}
+XXH_FORCE_INLINE XXH_TARGET_AVX2 XXH3_ACCUMULATE_TEMPLATE(avx2)
+
+XXH_FORCE_INLINE XXH_TARGET_AVX2 void
+XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 31) == 0);
+    {   __m256i* const xacc = (__m256i*) acc;
+        /* Unaligned. This is mainly for pointer arithmetic, and because
+         * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
+        const __m256i* const xsecret = (const __m256i *) secret;
+        const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1);
+
+        size_t i;
+        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {
+            /* xacc[i] ^= (xacc[i] >> 47) */
+            __m256i const acc_vec     = xacc[i];
+            __m256i const shifted     = _mm256_srli_epi64    (acc_vec, 47);
+            __m256i const data_vec    = _mm256_xor_si256     (acc_vec, shifted);
+            /* xacc[i] ^= xsecret[i]; */
+            __m256i const key_vec     = _mm256_loadu_si256   (xsecret+i);
+            __m256i const data_key    = _mm256_xor_si256     (data_vec, key_vec);
+
+            /* xacc[i] *= XXH_PRIME32_1; */
+            __m256i const data_key_hi = _mm256_srli_epi64 (data_key, 32);
+            __m256i const prod_lo     = _mm256_mul_epu32     (data_key, prime32);
+            __m256i const prod_hi     = _mm256_mul_epu32     (data_key_hi, prime32);
+            xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
+        }
+    }
+}
+
+XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0);
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6);
+    XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64);
+    (void)(&XXH_writeLE64);
+    XXH_PREFETCH(customSecret);
+    {   __m256i const seed = _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, (xxh_i64)(0U - seed64), (xxh_i64)seed64);
+
+        const __m256i* const src  = (const __m256i*) ((const void*) XXH3_kSecret);
+              __m256i*       dest = (      __m256i*) customSecret;
+
+#       if defined(__GNUC__) || defined(__clang__)
+        /*
+         * On GCC & Clang, marking 'dest' as modified causes the compiler to:
+         *   - not extract the secret from SSE registers in the internal loop
+         *   - use less common registers, and avoid pushing those registers onto the stack
+         */
+        XXH_COMPILER_GUARD(dest);
+#       endif
+        XXH_ASSERT(((size_t)src & 31) == 0); /* control alignment */
+        XXH_ASSERT(((size_t)dest & 31) == 0);
+
+        /* GCC -O2 needs this loop unrolled manually */
+        dest[0] = _mm256_add_epi64(_mm256_load_si256(src+0), seed);
+        dest[1] = _mm256_add_epi64(_mm256_load_si256(src+1), seed);
+        dest[2] = _mm256_add_epi64(_mm256_load_si256(src+2), seed);
+        dest[3] = _mm256_add_epi64(_mm256_load_si256(src+3), seed);
+        dest[4] = _mm256_add_epi64(_mm256_load_si256(src+4), seed);
+        dest[5]
= _mm256_add_epi64(_mm256_load_si256(src+5), seed); + } +} + +#endif + +/* x86dispatch always generates SSE2 */ +#if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH) + +#ifndef XXH_TARGET_SSE2 +# define XXH_TARGET_SSE2 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_SSE2 void +XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + /* SSE2 is just a half-scale version of the AVX2 version. */ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { __m128i* const xacc = (__m128i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xinput = (const __m128i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xsecret = (const __m128i *) secret; + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { + /* data_vec = xinput[i]; */ + __m128i const data_vec = _mm_loadu_si128 (xinput+i); + /* key_vec = xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m128i const product = _mm_mul_epu32 (data_key, data_key_lo); + /* xacc[i] += swap(data_vec); */ + __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); + __m128i const sum = _mm_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm_add_epi64(product, sum); + } } +} +XXH_FORCE_INLINE XXH_TARGET_SSE2 XXH3_ACCUMULATE_TEMPLATE(sse2) + +XXH_FORCE_INLINE XXH_TARGET_SSE2 void +XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { __m128i* const xacc = (__m128i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
 */
+        const __m128i* const xsecret = (const __m128i *) secret;
+        const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1);
+
+        size_t i;
+        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {
+            /* xacc[i] ^= (xacc[i] >> 47) */
+            __m128i const acc_vec     = xacc[i];
+            __m128i const shifted     = _mm_srli_epi64    (acc_vec, 47);
+            __m128i const data_vec    = _mm_xor_si128     (acc_vec, shifted);
+            /* xacc[i] ^= xsecret[i]; */
+            __m128i const key_vec     = _mm_loadu_si128   (xsecret+i);
+            __m128i const data_key    = _mm_xor_si128     (data_vec, key_vec);
+
+            /* xacc[i] *= XXH_PRIME32_1; */
+            __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
+            __m128i const prod_lo     = _mm_mul_epu32     (data_key, prime32);
+            __m128i const prod_hi     = _mm_mul_epu32     (data_key_hi, prime32);
+            xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
+        }
+    }
+}
+
+XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
+    (void)(&XXH_writeLE64);
+    {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i);
+
+#       if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
+        /* MSVC 32bit mode does not support _mm_set_epi64x before 2015 */
+        XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, (xxh_i64)(0U - seed64) };
+        __m128i const seed = _mm_load_si128((__m128i const*)seed64x2);
+#       else
+        __m128i const seed = _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64);
+#       endif
+        int i;
+
+        const void* const src16 = XXH3_kSecret;
+        __m128i* dst16 = (__m128i*) customSecret;
+#       if defined(__GNUC__) || defined(__clang__)
+        /*
+         * On GCC & Clang, marking 'dst16' as modified causes the compiler to:
+         *   - not extract the secret from SSE registers in the internal loop
+         *   - use less common registers, and avoid pushing those registers onto the stack
+         */
+        XXH_COMPILER_GUARD(dst16);
+#       endif
+        XXH_ASSERT(((size_t)src16 & 15) == 0); /* control alignment */
+        XXH_ASSERT(((size_t)dst16 & 15) == 0);
+
+        for (i=0; i < nbRounds; ++i) {
+            dst16[i] = _mm_add_epi64(_mm_load_si128((const __m128i *)src16+i), seed);
+    }   }
+}
+
+#endif
+
+#if (XXH_VECTOR == XXH_NEON)
+
+/* forward declarations for the scalar routines */
+XXH_FORCE_INLINE void
+XXH3_scalarRound(void* XXH_RESTRICT acc, void const* XXH_RESTRICT input,
+                 void const* XXH_RESTRICT secret, size_t lane);
+
+XXH_FORCE_INLINE void
+XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
+                         void const* XXH_RESTRICT secret, size_t lane);
+
+/*!
+ * @internal
+ * @brief The bulk processing loop for NEON and WASM SIMD128.
+ *
+ * The NEON code path is actually partially scalar when running on AArch64. This
+ * is to optimize the pipelining and can have up to 15% speedup depending on the
+ * CPU, and it also mitigates some GCC codegen issues.
+ *
+ * @see XXH3_NEON_LANES for configuring this and details about this optimization.
+ *
+ * NEON's 32-bit to 64-bit long multiply takes a half vector of 32-bit
+ * integers instead of the other platforms which mask full 64-bit vectors,
+ * so the setup is more complicated than just shifting right.
+ *
+ * Additionally, there is an optimization for 4 lanes at once noted below.
+ *
+ * Since, as stated, the optimal number of NEON lanes for Cortex cores is 6,
+ * there need to be *three* versions of the accumulate operation used
+ * for the remaining 2 lanes.
+ *
+ * WASM's SIMD128 uses SIMDe's arm_neon.h polyfill because the intrinsics overlap
+ * nearly perfectly.
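+ *
+ * For example (illustrative): with XXH3_NEON_LANES == 6, lanes 0..5 are
+ * processed in NEON registers (two 128-bit vectors in the 4-lane loop,
+ * plus one in the 2-lane tail loop), while lanes 6 and 7 fall through
+ * to XXH3_scalarRound().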
+ */
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,
+                    const void* XXH_RESTRICT input,
+                    const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+    XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0);
+    {   /* GCC for darwin arm64 does not like aliasing here */
+        xxh_aliasing_uint64x2_t* const xacc = (xxh_aliasing_uint64x2_t*) acc;
+        /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
+        uint8_t const* xinput = (const uint8_t *) input;
+        uint8_t const* xsecret  = (const uint8_t *) secret;
+
+        size_t i;
+#ifdef __wasm_simd128__
+        /*
+         * On WASM SIMD128, Clang emits direct address loads when XXH3_kSecret
+         * is constant propagated, which results in it converting it to this
+         * inside the loop:
+         *
+         * a = v128.load(XXH3_kSecret +  0 + $secret_offset, offset = 0)
+         * b = v128.load(XXH3_kSecret + 16 + $secret_offset, offset = 0)
+         * ...
+         *
+         * This requires a full 32-bit address immediate (and therefore a 6 byte
+         * instruction) as well as an add for each offset.
+         *
+         * Putting an asm guard prevents it from folding (at the cost of losing
+         * the alignment hint), and uses the free offset in `v128.load` instead
+         * of adding secret_offset each time which overall reduces code size by
+         * about a kilobyte and improves performance.
+         */
+        XXH_COMPILER_GUARD(xsecret);
+#endif
+        /* Scalar lanes use the normal scalarRound routine */
+        for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
+            XXH3_scalarRound(acc, input, secret, i);
+        }
+        i = 0;
+        /* 4 NEON lanes at a time. */
+        for (; i+1 < XXH3_NEON_LANES / 2; i+=2) {
+            /* data_vec = xinput[i]; */
+            uint64x2_t data_vec_1 = XXH_vld1q_u64(xinput  + (i * 16));
+            uint64x2_t data_vec_2 = XXH_vld1q_u64(xinput  + ((i+1) * 16));
+            /* key_vec  = xsecret[i]; */
+            uint64x2_t key_vec_1  = XXH_vld1q_u64(xsecret + (i * 16));
+            uint64x2_t key_vec_2  = XXH_vld1q_u64(xsecret + ((i+1) * 16));
+            /* data_swap = swap(data_vec) */
+            uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1);
+            uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1);
+            /* data_key = data_vec ^ key_vec; */
+            uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1);
+            uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2);
+
+            /*
+             * If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a
+             * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to
+             * get one vector with the low 32 bits of each lane, and one vector
+             * with the high 32 bits of each lane.
+             *
+             * The intrinsic returns a double vector because the original ARMv7-a
+             * instruction modified both arguments in place. AArch64 and SIMD128 emit
+             * two instructions from this intrinsic.
+             *
+             * [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ]
+             * [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ]
+             */
+            uint32x4x2_t unzipped = vuzpq_u32(
+                vreinterpretq_u32_u64(data_key_1),
+                vreinterpretq_u32_u64(data_key_2)
+            );
+            /* data_key_lo = data_key & 0xFFFFFFFF */
+            uint32x4_t data_key_lo = unzipped.val[0];
+            /* data_key_hi = data_key >> 32 */
+            uint32x4_t data_key_hi = unzipped.val[1];
+            /*
+             * Then, we can split the vectors horizontally and multiply; as with most
+             * widening intrinsics, there is a variant that works on the high half vectors
+             * for free on AArch64. A similar instruction is available on SIMD128.
+ * + * sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi + */ + uint64x2_t sum_1 = XXH_vmlal_low_u32(data_swap_1, data_key_lo, data_key_hi); + uint64x2_t sum_2 = XXH_vmlal_high_u32(data_swap_2, data_key_lo, data_key_hi); + /* + * Clang reorders + * a += b * c; // umlal swap.2d, dkl.2s, dkh.2s + * c += a; // add acc.2d, acc.2d, swap.2d + * to + * c += a; // add acc.2d, acc.2d, swap.2d + * c += b * c; // umlal acc.2d, dkl.2s, dkh.2s + * + * While it would make sense in theory since the addition is faster, + * for reasons likely related to umlal being limited to certain NEON + * pipelines, this is worse. A compiler guard fixes this. + */ + XXH_COMPILER_GUARD_CLANG_NEON(sum_1); + XXH_COMPILER_GUARD_CLANG_NEON(sum_2); + /* xacc[i] = acc_vec + sum; */ + xacc[i] = vaddq_u64(xacc[i], sum_1); + xacc[i+1] = vaddq_u64(xacc[i+1], sum_2); + } + /* Operate on the remaining NEON lanes 2 at a time. */ + for (; i < XXH3_NEON_LANES / 2; i++) { + /* data_vec = xinput[i]; */ + uint64x2_t data_vec = XXH_vld1q_u64(xinput + (i * 16)); + /* key_vec = xsecret[i]; */ + uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16)); + /* acc_vec_2 = swap(data_vec) */ + uint64x2_t data_swap = vextq_u64(data_vec, data_vec, 1); + /* data_key = data_vec ^ key_vec; */ + uint64x2_t data_key = veorq_u64(data_vec, key_vec); + /* For two lanes, just use VMOVN and VSHRN. */ + /* data_key_lo = data_key & 0xFFFFFFFF; */ + uint32x2_t data_key_lo = vmovn_u64(data_key); + /* data_key_hi = data_key >> 32; */ + uint32x2_t data_key_hi = vshrn_n_u64(data_key, 32); + /* sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi; */ + uint64x2_t sum = vmlal_u32(data_swap, data_key_lo, data_key_hi); + /* Same Clang workaround as before */ + XXH_COMPILER_GUARD_CLANG_NEON(sum); + /* xacc[i] = acc_vec + sum; */ + xacc[i] = vaddq_u64 (xacc[i], sum); + } + } +} +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(neon) + +XXH_FORCE_INLINE void +XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { xxh_aliasing_uint64x2_t* xacc = (xxh_aliasing_uint64x2_t*) acc; + uint8_t const* xsecret = (uint8_t const*) secret; + + size_t i; + /* WASM uses operator overloads and doesn't need these. 
*/ +#ifndef __wasm_simd128__ + /* { prime32_1, prime32_1 } */ + uint32x2_t const kPrimeLo = vdup_n_u32(XXH_PRIME32_1); + /* { 0, prime32_1, 0, prime32_1 } */ + uint32x4_t const kPrimeHi = vreinterpretq_u32_u64(vdupq_n_u64((xxh_u64)XXH_PRIME32_1 << 32)); +#endif + + /* AArch64 uses both scalar and neon at the same time */ + for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { + XXH3_scalarScrambleRound(acc, secret, i); + } + for (i=0; i < XXH3_NEON_LANES / 2; i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + uint64x2_t acc_vec = xacc[i]; + uint64x2_t shifted = vshrq_n_u64(acc_vec, 47); + uint64x2_t data_vec = veorq_u64(acc_vec, shifted); + + /* xacc[i] ^= xsecret[i]; */ + uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16)); + uint64x2_t data_key = veorq_u64(data_vec, key_vec); + /* xacc[i] *= XXH_PRIME32_1 */ +#ifdef __wasm_simd128__ + /* SIMD128 has multiply by u64x2, use it instead of expanding and scalarizing */ + xacc[i] = data_key * XXH_PRIME32_1; +#else + /* + * Expanded version with portable NEON intrinsics + * + * lo(x) * lo(y) + (hi(x) * lo(y) << 32) + * + * prod_hi = hi(data_key) * lo(prime) << 32 + * + * Since we only need 32 bits of this multiply a trick can be used, reinterpreting the vector + * as a uint32x4_t and multiplying by { 0, prime, 0, prime } to cancel out the unwanted bits + * and avoid the shift. + */ + uint32x4_t prod_hi = vmulq_u32 (vreinterpretq_u32_u64(data_key), kPrimeHi); + /* Extract low bits for vmlal_u32 */ + uint32x2_t data_key_lo = vmovn_u64(data_key); + /* xacc[i] = prod_hi + lo(data_key) * XXH_PRIME32_1; */ + xacc[i] = vmlal_u32(vreinterpretq_u64_u32(prod_hi), data_key_lo, kPrimeLo); +#endif + } + } +} +#endif + +#if (XXH_VECTOR == XXH_VSX) + +XXH_FORCE_INLINE void +XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + /* presumed aligned */ + xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc; + xxh_u8 const* const xinput = (xxh_u8 const*) input; /* no alignment restriction */ + xxh_u8 const* const xsecret = (xxh_u8 const*) secret; /* no alignment restriction */ + xxh_u64x2 const v32 = { 32, 32 }; + size_t i; + for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* data_vec = xinput[i]; */ + xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + 16*i); + /* key_vec = xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + 16*i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + /* shuffled = (data_key << 32) | (data_key >> 32); */ + xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32); + /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */ + xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); + /* acc_vec = xacc[i]; */ + xxh_u64x2 acc_vec = xacc[i]; + acc_vec += product; + + /* swap high and low halves */ +#ifdef __s390x__ + acc_vec += vec_permi(data_vec, data_vec, 2); +#else + acc_vec += vec_xxpermdi(data_vec, data_vec, 2); +#endif + xacc[i] = acc_vec; + } +} +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(vsx) + +XXH_FORCE_INLINE void +XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc; + const xxh_u8* const xsecret = (const xxh_u8*) secret; + /* constants */ + xxh_u64x2 const v32 = { 32, 32 }; + xxh_u64x2 const v47 = { 47, 47 }; + xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 }; + size_t i; + for (i = 0; i < XXH_STRIPE_LEN / 
sizeof(xxh_u64x2); i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + xxh_u64x2 const acc_vec = xacc[i]; + xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47); + + /* xacc[i] ^= xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + 16*i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + + /* xacc[i] *= XXH_PRIME32_1 */ + /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */ + xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime); + /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */ + xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime); + xacc[i] = prod_odd + (prod_even << v32); + } } +} + +#endif + +#if (XXH_VECTOR == XXH_SVE) + +XXH_FORCE_INLINE void +XXH3_accumulate_512_sve( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + uint64_t *xacc = (uint64_t *)acc; + const uint64_t *xinput = (const uint64_t *)(const void *)input; + const uint64_t *xsecret = (const uint64_t *)(const void *)secret; + svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1); + uint64_t element_count = svcntd(); + if (element_count >= 8) { + svbool_t mask = svptrue_pat_b64(SV_VL8); + svuint64_t vacc = svld1_u64(mask, xacc); + ACCRND(vacc, 0); + svst1_u64(mask, xacc, vacc); + } else if (element_count == 2) { /* sve128 */ + svbool_t mask = svptrue_pat_b64(SV_VL2); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 2); + svuint64_t acc2 = svld1_u64(mask, xacc + 4); + svuint64_t acc3 = svld1_u64(mask, xacc + 6); + ACCRND(acc0, 0); + ACCRND(acc1, 2); + ACCRND(acc2, 4); + ACCRND(acc3, 6); + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 2, acc1); + svst1_u64(mask, xacc + 4, acc2); + svst1_u64(mask, xacc + 6, acc3); + } else { + svbool_t mask = svptrue_pat_b64(SV_VL4); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 4); + ACCRND(acc0, 0); + ACCRND(acc1, 4); + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 4, acc1); + } +} + +XXH_FORCE_INLINE void +XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, + size_t nbStripes) +{ + if (nbStripes != 0) { + uint64_t *xacc = (uint64_t *)acc; + const uint64_t *xinput = (const uint64_t *)(const void *)input; + const uint64_t *xsecret = (const uint64_t *)(const void *)secret; + svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1); + uint64_t element_count = svcntd(); + if (element_count >= 8) { + svbool_t mask = svptrue_pat_b64(SV_VL8); + svuint64_t vacc = svld1_u64(mask, xacc + 0); + do { + /* svprfd(svbool_t, void *, enum svfprop); */ + svprfd(mask, xinput + 128, SV_PLDL1STRM); + ACCRND(vacc, 0); + xinput += 8; + xsecret += 1; + nbStripes--; + } while (nbStripes != 0); + + svst1_u64(mask, xacc + 0, vacc); + } else if (element_count == 2) { /* sve128 */ + svbool_t mask = svptrue_pat_b64(SV_VL2); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 2); + svuint64_t acc2 = svld1_u64(mask, xacc + 4); + svuint64_t acc3 = svld1_u64(mask, xacc + 6); + do { + svprfd(mask, xinput + 128, SV_PLDL1STRM); + ACCRND(acc0, 0); + ACCRND(acc1, 2); + ACCRND(acc2, 4); + ACCRND(acc3, 6); + xinput += 8; + xsecret += 1; + nbStripes--; + } while (nbStripes != 0); + + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 2, acc1); + svst1_u64(mask, xacc + 4, acc2); + svst1_u64(mask, xacc + 6, acc3); + } else { + svbool_t mask = 
svptrue_pat_b64(SV_VL4); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 4); + do { + svprfd(mask, xinput + 128, SV_PLDL1STRM); + ACCRND(acc0, 0); + ACCRND(acc1, 4); + xinput += 8; + xsecret += 1; + nbStripes--; + } while (nbStripes != 0); + + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 4, acc1); + } + } +} + +#endif + +/* scalar variants - universal */ + +#if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__)) +/* + * In XXH3_scalarRound(), GCC and Clang have a similar codegen issue, where they + * emit an excess mask and a full 64-bit multiply-add (MADD X-form). + * + * While this might not seem like much, as AArch64 is a 64-bit architecture, only + * big Cortex designs have a full 64-bit multiplier. + * + * On the little cores, the smaller 32-bit multiplier is used, and full 64-bit + * multiplies expand to 2-3 multiplies in microcode. This has a major penalty + * of up to 4 latency cycles and 2 stall cycles in the multiply pipeline. + * + * Thankfully, AArch64 still provides the 32-bit long multiply-add (UMADDL) which does + * not have this penalty and does the mask automatically. + */ +XXH_FORCE_INLINE xxh_u64 +XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc) +{ + xxh_u64 ret; + /* note: %x = 64-bit register, %w = 32-bit register */ + __asm__("umaddl %x0, %w1, %w2, %x3" : "=r" (ret) : "r" (lhs), "r" (rhs), "r" (acc)); + return ret; +} +#else +XXH_FORCE_INLINE xxh_u64 +XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc) +{ + return XXH_mult32to64((xxh_u32)lhs, (xxh_u32)rhs) + acc; +} +#endif + +/*! + * @internal + * @brief Scalar round for @ref XXH3_accumulate_512_scalar(). + * + * This is extracted to its own function because the NEON path uses a combination + * of NEON and scalar. + */ +XXH_FORCE_INLINE void +XXH3_scalarRound(void* XXH_RESTRICT acc, + void const* XXH_RESTRICT input, + void const* XXH_RESTRICT secret, + size_t lane) +{ + xxh_u64* xacc = (xxh_u64*) acc; + xxh_u8 const* xinput = (xxh_u8 const*) input; + xxh_u8 const* xsecret = (xxh_u8 const*) secret; + XXH_ASSERT(lane < XXH_ACC_NB); + XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0); + { + xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8); + xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8); + xacc[lane ^ 1] += data_val; /* swap adjacent lanes */ + xacc[lane] = XXH_mult32to64_add64(data_key /* & 0xFFFFFFFF */, data_key >> 32, xacc[lane]); + } +} + +/*! + * @internal + * @brief Processes a 64 byte block of data using the scalar path. + */ +XXH_FORCE_INLINE void +XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + size_t i; + /* ARM GCC refuses to unroll this loop, resulting in a 24% slowdown on ARMv6. */ +#if defined(__GNUC__) && !defined(__clang__) \ + && (defined(__arm__) || defined(__thumb2__)) \ + && defined(__ARM_FEATURE_UNALIGNED) /* no unaligned access just wastes bytes */ \ + && XXH_SIZE_OPT <= 0 +# pragma GCC unroll 8 +#endif + for (i=0; i < XXH_ACC_NB; i++) { + XXH3_scalarRound(acc, input, secret, i); + } +} +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(scalar) + +/*! + * @internal + * @brief Scalar scramble step for @ref XXH3_scrambleAcc_scalar(). + * + * This is extracted to its own function because the NEON path uses a combination + * of NEON and scalar. 
+ */
+XXH_FORCE_INLINE void
+XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
+                         void const* XXH_RESTRICT secret,
+                         size_t lane)
+{
+    xxh_u64* const xacc = (xxh_u64*) acc;   /* presumed aligned */
+    const xxh_u8* const xsecret = (const xxh_u8*) secret;   /* no alignment restriction */
+    XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
+    XXH_ASSERT(lane < XXH_ACC_NB);
+    {
+        xxh_u64 const key64 = XXH_readLE64(xsecret + lane * 8);
+        xxh_u64 acc64 = xacc[lane];
+        acc64 = XXH_xorshift64(acc64, 47);
+        acc64 ^= key64;
+        acc64 *= XXH_PRIME32_1;
+        xacc[lane] = acc64;
+    }
+}
+
+/*!
+ * @internal
+ * @brief Scrambles the accumulators after a large chunk has been read
+ */
+XXH_FORCE_INLINE void
+XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    size_t i;
+    for (i=0; i < XXH_ACC_NB; i++) {
+        XXH3_scalarScrambleRound(acc, secret, i);
+    }
+}
+
+XXH_FORCE_INLINE void
+XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{
+    /*
+     * We need a separate pointer for the hack below,
+     * which requires a non-const pointer.
+     * Any decent compiler will optimize this out otherwise.
+     */
+    const xxh_u8* kSecretPtr = XXH3_kSecret;
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
+
+#if defined(__GNUC__) && defined(__aarch64__)
+    /*
+     * UGLY HACK:
+     * GCC and Clang generate a bunch of MOV/MOVK pairs for aarch64, and they are
+     * placed sequentially, in order, at the top of the unrolled loop.
+     *
+     * While MOVK is great for generating constants (2 cycles for a 64-bit
+     * constant compared to 4 cycles for LDR), it fights for bandwidth with
+     * the arithmetic instructions.
+     *
+     *   I   L   S
+     * MOVK
+     * MOVK
+     * MOVK
+     * MOVK
+     * ADD
+     * SUB      STR
+     *          STR
+     * By forcing loads from memory (as the asm line causes the compiler to assume
+     * that kSecretPtr has been changed), the pipelines are used more
+     * efficiently:
+     *   I   L   S
+     *      LDR
+     *  ADD LDR
+     *  SUB     STR
+     *          STR
+     *
+     * See XXH3_NEON_LANES for details on the pipeline.
+     *
+     * XXH3_64bits_withSeed, len == 256, Snapdragon 835
+     *   without hack: 2654.4 MB/s
+     *   with hack:    3202.9 MB/s
+     */
+    XXH_COMPILER_GUARD(kSecretPtr);
+#endif
+    {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
+        int i;
+        for (i=0; i < nbRounds; i++) {
+            /*
+             * The asm hack causes the compiler to assume that kSecretPtr aliases with
+             * customSecret, and on aarch64, this prevented LDP from merging two
+             * loads together for free. Putting the loads together before the stores
+             * properly generates LDP.
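+             *
+             * The derivation itself is simple (illustrative restatement of the
+             * loop below): for each 16-byte chunk i of the secret,
+             *     customSecret[16*i ..  +7] = kSecret[16*i ..  +7] + seed64
+             *     customSecret[16*i+8..+15] = kSecret[16*i+8..+15] - seed64
+             * as little-endian 64-bit lanes, matching the { seed, -seed }
+             * vectors used by the SIMD variants above.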
+ */ + xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64; + xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64; + XXH_writeLE64((xxh_u8*)customSecret + 16*i, lo); + XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi); + } } +} + + +typedef void (*XXH3_f_accumulate)(xxh_u64* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, size_t); +typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*); +typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64); + + +#if (XXH_VECTOR == XXH_AVX512) + +#define XXH3_accumulate_512 XXH3_accumulate_512_avx512 +#define XXH3_accumulate XXH3_accumulate_avx512 +#define XXH3_scrambleAcc XXH3_scrambleAcc_avx512 +#define XXH3_initCustomSecret XXH3_initCustomSecret_avx512 + +#elif (XXH_VECTOR == XXH_AVX2) + +#define XXH3_accumulate_512 XXH3_accumulate_512_avx2 +#define XXH3_accumulate XXH3_accumulate_avx2 +#define XXH3_scrambleAcc XXH3_scrambleAcc_avx2 +#define XXH3_initCustomSecret XXH3_initCustomSecret_avx2 + +#elif (XXH_VECTOR == XXH_SSE2) + +#define XXH3_accumulate_512 XXH3_accumulate_512_sse2 +#define XXH3_accumulate XXH3_accumulate_sse2 +#define XXH3_scrambleAcc XXH3_scrambleAcc_sse2 +#define XXH3_initCustomSecret XXH3_initCustomSecret_sse2 + +#elif (XXH_VECTOR == XXH_NEON) + +#define XXH3_accumulate_512 XXH3_accumulate_512_neon +#define XXH3_accumulate XXH3_accumulate_neon +#define XXH3_scrambleAcc XXH3_scrambleAcc_neon +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#elif (XXH_VECTOR == XXH_VSX) + +#define XXH3_accumulate_512 XXH3_accumulate_512_vsx +#define XXH3_accumulate XXH3_accumulate_vsx +#define XXH3_scrambleAcc XXH3_scrambleAcc_vsx +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#elif (XXH_VECTOR == XXH_SVE) +#define XXH3_accumulate_512 XXH3_accumulate_512_sve +#define XXH3_accumulate XXH3_accumulate_sve +#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#else /* scalar */ + +#define XXH3_accumulate_512 XXH3_accumulate_512_scalar +#define XXH3_accumulate XXH3_accumulate_scalar +#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#endif + +#if XXH_SIZE_OPT >= 1 /* don't do SIMD for initialization */ +# undef XXH3_initCustomSecret +# define XXH3_initCustomSecret XXH3_initCustomSecret_scalar +#endif + +XXH_FORCE_INLINE void +XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble) +{ + size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; + size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock; + size_t const nb_blocks = (len - 1) / block_len; + + size_t n; + + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + + for (n = 0; n < nb_blocks; n++) { + f_acc(acc, input + n*block_len, secret, nbStripesPerBlock); + f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN); + } + + /* last partial block */ + XXH_ASSERT(len > XXH_STRIPE_LEN); + { size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN; + XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE)); + f_acc(acc, input + nb_blocks*block_len, secret, nbStripes); + + /* last stripe */ + { const xxh_u8* const p = input + len - XXH_STRIPE_LEN; +#define XXH_SECRET_LASTACC_START 7 /* not aligned on 8, last secret is different from acc & scrambler */ + XXH3_accumulate_512(acc, p, 
secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
+    }   }
+}
+
+XXH_FORCE_INLINE xxh_u64
+XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret)
+{
+    return XXH3_mul128_fold64(
+               acc[0] ^ XXH_readLE64(secret),
+               acc[1] ^ XXH_readLE64(secret+8) );
+}
+
+static XXH64_hash_t
+XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start)
+{
+    xxh_u64 result64 = start;
+    size_t i = 0;
+
+    for (i = 0; i < 4; i++) {
+        result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i);
+#if defined(__clang__)                                /* Clang */ \
+    && (defined(__arm__) || defined(__thumb__))       /* ARMv7 */ \
+    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */  \
+    && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
+        /*
+         * UGLY HACK:
+         * Prevent autovectorization on Clang ARMv7-a. Exact same problem as
+         * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b.
+         * XXH3_64bits, len == 256, Snapdragon 835:
+         *   without hack: 2063.7 MB/s
+         *   with hack:    2560.7 MB/s
+         */
+        XXH_COMPILER_GUARD(result64);
+#endif
+    }
+
+    return XXH3_avalanche(result64);
+}
+
+#define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \
+                        XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 }
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,
+                           const void* XXH_RESTRICT secret, size_t secretSize,
+                           XXH3_f_accumulate f_acc,
+                           XXH3_f_scrambleAcc f_scramble)
+{
+    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
+
+    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc, f_scramble);
+
+    /* converge into final hash */
+    XXH_STATIC_ASSERT(sizeof(acc) == 64);
+    /* do not align on 8, so that the secret is different from the accumulator */
+#define XXH_SECRET_MERGEACCS_START 11
+    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+    return XXH3_mergeAccs(acc, (const xxh_u8*)secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1);
+}
+
+/*
+ * It's important for performance to transmit secret's size (when it's static)
+ * so that the compiler can properly optimize the vectorized loop.
+ * This makes a big performance difference for "medium" keys (<1 KB) when using the AVX instruction set.
+ * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE
+ * breaks -Og, this is XXH_NO_INLINE.
+ */
+XXH3_WITH_SECRET_INLINE XXH64_hash_t
+XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,
+                             XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)seed64;
+    return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+/*
+ * It's preferable for performance that XXH3_hashLong is not inlined,
+ * as it results in a smaller function for small data, easier on the instruction cache.
+ * Note that inside this no_inline function, we do inline the internal loop,
+ * and provide a statically defined secret size to allow optimization of the vector loop.
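+ *
+ * Concretely (illustrative): with a compile-time secret size, the
+ * nbStripesPerBlock computation in XXH3_hashLong_internal_loop() folds to
+ * a constant, so the stripe loop can be unrolled and the secret offsets
+ * become immediate operands.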
+ */ +XXH_NO_INLINE XXH_PUREF XXH64_hash_t +XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; (void)secret; (void)secretLen; + return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate, XXH3_scrambleAcc); +} + +/* + * XXH3_hashLong_64b_withSeed(): + * Generate a custom key based on alteration of default XXH3_kSecret with the seed, + * and then use this key for long mode hashing. + * + * This operation is decently fast but nonetheless costs a little bit of time. + * Try to avoid it whenever possible (typically when seed==0). + * + * It's important for performance that XXH3_hashLong is not inlined. Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_FORCE_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len, + XXH64_hash_t seed, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble, + XXH3_f_initCustomSecret f_initSec) +{ +#if XXH_SIZE_OPT <= 0 + if (seed == 0) + return XXH3_hashLong_64b_internal(input, len, + XXH3_kSecret, sizeof(XXH3_kSecret), + f_acc, f_scramble); +#endif + { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + f_initSec(secret, seed); + return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret), + f_acc, f_scramble); + } +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSeed(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) +{ + (void)secret; (void)secretLen; + return XXH3_hashLong_64b_withSeed_internal(input, len, seed, + XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret); +} + + +typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void* XXH_RESTRICT, size_t, + XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t); + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen, + XXH3_hashLong64_f f_hashLong) +{ + XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); + /* + * If an action is to be taken if `secretLen` condition is not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. + * Also, note that function signature doesn't offer room to return an error. + */ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + return f_hashLong(input, len, seed64, (const xxh_u8*)secret, secretLen); +} + + +/* === Public entry point === */ + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length) +{ + return XXH3_64bits_internal(input, length, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSecret(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize) +{ + return XXH3_64bits_internal(input, length, 0, secret, secretSize, XXH3_hashLong_64b_withSecret); +} + +/*! 
@ingroup XXH3_family */
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed)
+{
+    return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);
+}
+
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
+{
+    if (length <= XXH3_MIDSIZE_MAX)
+        return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
+    return XXH3_hashLong_64b_withSecret(input, length, seed, (const xxh_u8*)secret, secretSize);
+}
+
+
+/* === XXH3 streaming === */
+#ifndef XXH_NO_STREAM
+/*
+ * Allocates a pointer that is always aligned to `align`.
+ *
+ * This must be freed with `XXH_alignedFree()`.
+ *
+ * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte
+ * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2
+ * or, on 32-bit, the 16 byte aligned loads in SSE2 and NEON.
+ *
+ * This underalignment previously caused a rather obvious crash which went
+ * completely unnoticed due to XXH3_createState() not actually being tested.
+ * Credit to RedSpah for noticing this bug.
+ *
+ * The alignment is done manually: Functions like posix_memalign or _mm_malloc
+ * are avoided: To maintain portability, we would have to write a fallback
+ * like this anyways, and besides, testing for the existence of library
+ * functions without relying on external build tools is impossible.
+ *
+ * The method is simple: Overallocate, manually align, and store the offset
+ * to the original behind the returned pointer.
+ *
+ * Align must be a power of 2 and 8 <= align <= 128.
+ */
+static XXH_MALLOCF void* XXH_alignedMalloc(size_t s, size_t align)
+{
+    XXH_ASSERT(align <= 128 && align >= 8); /* range check */
+    XXH_ASSERT((align & (align-1)) == 0);   /* power of 2 */
+    XXH_ASSERT(s != 0 && s < (s + align));  /* empty/overflow */
+    {   /* Overallocate to make room for manual realignment and an offset byte */
+        xxh_u8* base = (xxh_u8*)XXH_malloc(s + align);
+        if (base != NULL) {
+            /*
+             * Get the offset needed to align this pointer.
+             *
+             * Even if the returned pointer is aligned, there will always be
+             * at least one byte to store the offset to the original pointer.
+             */
+            size_t offset = align - ((size_t)base & (align - 1)); /* base % align */
+            /* Add the offset for the now-aligned pointer */
+            xxh_u8* ptr = base + offset;
+
+            XXH_ASSERT((size_t)ptr % align == 0);
+
+            /* Store the offset immediately before the returned pointer. */
+            ptr[-1] = (xxh_u8)offset;
+            return ptr;
+        }
+        return NULL;
+    }
+}
+/*
+ * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass
+ * normal malloc'd pointers; XXH_alignedMalloc has a specific data layout.
+ */
+static void XXH_alignedFree(void* p)
+{
+    if (p != NULL) {
+        xxh_u8* ptr = (xxh_u8*)p;
+        /* Get the offset byte we added in XXH_alignedMalloc. */
+        xxh_u8 offset = ptr[-1];
+        /* Free the original malloc'd pointer */
+        xxh_u8* base = ptr - offset;
+        XXH_free(base);
+    }
+}
+/*! @ingroup XXH3_family */
+/*!
+ * @brief Allocate an @ref XXH3_state_t.
+ *
+ * Must be freed with XXH3_freeState().
+ * @return An allocated XXH3_state_t on success, `NULL` on failure.
+ */
+XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)
+{
+    XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);
+    if (state==NULL) return NULL;
+    XXH3_INITSTATE(state);
+    return state;
+}
+
+/*!
@ingroup XXH3_family */
+/*!
+ * @brief Frees an @ref XXH3_state_t.
+ *
+ * Must be allocated with XXH3_createState().
+ * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState().
+ * @return XXH_OK.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
+{
+    XXH_alignedFree(statePtr);
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API void
+XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state)
+{
+    XXH_memcpy(dst_state, src_state, sizeof(*dst_state));
+}
+
+static void
+XXH3_reset_internal(XXH3_state_t* statePtr,
+                    XXH64_hash_t seed,
+                    const void* secret, size_t secretSize)
+{
+    size_t const initStart = offsetof(XXH3_state_t, bufferedSize);
+    size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart;
+    XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart);
+    XXH_ASSERT(statePtr != NULL);
+    /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */
+    memset((char*)statePtr + initStart, 0, initLength);
+    statePtr->acc[0] = XXH_PRIME32_3;
+    statePtr->acc[1] = XXH_PRIME64_1;
+    statePtr->acc[2] = XXH_PRIME64_2;
+    statePtr->acc[3] = XXH_PRIME64_3;
+    statePtr->acc[4] = XXH_PRIME64_4;
+    statePtr->acc[5] = XXH_PRIME32_2;
+    statePtr->acc[6] = XXH_PRIME64_5;
+    statePtr->acc[7] = XXH_PRIME32_1;
+    statePtr->seed = seed;
+    statePtr->useSeed = (seed != 0);
+    statePtr->extSecret = (const unsigned char*)secret;
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+    statePtr->secretLimit = secretSize - XXH_STRIPE_LEN;
+    statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    /* validate the secret before touching the state */
+    if (secret == NULL) return XXH_ERROR;
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+    XXH3_reset_internal(statePtr, 0, secret, secretSize);
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    if (seed==0) return XXH3_64bits_reset(statePtr);
+    if ((seed != statePtr->seed) || (statePtr->extSecret != NULL))
+        XXH3_initCustomSecret(statePtr->customSecret, seed);
+    XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed64)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    if (secret == NULL) return XXH_ERROR;
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+    XXH3_reset_internal(statePtr, seed64, secret, secretSize);
+    statePtr->useSeed = 1; /* always, even if seed64==0 */
+    return XXH_OK;
+}
+
+/*!
+ * @internal
+ * @brief Processes a large input for XXH3_update() and XXH3_digest_long().
+ *
+ * Unlike XXH3_hashLong_internal_loop(), this can process inputs that span
+ * block boundaries.
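+ *
+ * For example (illustrative): with the default 192-byte secret,
+ * secretLimit = 192 - 64 = 128 and nbStripesPerBlock = 128 / 8 = 16,
+ * so a block is 16 stripes (1 KB) and the accumulators are scrambled
+ * once per 1 KB of input.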
+ *
+ * @param acc Pointer to the 8 accumulator lanes
+ * @param nbStripesSoFarPtr In/out pointer to the number of leftover stripes in the block
+ * @param nbStripesPerBlock Number of stripes in a block
+ * @param input Input pointer
+ * @param nbStripes Number of stripes to process
+ * @param secret Secret pointer
+ * @param secretLimit Offset of the last block in @p secret
+ * @param f_acc Pointer to an XXH3_accumulate implementation
+ * @param f_scramble Pointer to an XXH3_scrambleAcc implementation
+ * @return Pointer past the end of @p input after processing
+ */
+XXH_FORCE_INLINE const xxh_u8 *
+XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
+                    size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock,
+                    const xxh_u8* XXH_RESTRICT input, size_t nbStripes,
+                    const xxh_u8* XXH_RESTRICT secret, size_t secretLimit,
+                    XXH3_f_accumulate f_acc,
+                    XXH3_f_scrambleAcc f_scramble)
+{
+    const xxh_u8* initialSecret = secret + *nbStripesSoFarPtr * XXH_SECRET_CONSUME_RATE;
+    /* Process full blocks */
+    if (nbStripes >= (nbStripesPerBlock - *nbStripesSoFarPtr)) {
+        /* Process the initial partial block... */
+        size_t nbStripesThisIter = nbStripesPerBlock - *nbStripesSoFarPtr;
+
+        do {
+            /* Accumulate and scramble */
+            f_acc(acc, input, initialSecret, nbStripesThisIter);
+            f_scramble(acc, secret + secretLimit);
+            input += nbStripesThisIter * XXH_STRIPE_LEN;
+            nbStripes -= nbStripesThisIter;
+            /* Then continue the loop with the full block size */
+            nbStripesThisIter = nbStripesPerBlock;
+            initialSecret = secret;
+        } while (nbStripes >= nbStripesPerBlock);
+        *nbStripesSoFarPtr = 0;
+    }
+    /* Process a partial block */
+    if (nbStripes > 0) {
+        f_acc(acc, input, initialSecret, nbStripes);
+        input += nbStripes * XXH_STRIPE_LEN;
+        *nbStripesSoFarPtr += nbStripes;
+    }
+    /* Return end pointer */
+    return input;
+}
+
+#ifndef XXH3_STREAM_USE_STACK
+# if XXH_SIZE_OPT <= 0 && !defined(__clang__) /* clang doesn't need additional stack space */
+#   define XXH3_STREAM_USE_STACK 1
+# endif
+#endif
+/*
+ * Both XXH3_64bits_update and XXH3_128bits_update use this routine.
+ */
+XXH_FORCE_INLINE XXH_errorcode
+XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
+            const xxh_u8* XXH_RESTRICT input, size_t len,
+            XXH3_f_accumulate f_acc,
+            XXH3_f_scrambleAcc f_scramble)
+{
+    if (input==NULL) {
+        XXH_ASSERT(len == 0);
+        return XXH_OK;
+    }
+
+    XXH_ASSERT(state != NULL);
+    {   const xxh_u8* const bEnd = input + len;
+        const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
+        /* For some reason, gcc and MSVC seem to suffer greatly
+         * when operating on the accumulators directly in the state.
+         * Working in stack space seems to enable proper optimization.
+ * clang, on the other hand, doesn't seem to need this trick */ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8]; + XXH_memcpy(acc, state->acc, sizeof(acc)); +#else + xxh_u64* XXH_RESTRICT const acc = state->acc; +#endif + state->totalLen += len; + XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE); + + /* small input : just fill in tmp buffer */ + if (len <= XXH3_INTERNALBUFFER_SIZE - state->bufferedSize) { + XXH_memcpy(state->buffer + state->bufferedSize, input, len); + state->bufferedSize += (XXH32_hash_t)len; + return XXH_OK; + } + + /* total input is now > XXH3_INTERNALBUFFER_SIZE */ + #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN) + XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0); /* clean multiple */ + + /* + * Internal buffer is partially filled (always, except at beginning) + * Complete it, then consume it. + */ + if (state->bufferedSize) { + size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize; + XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize); + input += loadSize; + XXH3_consumeStripes(acc, + &state->nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, XXH3_INTERNALBUFFER_STRIPES, + secret, state->secretLimit, + f_acc, f_scramble); + state->bufferedSize = 0; + } + XXH_ASSERT(input < bEnd); + if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) { + size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN; + input = XXH3_consumeStripes(acc, + &state->nbStripesSoFar, state->nbStripesPerBlock, + input, nbStripes, + secret, state->secretLimit, + f_acc, f_scramble); + XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN); + + } + /* Some remaining input (always) : buffer it */ + XXH_ASSERT(input < bEnd); + XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE); + XXH_ASSERT(state->bufferedSize == 0); + XXH_memcpy(state->buffer, input, (size_t)(bEnd-input)); + state->bufferedSize = (XXH32_hash_t)(bEnd-input); +#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1 + /* save stack accumulators into state */ + XXH_memcpy(state->acc, acc, sizeof(acc)); +#endif + } + + return XXH_OK; +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len) +{ + return XXH3_update(state, (const xxh_u8*)input, len, + XXH3_accumulate, XXH3_scrambleAcc); +} + + +XXH_FORCE_INLINE void +XXH3_digest_long (XXH64_hash_t* acc, + const XXH3_state_t* state, + const unsigned char* secret) +{ + xxh_u8 lastStripe[XXH_STRIPE_LEN]; + const xxh_u8* lastStripePtr; + + /* + * Digest on a local copy. This way, the state remains unaltered, and it can + * continue ingesting more input afterwards. 
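+ * This is what allows an intermediate XXH3_64bits_digest() (or the 128-bit
+ * variant) to be taken mid-stream, with further update calls continuing
+ * from the same state.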
+ */ + XXH_memcpy(acc, state->acc, sizeof(state->acc)); + if (state->bufferedSize >= XXH_STRIPE_LEN) { + /* Consume remaining stripes then point to remaining data in buffer */ + size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN; + size_t nbStripesSoFar = state->nbStripesSoFar; + XXH3_consumeStripes(acc, + &nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, nbStripes, + secret, state->secretLimit, + XXH3_accumulate, XXH3_scrambleAcc); + lastStripePtr = state->buffer + state->bufferedSize - XXH_STRIPE_LEN; + } else { /* bufferedSize < XXH_STRIPE_LEN */ + /* Copy to temp buffer */ + size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize; + XXH_ASSERT(state->bufferedSize > 0); /* there is always some input buffered */ + XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize); + XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); + lastStripePtr = lastStripe; + } + /* Last stripe */ + XXH3_accumulate_512(acc, + lastStripePtr, + secret + state->secretLimit - XXH_SECRET_LASTACC_START); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* state) +{ + const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; + if (state->totalLen > XXH3_MIDSIZE_MAX) { + XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB]; + XXH3_digest_long(acc, state, secret); + return XXH3_mergeAccs(acc, + secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)state->totalLen * XXH_PRIME64_1); + } + /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */ + if (state->useSeed) + return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); + return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), + secret, state->secretLimit + XXH_STRIPE_LEN); +} +#endif /* !XXH_NO_STREAM */ + + +/* ========================================== + * XXH3 128 bits (a.k.a XXH128) + * ========================================== + * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant, + * even without counting the significantly larger output size. + * + * For example, extra steps are taken to avoid the seed-dependent collisions + * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B). + * + * This strength naturally comes at the cost of some speed, especially on short + * lengths. Note that longer hashes are about as fast as the 64-bit version + * due to it using only a slight modification of the 64-bit loop. + * + * XXH128 is also more oriented towards 64-bit machines. It is still extremely + * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64). + */ + +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + /* A doubled version of 1to3_64b with different constants. 
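+ * The packed 32-bit word below (combinedl) and a byte-swapped, rotated
+ * variant of it (combinedh) are keyed with independent bitflips, producing
+ * two independently avalanched 64-bit lanes.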
*/ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } + * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } + * len = 3: combinedl = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13); + xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed; + xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl; + xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph; + XXH128_hash_t h128; + h128.low64 = XXH64_avalanche(keyed_lo); + h128.high64 = XXH64_avalanche(keyed_hi); + return h128; + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len <= 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input_lo = XXH_readLE32(input); + xxh_u32 const input_hi = XXH_readLE32(input + len - 4); + xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32); + xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed; + xxh_u64 const keyed = input_64 ^ bitflip; + + /* Shift len to the left to ensure it is even, this avoids even multiplies. */ + XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2)); + + m128.high64 += (m128.low64 << 1); + m128.low64 ^= (m128.high64 >> 3); + + m128.low64 = XXH_xorshift64(m128.low64, 35); + m128.low64 *= PRIME_MX2; + m128.low64 = XXH_xorshift64(m128.low64, 28); + m128.high64 = XXH3_avalanche(m128.high64); + return m128; + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(9 <= len && len <= 16); + { xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed; + xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed; + xxh_u64 const input_lo = XXH_readLE64(input); + xxh_u64 input_hi = XXH_readLE64(input + len - 8); + XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1); + /* + * Put len in the middle of m128 to ensure that the length gets mixed to + * both the low and high bits in the 128x64 multiply below. + */ + m128.low64 += (xxh_u64)(len - 1) << 54; + input_hi ^= bitfliph; + /* + * Add the high 32 bits of input_hi to the high 32 bits of m128, then + * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to + * the high 64 bits of m128. + * + * The best approach to this operation is different on 32-bit and 64-bit. + */ + if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */ + /* + * 32-bit optimized version, which is more readable. + * + * On 32-bit, it removes an ADC and delays a dependency between the two + * halves of m128.high64, but it generates an extra mask on 64-bit. + */ + m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2); + } else { + /* + * 64-bit optimized (albeit more confusing) version. 
+ * + * Uses some properties of addition and multiplication to remove the mask: + * + * Let: + * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF) + * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000) + * c = XXH_PRIME32_2 + * + * a + (b * c) + * Inverse Property: x + y - x == y + * a + (b * (1 + c - 1)) + * Distributive Property: x * (y + z) == (x * y) + (x * z) + * a + (b * 1) + (b * (c - 1)) + * Identity Property: x * 1 == x + * a + b + (b * (c - 1)) + * + * Substitute a, b, and c: + * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) + * + * Since input_hi.hi + input_hi.lo == input_hi, we get this: + * input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) + */ + m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1); + } + /* m128 ^= XXH_swap64(m128 >> 64); */ + m128.low64 ^= XXH_swap64(m128.high64); + + { /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */ + XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2); + h128.high64 += m128.high64 * XXH_PRIME64_2; + + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = XXH3_avalanche(h128.high64); + return h128; + } } +} + +/* + * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN + */ +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(len <= 16); + { if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed); + if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed); + if (len) return XXH3_len_1to3_128b(input, len, secret, seed); + { XXH128_hash_t h128; + xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72); + xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88); + h128.low64 = XXH64_avalanche(seed ^ bitflipl); + h128.high64 = XXH64_avalanche( seed ^ bitfliph); + return h128; + } } +} + +/* + * A bit slower than XXH3_mix16B, but handles multiply by zero better. + */ +XXH_FORCE_INLINE XXH128_hash_t +XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, + const xxh_u8* secret, XXH64_hash_t seed) +{ + acc.low64 += XXH3_mix16B (input_1, secret+0, seed); + acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8); + acc.high64 += XXH3_mix16B (input_2, secret+16, seed); + acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8); + return acc; +} + + +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(16 < len && len <= 128); + + { XXH128_hash_t acc; + acc.low64 = len * XXH_PRIME64_1; + acc.high64 = 0; + +#if XXH_SIZE_OPT >= 1 + { + /* Smaller, but slightly slower. 
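+ * Walking `i` downward mixes the same 32-byte pairs as the unrolled #else
+ * branch below (input+16*i paired with input+len-16*(i+1), using
+ * secret+32*i), innermost pair first, outermost pair last.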
*/
+ unsigned int i = (unsigned int)(len - 1) / 32;
+ do {
+ acc = XXH128_mix32B(acc, input+16*i, input+len-16*(i+1), secret+32*i, seed);
+ } while (i-- != 0);
+ }
+#else
+ if (len > 32) {
+ if (len > 64) {
+ if (len > 96) {
+ acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed);
+ }
+ acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed);
+ }
+ acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
+ }
+ acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);
+#endif
+ { XXH128_hash_t h128;
+ h128.low64 = acc.low64 + acc.high64;
+ h128.high64 = (acc.low64 * XXH_PRIME64_1)
+ + (acc.high64 * XXH_PRIME64_4)
+ + ((len - seed) * XXH_PRIME64_2);
+ h128.low64 = XXH3_avalanche(h128.low64);
+ h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
+ return h128;
+ }
+ }
+}
+
+XXH_NO_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
+ const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+ XXH64_hash_t seed)
+{
+ XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+ XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+
+ { XXH128_hash_t acc;
+ unsigned i;
+ acc.low64 = len * XXH_PRIME64_1;
+ acc.high64 = 0;
+ /*
+ * We set `i` to offset + 32. We do this so that unchanged
+ * `len` can be used as upper bound. This reaches a sweet spot
+ * where both x86 and aarch64 get simple address generation and
+ * good codegen for the loop.
+ */
+ for (i = 32; i < 160; i += 32) {
+ acc = XXH128_mix32B(acc,
+ input + i - 32,
+ input + i - 16,
+ secret + i - 32,
+ seed);
+ }
+ acc.low64 = XXH3_avalanche(acc.low64);
+ acc.high64 = XXH3_avalanche(acc.high64);
+ /*
+ * NB: `i <= len` will duplicate the last 32 bytes if
+ * len % 32 is zero. This is an unfortunate necessity to keep
+ * the hash result stable.
+ */
+ for (i=160; i <= len; i += 32) {
+ acc = XXH128_mix32B(acc,
+ input + i - 32,
+ input + i - 16,
+ secret + XXH3_MIDSIZE_STARTOFFSET + i - 160,
+ seed);
+ }
+ /* last bytes */
+ acc = XXH128_mix32B(acc,
+ input + len - 16,
+ input + len - 32,
+ secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
+ (XXH64_hash_t)0 - seed);
+
+ { XXH128_hash_t h128;
+ h128.low64 = acc.low64 + acc.high64;
+ h128.high64 = (acc.low64 * XXH_PRIME64_1)
+ + (acc.high64 * XXH_PRIME64_4)
+ + ((len - seed) * XXH_PRIME64_2);
+ h128.low64 = XXH3_avalanche(h128.low64);
+ h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
+ return h128;
+ }
+ }
+}
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len,
+ const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+ XXH3_f_accumulate f_acc,
+ XXH3_f_scrambleAcc f_scramble)
+{
+ XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
+
+ XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc, f_scramble);
+
+ /* converge into final hash */
+ XXH_STATIC_ASSERT(sizeof(acc) == 64);
+ XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+ { XXH128_hash_t h128;
+ h128.low64 = XXH3_mergeAccs(acc,
+ secret + XXH_SECRET_MERGEACCS_START,
+ (xxh_u64)len * XXH_PRIME64_1);
+ h128.high64 = XXH3_mergeAccs(acc,
+ secret + secretSize
+ - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
+ ~((xxh_u64)len * XXH_PRIME64_2));
+ return h128;
+ }
+}
+
+/*
+ * It's important for performance that XXH3_hashLong() is not inlined.
+ */ +XXH_NO_INLINE XXH_PUREF XXH128_hash_t +XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + const void* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; (void)secret; (void)secretLen; + return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), + XXH3_accumulate, XXH3_scrambleAcc); +} + +/* + * It's important for performance to pass @p secretLen (when it's static) + * to the compiler, so that it can properly optimize the vectorized loop. + * + * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE + * breaks -Og, this is XXH_NO_INLINE. + */ +XXH3_WITH_SECRET_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + const void* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; + return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen, + XXH3_accumulate, XXH3_scrambleAcc); +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble, + XXH3_f_initCustomSecret f_initSec) +{ + if (seed64 == 0) + return XXH3_hashLong_128b_internal(input, len, + XXH3_kSecret, sizeof(XXH3_kSecret), + f_acc, f_scramble); + { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + f_initSec(secret, seed64); + return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret), + f_acc, f_scramble); + } +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. + */ +XXH_NO_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSeed(const void* input, size_t len, + XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen) +{ + (void)secret; (void)secretLen; + return XXH3_hashLong_128b_withSeed_internal(input, len, seed64, + XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret); +} + +typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t, + XXH64_hash_t, const void* XXH_RESTRICT, size_t); + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_128bits_internal(const void* input, size_t len, + XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen, + XXH3_hashLong128_f f_hl128) +{ + XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); + /* + * If an action is to be taken if `secret` conditions are not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. + */ + if (len <= 16) + return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64); + if (len <= 128) + return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + return f_hl128(input, len, seed64, secret, secretLen); +} + + +/* === Public XXH128 API === */ + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* input, size_t len) +{ + return XXH3_128bits_internal(input, len, 0, + XXH3_kSecret, sizeof(XXH3_kSecret), + XXH3_hashLong_128b_default); +} + +/*! 
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSecret(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize) +{ + return XXH3_128bits_internal(input, len, 0, + (const xxh_u8*)secret, secretSize, + XXH3_hashLong_128b_withSecret); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSeed(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed) +{ + return XXH3_128bits_internal(input, len, seed, + XXH3_kSecret, sizeof(XXH3_kSecret), + XXH3_hashLong_128b_withSeed); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed) +{ + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL); + return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH128(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed) +{ + return XXH3_128bits_withSeed(input, len, seed); +} + + +/* === XXH3 128-bit streaming === */ +#ifndef XXH_NO_STREAM +/* + * All initialization and update functions are identical to 64-bit streaming variant. + * The only difference is the finalization routine. + */ + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr) +{ + return XXH3_64bits_reset(statePtr); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize) +{ + return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed) +{ + return XXH3_64bits_reset_withSeed(statePtr, seed); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed) +{ + return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len) +{ + return XXH3_64bits_update(state, input, len); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* state) +{ + const unsigned char* const secret = (state->extSecret == NULL) ? 
state->customSecret : state->extSecret;
+ if (state->totalLen > XXH3_MIDSIZE_MAX) {
+ XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
+ XXH3_digest_long(acc, state, secret);
+ XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+ { XXH128_hash_t h128;
+ h128.low64 = XXH3_mergeAccs(acc,
+ secret + XXH_SECRET_MERGEACCS_START,
+ (xxh_u64)state->totalLen * XXH_PRIME64_1);
+ h128.high64 = XXH3_mergeAccs(acc,
+ secret + state->secretLimit + XXH_STRIPE_LEN
+ - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
+ ~((xxh_u64)state->totalLen * XXH_PRIME64_2));
+ return h128;
+ }
+ }
+ /* len <= XXH3_MIDSIZE_MAX : short code */
+ if (state->seed)
+ return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+ return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
+ secret, state->secretLimit + XXH_STRIPE_LEN);
+}
+#endif /* !XXH_NO_STREAM */
+/* 128-bit utility functions */
+
+#include <string.h> /* memcmp, memcpy */
+
+/* return : 1 if equal, 0 if different */
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
+{
+ /* note : XXH128_hash_t is compact, it has no padding byte */
+ return !(memcmp(&h1, &h2, sizeof(h1)));
+}
+
+/* This prototype is compatible with stdlib's qsort().
+ * @return : >0 if *h128_1 > *h128_2
+ * <0 if *h128_1 < *h128_2
+ * =0 if *h128_1 == *h128_2 */
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2)
+{
+ XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
+ XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
+ int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
+ /* note : this bets that, in most cases, hash values are different */
+ if (hcmp) return hcmp;
+ return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
+}
+
+
+/*====== Canonical representation ======*/
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API void
+XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash)
+{
+ XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
+ if (XXH_CPU_LITTLE_ENDIAN) {
+ hash.high64 = XXH_swap64(hash.high64);
+ hash.low64 = XXH_swap64(hash.low64);
+ }
+ XXH_memcpy(dst, &hash.high64, sizeof(hash.high64));
+ XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src)
+{
+ XXH128_hash_t h;
+ h.high64 = XXH_readBE64(src);
+ h.low64 = XXH_readBE64(src->digest + 8);
+ return h;
+}
+
+
+
+/* ==========================================
+ * Secret generators
+ * ==========================================
+ */
+#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
+
+XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128)
+{
+ XXH_writeLE64( dst, XXH_readLE64(dst) ^ h128.low64 );
+ XXH_writeLE64( (char*)dst+8, XXH_readLE64((char*)dst+8) ^ h128.high64 );
+}
+
+/*!
@ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize)
+{
+#if (XXH_DEBUGLEVEL >= 1)
+ XXH_ASSERT(secretBuffer != NULL);
+ XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+#else
+ /* production mode, asserts are disabled */
+ if (secretBuffer == NULL) return XXH_ERROR;
+ if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+#endif
+
+ if (customSeedSize == 0) {
+ customSeed = XXH3_kSecret;
+ customSeedSize = XXH_SECRET_DEFAULT_SIZE;
+ }
+#if (XXH_DEBUGLEVEL >= 1)
+ XXH_ASSERT(customSeed != NULL);
+#else
+ if (customSeed == NULL) return XXH_ERROR;
+#endif
+
+ /* Fill secretBuffer with a copy of customSeed - repeat as needed */
+ { size_t pos = 0;
+ while (pos < secretSize) {
+ size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize);
+ memcpy((char*)secretBuffer + pos, customSeed, toCopy);
+ pos += toCopy;
+ } }
+
+ { size_t const nbSeg16 = secretSize / 16;
+ size_t n;
+ XXH128_canonical_t scrambler;
+ XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));
+ for (n=0; n<nbSeg16; n++) {
+ XXH128_hash_t const h128 = XXH128(&scrambler, sizeof(scrambler), n);
+ XXH3_combine16((char*)secretBuffer + n*16, h128);
+ }
+ /* last segment */
+ XXH3_combine16((char*)secretBuffer + secretSize - 16, XXH128_hashFromCanonical(&scrambler));
+ }
+ return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API void
+XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed)
+{
+ XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
+ XXH3_initCustomSecret(secret, seed);
+ XXH_ASSERT(secretBuffer != NULL);
+ memcpy(secretBuffer, secret, XXH_SECRET_DEFAULT_SIZE);
+}
+
+
+
+/* Pop our optimization override from above */
+#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
+ && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+ && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */
+# pragma GCC pop_options
+#endif
+
+#endif /* XXH_NO_LONG_LONG */
+
+#endif /* XXH_NO_XXH3 */
+
+/*!
+ * @}
+ */
+#endif /* XXH_IMPLEMENTATION */
+
+
+#if defined (__cplusplus)
+} /* extern "C" */
+#endif
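How the streaming entry points above fit together from the caller's side, as a minimal sketch: it uses only public functions defined in this header, and the `"xxhash.h"` include path is an assumption about how the vendored copy is reached.

```c
#include <stdio.h>
#include "xxhash.h" /* assumed include path for the vendored header */

int main(void)
{
    /* One-shot and streaming hashes of the same bytes must agree. */
    const char msg[] = "name value 12345";
    XXH64_hash_t const oneshot = XXH3_64bits_withSeed(msg, sizeof(msg) - 1, 42);

    XXH3_state_t* const state = XXH3_createState();
    if (state == NULL) return 1;
    if (XXH3_64bits_reset_withSeed(state, 42) != XXH_OK) return 1;

    /* Feed the input in two chunks; XXH3_update() buffers them and
     * consumes whole stripes via XXH3_consumeStripes() as shown above. */
    XXH3_64bits_update(state, msg, 5);
    XXH3_64bits_update(state, msg + 5, sizeof(msg) - 1 - 5);

    XXH64_hash_t const streamed = XXH3_64bits_digest(state);
    XXH3_freeState(state);

    printf("%s\n", oneshot == streamed ? "match" : "MISMATCH");
    return oneshot == streamed ? 0 : 1;
}
```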
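The 128-bit one-shot entry points and utility functions above can be exercised the same way; a sketch under the same include-path assumption, showing the equality helper and the endian-stable canonical round trip.

```c
#include <stdio.h>
#include "xxhash.h" /* assumed include path for the vendored header */

int main(void)
{
    const char data[] = "an arbitrary payload; any length works";
    XXH128_hash_t const h = XXH3_128bits_withSeed(data, sizeof(data) - 1, 7);

    /* XXH128() is the alias defined above for XXH3_128bits_withSeed(). */
    XXH128_hash_t const again = XXH128(data, sizeof(data) - 1, 7);
    printf("equal: %d\n", XXH128_isEqual(h, again));

    /* Canonical (big-endian) form, e.g. for storing a hash on disk. */
    XXH128_canonical_t canon;
    XXH128_canonicalFromHash(&canon, h);
    XXH128_hash_t const back = XXH128_hashFromCanonical(&canon);
    printf("roundtrip ok: %d\n", XXH128_isEqual(h, back));
    return 0;
}
```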
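XXH3_generateSecret() above turns seed material of any size and quality into a full-size secret that the `_withSecret` variants can consume; another sketch under the same assumption, using the smallest buffer the function accepts (XXH3_SECRET_SIZE_MIN bytes).

```c
#include <stdio.h>
#include "xxhash.h" /* assumed include path for the vendored header */

int main(void)
{
    /* Derive a full-size secret from a short, low-entropy passphrase. */
    unsigned char secret[XXH3_SECRET_SIZE_MIN];
    const char passphrase[] = "not very random";
    if (XXH3_generateSecret(secret, sizeof(secret),
                            passphrase, sizeof(passphrase) - 1) != XXH_OK)
        return 1;

    /* Use it with the secret-consuming variants defined above. */
    const char msg[] = "hello";
    XXH64_hash_t const h64 = XXH3_64bits_withSecret(msg, sizeof(msg) - 1,
                                                    secret, sizeof(secret));
    XXH128_hash_t const h128 = XXH3_128bits_withSecret(msg, sizeof(msg) - 1,
                                                       secret, sizeof(secret));
    printf("%016llx %016llx%016llx\n",
           (unsigned long long)h64,
           (unsigned long long)h128.high64,
           (unsigned long long)h128.low64);
    return 0;
}
```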