diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2022-06-09 04:52:47 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2022-06-09 04:52:57 +0000 |
commit | 00151562145df50cc65e9902d52d5fa77f89fe50 (patch) | |
tree | 2737716802f6725a5074d606ec8fe5422c58a83c /database | |
parent | Releasing debian version 1.34.1-1. (diff) | |
download | netdata-00151562145df50cc65e9902d52d5fa77f89fe50.tar.xz netdata-00151562145df50cc65e9902d52d5fa77f89fe50.zip |
Merging upstream version 1.35.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
33 files changed, 2727 insertions, 699 deletions
diff --git a/database/KolmogorovSmirnovDist.c b/database/KolmogorovSmirnovDist.c new file mode 100644 index 000000000..1486abc7b --- /dev/null +++ b/database/KolmogorovSmirnovDist.c @@ -0,0 +1,788 @@ +// SPDX-License-Identifier: GPL-3.0 + +/******************************************************************** + * + * File: KolmogorovSmirnovDist.c + * Environment: ISO C99 or ANSI C89 + * Author: Richard Simard + * Organization: DIRO, Université de Montréal + * Date: 1 February 2012 + * Version 1.1 + + * Copyright 1 march 2010 by Université de Montréal, + Richard Simard and Pierre L'Ecuyer + ===================================================================== + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, version 3 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + + =====================================================================*/ + +#include "KolmogorovSmirnovDist.h" +#include <math.h> +#include <stdlib.h> + +#define num_Pi 3.14159265358979323846 /* PI */ +#define num_Ln2 0.69314718055994530941 /* log(2) */ + +/* For x close to 0 or 1, we use the exact formulae of Ruben-Gambino in all + cases. For n <= NEXACT, we use exact algorithms: the Durbin matrix and + the Pomeranz algorithms. For n > NEXACT, we use asymptotic methods + except for x close to 0 where we still use the method of Durbin + for n <= NKOLMO. For n > NKOLMO, we use asymptotic methods only and + so the precision is less for x close to 0. 
+ We could increase the limit NKOLMO to 10^6 to get better precision + for x close to 0, but at the price of a slower speed. */ +#define NEXACT 500 +#define NKOLMO 100000 + +/* The Durbin matrix algorithm for the Kolmogorov-Smirnov distribution */ +static double DurbinMatrix (int n, double d); + + +/*========================================================================*/ +#if 0 + +/* For ANSI C89 only, not for ISO C99 */ +#define MAXI 50 +#define EPSILON 1.0e-15 + +double log1p (double x) +{ + /* returns a value equivalent to log(1 + x) accurate also for small x. */ + if (fabs (x) > 0.1) { + return log (1.0 + x); + } else { + double term = x; + double sum = x; + int s = 2; + while ((fabs (term) > EPSILON * fabs (sum)) && (s < MAXI)) { + term *= -x; + sum += term / s; + s++; + } + return sum; + } +} + +#undef MAXI +#undef EPSILON + +#endif + +/*========================================================================*/ +#define MFACT 30 + +/* The natural logarithm of factorial n! for 0 <= n <= MFACT */ +static double LnFactorial[MFACT + 1] = { + 0., + 0., + 0.6931471805599453, + 1.791759469228055, + 3.178053830347946, + 4.787491742782046, + 6.579251212010101, + 8.525161361065415, + 10.60460290274525, + 12.80182748008147, + 15.10441257307552, + 17.50230784587389, + 19.98721449566188, + 22.55216385312342, + 25.19122118273868, + 27.89927138384088, + 30.67186010608066, + 33.50507345013688, + 36.39544520803305, + 39.33988418719949, + 42.33561646075348, + 45.3801388984769, + 48.47118135183522, + 51.60667556776437, + 54.7847293981123, + 58.00360522298051, + 61.26170176100199, + 64.55753862700632, + 67.88974313718154, + 71.257038967168, + 74.65823634883016 +}; + +/*------------------------------------------------------------------------*/ + +static double getLogFactorial (int n) +{ + /* Returns the natural logarithm of factorial n! 
*/ + if (n <= MFACT) { + return LnFactorial[n]; + + } else { + double x = (double) (n + 1); + double y = 1.0 / (x * x); + double z = ((-(5.95238095238E-4 * y) + 7.936500793651E-4) * y - + 2.7777777777778E-3) * y + 8.3333333333333E-2; + z = ((x - 0.5) * log (x) - x) + 9.1893853320467E-1 + z / x; + return z; + } +} + +/*------------------------------------------------------------------------*/ + +static double rapfac (int n) +{ + /* Computes n! / n^n */ + int i; + double res = 1.0 / n; + for (i = 2; i <= n; i++) { + res *= (double) i / n; + } + return res; +} + + +/*========================================================================*/ + +static double **CreateMatrixD (int N, int M) +{ + int i; + double **T2; + + T2 = (double **) malloc (N * sizeof (double *)); + T2[0] = (double *) malloc ((size_t) N * M * sizeof (double)); + for (i = 1; i < N; i++) + T2[i] = T2[0] + i * M; + return T2; +} + + +static void DeleteMatrixD (double **T) +{ + free (T[0]); + free (T); +} + + +/*========================================================================*/ + +static double KSPlusbarAsymp (int n, double x) +{ + /* Compute the probability of the KS+ distribution using an asymptotic + formula */ + double t = (6.0 * n * x + 1); + double z = t * t / (18.0 * n); + double v = 1.0 - (2.0 * z * z - 4.0 * z - 1.0) / (18.0 * n); + if (v <= 0.0) + return 0.0; + v = v * exp (-z); + if (v >= 1.0) + return 1.0; + return v; +} + + +/*-------------------------------------------------------------------------*/ + +static double KSPlusbarUpper (int n, double x) +{ + /* Compute the probability of the KS+ distribution in the upper tail using + Smirnov's stable formula */ + const double EPSILON = 1.0E-12; + double q; + double Sum = 0.0; + double term; + double t; + double LogCom; + double LOGJMAX; + int j; + int jdiv; + int jmax = (int) (n * (1.0 - x)); + + if (n > 200000) + return KSPlusbarAsymp (n, x); + + /* Avoid log(0) for j = jmax and q ~ 1.0 */ + if ((1.0 - x - (double) jmax / n) <= 0.0) + 
jmax--; + + if (n > 3000) + jdiv = 2; + else + jdiv = 3; + + j = jmax / jdiv + 1; + LogCom = getLogFactorial (n) - getLogFactorial (j) - + getLogFactorial (n - j); + LOGJMAX = LogCom; + + while (j <= jmax) { + q = (double) j / n + x; + term = LogCom + (j - 1) * log (q) + (n - j) * log1p (-q); + t = exp (term); + Sum += t; + LogCom += log ((double) (n - j) / (j + 1)); + if (t <= Sum * EPSILON) + break; + j++; + } + + j = jmax / jdiv; + LogCom = LOGJMAX + log ((double) (j + 1) / (n - j)); + + while (j > 0) { + q = (double) j / n + x; + term = LogCom + (j - 1) * log (q) + (n - j) * log1p (-q); + t = exp (term); + Sum += t; + LogCom += log ((double) j / (n - j + 1)); + if (t <= Sum * EPSILON) + break; + j--; + } + + Sum *= x; + /* add the term j = 0 */ + Sum += exp (n * log1p (-x)); + return Sum; +} + + +/*========================================================================*/ + +static double Pelz (int n, double x) +{ + /* Approximating the Lower Tail-Areas of the Kolmogorov-Smirnov One-Sample + Statistic, + Wolfgang Pelz and I. J. Good, + Journal of the Royal Statistical Society, Series B. + Vol. 38, No. 2 (1976), pp. 
152-156 + */ + + const int JMAX = 20; + const double EPS = 1.0e-10; + const double C = 2.506628274631001; /* sqrt(2*Pi) */ + const double C2 = 1.2533141373155001; /* sqrt(Pi/2) */ + const double PI2 = num_Pi * num_Pi; + const double PI4 = PI2 * PI2; + const double RACN = sqrt ((double) n); + const double z = RACN * x; + const double z2 = z * z; + const double z4 = z2 * z2; + const double z6 = z4 * z2; + const double w = PI2 / (2.0 * z * z); + double ti, term, tom; + double sum; + int j; + + term = 1; + j = 0; + sum = 0; + while (j <= JMAX && term > EPS * sum) { + ti = j + 0.5; + term = exp (-ti * ti * w); + sum += term; + j++; + } + sum *= C / z; + + term = 1; + tom = 0; + j = 0; + while (j <= JMAX && fabs (term) > EPS * fabs (tom)) { + ti = j + 0.5; + term = (PI2 * ti * ti - z2) * exp (-ti * ti * w); + tom += term; + j++; + } + sum += tom * C2 / (RACN * 3.0 * z4); + + term = 1; + tom = 0; + j = 0; + while (j <= JMAX && fabs (term) > EPS * fabs (tom)) { + ti = j + 0.5; + term = 6 * z6 + 2 * z4 + PI2 * (2 * z4 - 5 * z2) * ti * ti + + PI4 * (1 - 2 * z2) * ti * ti * ti * ti; + term *= exp (-ti * ti * w); + tom += term; + j++; + } + sum += tom * C2 / (n * 36.0 * z * z6); + + term = 1; + tom = 0; + j = 1; + while (j <= JMAX && term > EPS * tom) { + ti = j; + term = PI2 * ti * ti * exp (-ti * ti * w); + tom += term; + j++; + } + sum -= tom * C2 / (n * 18.0 * z * z2); + + term = 1; + tom = 0; + j = 0; + while (j <= JMAX && fabs (term) > EPS * fabs (tom)) { + ti = j + 0.5; + ti = ti * ti; + term = -30 * z6 - 90 * z6 * z2 + PI2 * (135 * z4 - 96 * z6) * ti + + PI4 * (212 * z4 - 60 * z2) * ti * ti + PI2 * PI4 * ti * ti * ti * (5 - + 30 * z2); + term *= exp (-ti * w); + tom += term; + j++; + } + sum += tom * C2 / (RACN * n * 3240.0 * z4 * z6); + + term = 1; + tom = 0; + j = 1; + while (j <= JMAX && fabs (term) > EPS * fabs (tom)) { + ti = j * j; + term = (3 * PI2 * ti * z2 - PI4 * ti * ti) * exp (-ti * w); + tom += term; + j++; + } + sum += tom * C2 / (RACN * n * 108.0 * z6); 
+ + return sum; +} + + +/*=========================================================================*/ + +static void CalcFloorCeil ( + int n, /* sample size */ + double t, /* = nx */ + double *A, /* A_i */ + double *Atflo, /* floor (A_i - t) */ + double *Atcei /* ceiling (A_i + t) */ + ) +{ + /* Precompute A_i, floors, and ceilings for limits of sums in the Pomeranz + algorithm */ + int i; + int ell = (int) t; /* floor (t) */ + double z = t - ell; /* t - floor (t) */ + double w = ceil (t) - t; + + if (z > 0.5) { + for (i = 2; i <= 2 * n + 2; i += 2) + Atflo[i] = i / 2 - 2 - ell; + for (i = 1; i <= 2 * n + 2; i += 2) + Atflo[i] = i / 2 - 1 - ell; + + for (i = 2; i <= 2 * n + 2; i += 2) + Atcei[i] = i / 2 + ell; + for (i = 1; i <= 2 * n + 2; i += 2) + Atcei[i] = i / 2 + 1 + ell; + + } else if (z > 0.0) { + for (i = 1; i <= 2 * n + 2; i++) + Atflo[i] = i / 2 - 1 - ell; + + for (i = 2; i <= 2 * n + 2; i++) + Atcei[i] = i / 2 + ell; + Atcei[1] = 1 + ell; + + } else { /* z == 0 */ + for (i = 2; i <= 2 * n + 2; i += 2) + Atflo[i] = i / 2 - 1 - ell; + for (i = 1; i <= 2 * n + 2; i += 2) + Atflo[i] = i / 2 - ell; + + for (i = 2; i <= 2 * n + 2; i += 2) + Atcei[i] = i / 2 - 1 + ell; + for (i = 1; i <= 2 * n + 2; i += 2) + Atcei[i] = i / 2 + ell; + } + + if (w < z) + z = w; + A[0] = A[1] = 0; + A[2] = z; + A[3] = 1 - A[2]; + for (i = 4; i <= 2 * n + 1; i++) + A[i] = A[i - 2] + 1; + A[2 * n + 2] = n; +} + + +/*========================================================================*/ + +static double Pomeranz (int n, double x) +{ + /* The Pomeranz algorithm to compute the KS distribution */ + const double EPS = 1.0e-15; + const int ENO = 350; + const double RENO = ldexp (1.0, ENO); /* for renormalization of V */ + int coreno; /* counter: how many renormalizations */ + const double t = n * x; + double w, sum, minsum; + int i, j, k, s; + int r1, r2; /* Indices i and i-1 for V[i][] */ + int jlow, jup, klow, kup, kup0; + double *A; + double *Atflo; + double *Atcei; + double **V; + 
double **H; /* = pow(w, j) / Factorial(j) */ + + A = (double *) calloc ((size_t) (2 * n + 3), sizeof (double)); + Atflo = (double *) calloc ((size_t) (2 * n + 3), sizeof (double)); + Atcei = (double *) calloc ((size_t) (2 * n + 3), sizeof (double)); + V = (double **) CreateMatrixD (2, n + 2); + H = (double **) CreateMatrixD (4, n + 2); + + CalcFloorCeil (n, t, A, Atflo, Atcei); + + for (j = 1; j <= n + 1; j++) + V[0][j] = 0; + for (j = 2; j <= n + 1; j++) + V[1][j] = 0; + V[1][1] = RENO; + coreno = 1; + + /* Precompute H[][] = (A[j] - A[j-1]^k / k! for speed */ + H[0][0] = 1; + w = 2.0 * A[2] / n; + for (j = 1; j <= n + 1; j++) + H[0][j] = w * H[0][j - 1] / j; + + H[1][0] = 1; + w = (1.0 - 2.0 * A[2]) / n; + for (j = 1; j <= n + 1; j++) + H[1][j] = w * H[1][j - 1] / j; + + H[2][0] = 1; + w = A[2] / n; + for (j = 1; j <= n + 1; j++) + H[2][j] = w * H[2][j - 1] / j; + + H[3][0] = 1; + for (j = 1; j <= n + 1; j++) + H[3][j] = 0; + + r1 = 0; + r2 = 1; + for (i = 2; i <= 2 * n + 2; i++) { + jlow = 2 + (int) Atflo[i]; + if (jlow < 1) + jlow = 1; + jup = (int) Atcei[i]; + if (jup > n + 1) + jup = n + 1; + + klow = 2 + (int) Atflo[i - 1]; + if (klow < 1) + klow = 1; + kup0 = (int) Atcei[i - 1]; + + /* Find to which case it corresponds */ + w = (A[i] - A[i - 1]) / n; + s = -1; + for (j = 0; j < 4; j++) { + if (fabs (w - H[j][1]) <= EPS) { + s = j; + break; + } + } + /* assert (s >= 0, "Pomeranz: s < 0"); */ + + minsum = RENO; + r1 = (r1 + 1) & 1; /* i - 1 */ + r2 = (r2 + 1) & 1; /* i */ + + for (j = jlow; j <= jup; j++) { + kup = kup0; + if (kup > j) + kup = j; + sum = 0; + for (k = kup; k >= klow; k--) + sum += V[r1][k] * H[s][j - k]; + V[r2][j] = sum; + if (sum < minsum) + minsum = sum; + } + + if (minsum < 1.0e-280) { + /* V is too small: renormalize to avoid underflow of probabilities */ + for (j = jlow; j <= jup; j++) + V[r2][j] *= RENO; + coreno++; /* keep track of log of RENO */ + } + } + + sum = V[r2][n + 1]; + free (A); + free (Atflo); + free (Atcei); + 
DeleteMatrixD (H); + DeleteMatrixD (V); + w = getLogFactorial (n) - coreno * ENO * num_Ln2 + log (sum); + if (w >= 0.) + return 1.; + return exp (w); +} + + +/*========================================================================*/ + +static double cdfSpecial (int n, double x) +{ + /* The KS distribution is known exactly for these cases */ + + /* For nx^2 > 18, KSfbar(n, x) is smaller than 5e-16 */ + if ((n * x * x >= 18.0) || (x >= 1.0)) + return 1.0; + + if (x <= 0.5 / n) + return 0.0; + + if (n == 1) + return 2.0 * x - 1.0; + + if (x <= 1.0 / n) { + double t = 2.0 * x * n - 1.0; + double w; + if (n <= NEXACT) { + w = rapfac (n); + return w * pow (t, (double) n); + } + w = getLogFactorial (n) + n * log (t / n); + return exp (w); + } + + if (x >= 1.0 - 1.0 / n) { + return 1.0 - 2.0 * pow (1.0 - x, (double) n); + } + + return -1.0; +} + + +/*========================================================================*/ + +double KScdf (int n, double x) +{ + const double w = n * x * x; + double u = cdfSpecial (n, x); + if (u >= 0.0) + return u; + + if (n <= NEXACT) { + if (w < 0.754693) + return DurbinMatrix (n, x); + if (w < 4.0) + return Pomeranz (n, x); + return 1.0 - KSfbar (n, x); + } + + if ((w * x * n <= 7.0) && (n <= NKOLMO)) + return DurbinMatrix (n, x); + + return Pelz (n, x); +} + + +/*=========================================================================*/ + +static double fbarSpecial (int n, double x) +{ + const double w = n * x * x; + + if ((w >= 370.0) || (x >= 1.0)) + return 0.0; + if ((w <= 0.0274) || (x <= 0.5 / n)) + return 1.0; + if (n == 1) + return 2.0 - 2.0 * x; + + if (x <= 1.0 / n) { + double z; + double t = 2.0 * x * n - 1.0; + if (n <= NEXACT) { + z = rapfac (n); + return 1.0 - z * pow (t, (double) n); + } + z = getLogFactorial (n) + n * log (t / n); + return 1.0 - exp (z); + } + + if (x >= 1.0 - 1.0 / n) { + return 2.0 * pow (1.0 - x, (double) n); + } + return -1.0; +} + + 
+/*========================================================================*/ + +double KSfbar (int n, double x) +{ + const double w = n * x * x; + double v = fbarSpecial (n, x); + if (v >= 0.0) + return v; + + if (n <= NEXACT) { + if (w < 4.0) + return 1.0 - KScdf (n, x); + else + return 2.0 * KSPlusbarUpper (n, x); + } + + if (w >= 2.65) + return 2.0 * KSPlusbarUpper (n, x); + + return 1.0 - KScdf (n, x); +} + + +/*========================================================================= + +The following implements the Durbin matrix algorithm and was programmed by +G. Marsaglia, Wai Wan Tsang and Jingbo Wong. + +I have made small modifications in their program. (Richard Simard) + + + +=========================================================================*/ + +/* + The C program to compute Kolmogorov's distribution + + K(n,d) = Prob(D_n < d), where + + D_n = max(x_1-0/n,x_2-1/n...,x_n-(n-1)/n,1/n-x_1,2/n-x_2,...,n/n-x_n) + + with x_1<x_2,...<x_n a purported set of n independent uniform [0,1) + random variables sorted into increasing order. + See G. Marsaglia, Wai Wan Tsang and Jingbo Wong, + J.Stat.Software, 8, 18, pp 1--4, (2003). 
+*/ + +#define NORM 1.0e140 +#define INORM 1.0e-140 +#define LOGNORM 140 + + +/* Matrix product */ +static void mMultiply (double *A, double *B, double *C, int m); + +/* Matrix power */ +static void mPower (double *A, int eA, double *V, int *eV, int m, int n); + + +static double DurbinMatrix (int n, double d) +{ + int k, m, i, j, g, eH, eQ; + double h, s, *H, *Q; + /* OMIT NEXT TWO LINES IF YOU REQUIRE >7 DIGIT ACCURACY IN THE RIGHT TAIL */ +#if 0 + s = d * d * n; + if (s > 7.24 || (s > 3.76 && n > 99)) + return 1 - 2 * exp (-(2.000071 + .331 / sqrt (n) + 1.409 / n) * s); +#endif + k = (int) (n * d) + 1; + m = 2 * k - 1; + h = k - n * d; + H = (double *) malloc ((m * m) * sizeof (double)); + Q = (double *) malloc ((m * m) * sizeof (double)); + for (i = 0; i < m; i++) + for (j = 0; j < m; j++) + if (i - j + 1 < 0) + H[i * m + j] = 0; + else + H[i * m + j] = 1; + for (i = 0; i < m; i++) { + H[i * m] -= pow (h, (double) (i + 1)); + H[(m - 1) * m + i] -= pow (h, (double) (m - i)); + } + H[(m - 1) * m] += (2 * h - 1 > 0 ? 
pow (2 * h - 1, (double) m) : 0); + for (i = 0; i < m; i++) + for (j = 0; j < m; j++) + if (i - j + 1 > 0) + for (g = 1; g <= i - j + 1; g++) + H[i * m + j] /= g; + eH = 0; + mPower (H, eH, Q, &eQ, m, n); + s = Q[(k - 1) * m + k - 1]; + + for (i = 1; i <= n; i++) { + s = s * (double) i / n; + if (s < INORM) { + s *= NORM; + eQ -= LOGNORM; + } + } + s *= pow (10., (double) eQ); + free (H); + free (Q); + return s; +} + + +static void mMultiply (double *A, double *B, double *C, int m) +{ + int i, j, k; + double s; + for (i = 0; i < m; i++) + for (j = 0; j < m; j++) { + s = 0.; + for (k = 0; k < m; k++) + s += A[i * m + k] * B[k * m + j]; + C[i * m + j] = s; + } +} + + +static void renormalize (double *V, int m, int *p) +{ + int i; + for (i = 0; i < m * m; i++) + V[i] *= INORM; + *p += LOGNORM; +} + + +static void mPower (double *A, int eA, double *V, int *eV, int m, int n) +{ + double *B; + int eB, i; + if (n == 1) { + for (i = 0; i < m * m; i++) + V[i] = A[i]; + *eV = eA; + return; + } + mPower (A, eA, V, eV, m, n / 2); + B = (double *) malloc ((m * m) * sizeof (double)); + mMultiply (V, V, B, m); + eB = 2 * (*eV); + if (B[(m / 2) * m + (m / 2)] > NORM) + renormalize (B, m, &eB); + + if (n % 2 == 0) { + for (i = 0; i < m * m; i++) + V[i] = B[i]; + *eV = eB; + } else { + mMultiply (A, B, V, m); + *eV = eA + eB; + } + + if (V[(m / 2) * m + (m / 2)] > NORM) + renormalize (V, m, eV); + free (B); +} diff --git a/database/KolmogorovSmirnovDist.h b/database/KolmogorovSmirnovDist.h new file mode 100644 index 000000000..cf455042a --- /dev/null +++ b/database/KolmogorovSmirnovDist.h @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-3.0 + +#ifndef KOLMOGOROVSMIRNOVDIST_H +#define KOLMOGOROVSMIRNOVDIST_H + +#ifdef __cplusplus +extern "C" { +#endif + + +/******************************************************************** + * + * File: KolmogorovSmirnovDist.h + * Environment: ISO C99 or ANSI C89 + * Author: Richard Simard + * Organization: DIRO, Université de Montréal + * Date: 1 
February 2012 + * Version 1.1 + * + * Copyright March 2010 by Université de Montréal, + Richard Simard and Pierre L'Ecuyer + ===================================================================== + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, version 3 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + + =====================================================================*/ +/* + * + * The Kolmogorov-Smirnov test statistic D_n is defined by + * + * D_n = sup_x |F(x) - S_n(x)| + * + * where n is the sample size, F(x) is a completely specified theoretical + * distribution, and S_n(x) is an empirical distribution function. + * + * + * The function + * + * double KScdf (int n, double x); + * + * computes the cumulative probability P[D_n <= x] of the 2-sided 1-sample + * Kolmogorov-Smirnov distribution with sample size n at x. + * It returns at least 13 decimal digits of precision for n <= 500, + * at least 7 decimal digits of precision for 500 < n <= 100000, + * and a few correct decimal digits for n > 100000. + * + */ + +double KScdf (int n, double x); + + +/* + * The function + * + * double KSfbar (int n, double x); + * + * computes the complementary cumulative probability P[D_n >= x] of the + * 2-sided 1-sample Kolmogorov-Smirnov distribution with sample size n at x. + * It returns at least 10 decimal digits of precision for n <= 500, + * at least 6 decimal digits of precision for 500 < n <= 200000, + * and a few correct decimal digits for n > 200000. 
+ * + */ + +double KSfbar (int n, double x); + + +/* + * NOTE: + * The ISO C99 function log1p of the standard math library does not exist in + * ANSI C89. Here, it is programmed explicitly in KolmogorovSmirnovDist.c. + + * For ANSI C89 compilers, change the preprocessor condition to make it + * available. + */ + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/database/engine/journalfile.c b/database/engine/journalfile.c index 1541eb10a..0b3d3eeb8 100644 --- a/database/engine/journalfile.c +++ b/database/engine/journalfile.c @@ -84,6 +84,7 @@ void * wal_get_transaction_buffer(struct rrdengine_worker_config* wc, unsigned s if (unlikely(ret)) { fatal("posix_memalign:%s", strerror(ret)); } + memset(ctx->commit_log.buf, 0, buf_size); buf_pos = ctx->commit_log.buf_pos = 0; ctx->commit_log.buf_size = buf_size; } diff --git a/database/engine/metadata_log/logfile.c b/database/engine/metadata_log/logfile.c index f5bd9b2d2..07eb9b6fe 100644 --- a/database/engine/metadata_log/logfile.c +++ b/database/engine/metadata_log/logfile.c @@ -375,19 +375,15 @@ static int scan_metalog_files(struct metalog_instance *ctx) struct metalog_pluginsd_state metalog_parser_state; metalog_pluginsd_state_init(&metalog_parser_state, ctx); - PARSER_USER_OBJECT metalog_parser_object; - metalog_parser_object.enabled = cd.enabled; - metalog_parser_object.host = ctx->rrdeng_ctx->host; - metalog_parser_object.cd = &cd; - metalog_parser_object.trust_durations = 0; - metalog_parser_object.private = &metalog_parser_state; + PARSER_USER_OBJECT metalog_parser_object = { + .enabled = cd.enabled, + .host = ctx->rrdeng_ctx->host, + .cd = &cd, + .trust_durations = 0, + .private = &metalog_parser_state + }; PARSER *parser = parser_init(metalog_parser_object.host, &metalog_parser_object, NULL, PARSER_INPUT_SPLIT); - if (unlikely(!parser)) { - error("Failed to initialize metadata log parser."); - failed_to_load = matched_files; - goto after_failed_to_parse; - } parser_add_keyword(parser, PLUGINSD_KEYWORD_HOST, 
metalog_pluginsd_host); parser_add_keyword(parser, PLUGINSD_KEYWORD_GUID, pluginsd_guid); parser_add_keyword(parser, PLUGINSD_KEYWORD_CONTEXT, pluginsd_context); @@ -428,10 +424,8 @@ static int scan_metalog_files(struct metalog_instance *ctx) size_t count __maybe_unused = metalog_parser_object.count; debug(D_METADATALOG, "Parsing count=%u", (unsigned)count); -after_failed_to_parse: freez(metalogfiles); - return matched_files; } diff --git a/database/engine/pagecache.c b/database/engine/pagecache.c index 40e24b321..cddbf9e1f 100644 --- a/database/engine/pagecache.c +++ b/database/engine/pagecache.c @@ -356,7 +356,7 @@ static void pg_cache_evict_unsafe(struct rrdengine_instance *ctx, struct rrdeng_ { struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; - freez(pg_cache_descr->page); + dbengine_page_free(pg_cache_descr->page); pg_cache_descr->page = NULL; pg_cache_descr->flags &= ~RRD_PAGE_POPULATED; pg_cache_release_pages_unsafe(ctx, 1); @@ -437,7 +437,6 @@ uint8_t pg_cache_punch_hole(struct rrdengine_instance *ctx, struct rrdeng_page_d ret = JudyLDel(&page_index->JudyL_array, (Word_t)(descr->start_time / USEC_PER_SEC), PJE0); if (unlikely(0 == ret)) { uv_rwlock_wrunlock(&page_index->lock); - error("Page under deletion was not in index."); if (unlikely(debug_flags & D_RRDENGINE)) { print_page_descr(descr); } @@ -1067,10 +1066,13 @@ pg_cache_lookup_next(struct rrdengine_instance *ctx, struct pg_cache_page_index page_not_in_cache = 0; uv_rwlock_rdlock(&page_index->lock); + int retry_count = 0; while (1) { descr = find_first_page_in_time_range(page_index, start_time, end_time); - if (NULL == descr || 0 == descr->page_length) { + if (NULL == descr || 0 == descr->page_length || retry_count == MAX_PAGE_CACHE_RETRY_WAIT) { /* non-empty page not found */ + if (retry_count == MAX_PAGE_CACHE_RETRY_WAIT) + error_report("Page cache timeout while waiting for page %p : returning FAIL", descr); uv_rwlock_rdunlock(&page_index->lock); pg_cache_release_pages(ctx, 1); @@ 
-1114,7 +1116,11 @@ pg_cache_lookup_next(struct rrdengine_instance *ctx, struct pg_cache_page_index print_page_cache_descr(descr); if (!(flags & RRD_PAGE_POPULATED)) page_not_in_cache = 1; - pg_cache_wait_event_unsafe(descr); + + if (pg_cache_timedwait_event_unsafe(descr, 1) == UV_ETIMEDOUT) { + error_report("Page cache timeout while waiting for page %p : retry count = %d", descr, retry_count); + ++retry_count; + } rrdeng_page_descr_mutex_unlock(ctx, descr); /* reset scan to find again */ @@ -1222,7 +1228,7 @@ void free_page_cache(struct rrdengine_instance *ctx) /* Check rrdenglocking.c */ pg_cache_descr = descr->pg_cache_descr; if (pg_cache_descr->flags & RRD_PAGE_POPULATED) { - freez(pg_cache_descr->page); + dbengine_page_free(pg_cache_descr->page); bytes_freed += RRDENG_BLOCK_SIZE; } rrdeng_destroy_pg_cache_descr(ctx, pg_cache_descr); diff --git a/database/engine/pagecache.h b/database/engine/pagecache.h index d5350ef56..0ba4639ce 100644 --- a/database/engine/pagecache.h +++ b/database/engine/pagecache.h @@ -11,6 +11,7 @@ struct extent_info; struct rrdeng_page_descr; #define INVALID_TIME (0) +#define MAX_PAGE_CACHE_RETRY_WAIT (3) /* Page flags */ #define RRD_PAGE_DIRTY (1LU << 0) diff --git a/database/engine/rrdengine.c b/database/engine/rrdengine.c index a975cfa6e..9f43f4456 100644 --- a/database/engine/rrdengine.c +++ b/database/engine/rrdengine.c @@ -11,8 +11,24 @@ rrdeng_stats_t global_flushing_pressure_page_deletions = 0; static unsigned pages_per_extent = MAX_PAGES_PER_EXTENT; +#if WORKER_UTILIZATION_MAX_JOB_TYPES < (RRDENG_MAX_OPCODE + 2) +#error Please increase WORKER_UTILIZATION_MAX_JOB_TYPES to at least (RRDENG_MAX_OPCODE + 2) +#endif + +void *dbengine_page_alloc() { + void *page = netdata_mmap(NULL, RRDENG_BLOCK_SIZE, MAP_PRIVATE, enable_ksm); + if(!page) fatal("Cannot allocate dbengine page cache page, with mmap()"); + return page; +} + +void dbengine_page_free(void *page) { + munmap(page, RRDENG_BLOCK_SIZE); +} + static void sanity_check(void) { + 
BUILD_BUG_ON(WORKER_UTILIZATION_MAX_JOB_TYPES < (RRDENG_MAX_OPCODE + 2)); + /* Magic numbers must fit in the super-blocks */ BUILD_BUG_ON(strlen(RRDENG_DF_MAGIC) > RRDENG_MAGIC_SZ); BUILD_BUG_ON(strlen(RRDENG_JF_MAGIC) > RRDENG_MAGIC_SZ); @@ -176,7 +192,7 @@ void read_cached_extent_cb(struct rrdengine_worker_config* wc, unsigned idx, str struct extent_info *extent = xt_io_descr->descr_array[0]->extent; for (i = 0 ; i < xt_io_descr->descr_count; ++i) { - page = mallocz(RRDENG_BLOCK_SIZE); + page = dbengine_page_alloc(); descr = xt_io_descr->descr_array[i]; for (j = 0, page_offset = 0 ; j < extent->number_of_pages ; ++j) { /* care, we don't hold the descriptor mutex */ @@ -331,7 +347,7 @@ after_crc_check: continue; /* Failed to reserve a suitable page */ is_prefetched_page = 1; } - page = mallocz(RRDENG_BLOCK_SIZE); + page = dbengine_page_alloc(); /* care, we don't hold the descriptor mutex */ if (have_read_error) { @@ -735,6 +751,7 @@ static int do_flush_pages(struct rrdengine_worker_config* wc, int force, struct fatal("posix_memalign:%s", strerror(ret)); /* freez(xt_io_descr);*/ } + memset(xt_io_descr->buf, 0, ALIGN_BYTES_CEILING(size_bytes)); (void) memcpy(xt_io_descr->descr_array, eligible_pages, sizeof(struct rrdeng_page_descr *) * count); xt_io_descr->descr_count = count; @@ -1074,13 +1091,17 @@ void async_cb(uv_async_t *handle) void timer_cb(uv_timer_t* handle) { + worker_is_busy(RRDENG_MAX_OPCODE + 1); + struct rrdengine_worker_config* wc = handle->data; struct rrdengine_instance *ctx = wc->ctx; uv_stop(handle->loop); uv_update_time(handle->loop); - if (unlikely(!ctx->metalog_ctx->initialized)) + if (unlikely(!ctx->metalog_ctx->initialized)) { + worker_is_idle(); return; /* Wait for the metadata log to initialize */ + } rrdeng_test_quota(wc); debug(D_RRDENGINE, "%s: timeout reached.", __func__); if (likely(!wc->now_deleting_files && !wc->now_invalidating_dirty_pages)) { @@ -1122,12 +1143,26 @@ void timer_cb(uv_timer_t* handle) debug(D_RRDENGINE, "%s", 
get_rrdeng_statistics(wc->ctx, buf, sizeof(buf))); } #endif + + worker_is_idle(); } #define MAX_CMD_BATCH_SIZE (256) void rrdeng_worker(void* arg) { + worker_register("DBENGINE"); + worker_register_job_name(RRDENG_NOOP, "noop"); + worker_register_job_name(RRDENG_READ_PAGE, "page read"); + worker_register_job_name(RRDENG_READ_EXTENT, "extent read"); + worker_register_job_name(RRDENG_COMMIT_PAGE, "commit"); + worker_register_job_name(RRDENG_FLUSH_PAGES, "flush"); + worker_register_job_name(RRDENG_SHUTDOWN, "shutdown"); + worker_register_job_name(RRDENG_INVALIDATE_OLDEST_MEMORY_PAGE, "page lru"); + worker_register_job_name(RRDENG_QUIESCE, "quiesce"); + worker_register_job_name(RRDENG_MAX_OPCODE, "cleanup"); + worker_register_job_name(RRDENG_MAX_OPCODE + 1, "timer"); + struct rrdengine_worker_config* wc = arg; struct rrdengine_instance *ctx = wc->ctx; uv_loop_t* loop; @@ -1175,8 +1210,11 @@ void rrdeng_worker(void* arg) fatal_assert(0 == uv_timer_start(&timer_req, timer_cb, TIMER_PERIOD_MS, TIMER_PERIOD_MS)); shutdown = 0; + int set_name = 0; while (likely(shutdown == 0 || rrdeng_threads_alive(wc))) { + worker_is_idle(); uv_run(loop, UV_RUN_DEFAULT); + worker_is_busy(RRDENG_MAX_OPCODE); rrdeng_cleanup_finished_threads(wc); /* wait for commands */ @@ -1193,6 +1231,9 @@ void rrdeng_worker(void* arg) opcode = cmd.opcode; ++cmd_batch_size; + if(likely(opcode != RRDENG_NOOP)) + worker_is_busy(opcode); + switch (opcode) { case RRDENG_NOOP: /* the command queue was empty, do nothing */ @@ -1219,6 +1260,10 @@ void rrdeng_worker(void* arg) break; case RRDENG_READ_EXTENT: do_read_extent(wc, cmd.read_extent.page_cache_descr, cmd.read_extent.page_count, 1); + if (unlikely(!set_name)) { + set_name = 1; + uv_thread_set_name_np(ctx->worker_config.thread, "DBENGINE"); + } break; case RRDENG_COMMIT_PAGE: do_commit_transaction(wc, STORE_DATA, NULL); @@ -1265,6 +1310,7 @@ void rrdeng_worker(void* arg) fatal_assert(0 == uv_loop_close(loop)); freez(loop); + worker_unregister(); return; 
error_after_timer_init: @@ -1277,6 +1323,7 @@ error_after_loop_init: wc->error = UV_EAGAIN; /* wake up initialization thread */ completion_mark_complete(&ctx->rrdengine_completion); + worker_unregister(); } /* C entry point for development purposes diff --git a/database/engine/rrdengine.h b/database/engine/rrdengine.h index b0c8e4d02..c6f89a37a 100644 --- a/database/engine/rrdengine.h +++ b/database/engine/rrdengine.h @@ -34,6 +34,28 @@ struct rrdengine_instance; #define RRDENG_FILE_NUMBER_SCAN_TMPL "%1u-%10u" #define RRDENG_FILE_NUMBER_PRINT_TMPL "%1.1u-%10.10u" +struct rrdeng_collect_handle { + struct rrdeng_page_descr *descr, *prev_descr; + unsigned long page_correlation_id; + struct rrdengine_instance *ctx; + // set to 1 when this dimension is not page aligned with the other dimensions in the chart + uint8_t unaligned_page; +}; + +struct rrdeng_query_handle { + struct rrdeng_page_descr *descr; + struct rrdengine_instance *ctx; + struct pg_cache_page_index *page_index; + time_t next_page_time; + time_t now; + unsigned position; + unsigned entries; + storage_number *page; + usec_t page_end_time; + uint32_t page_length; + usec_t dt; + time_t dt_sec; +}; typedef enum { RRDENGINE_STATUS_UNINITIALIZED = 0, @@ -227,6 +249,9 @@ struct rrdengine_instance { struct rrdengine_statistics stats; }; +extern void *dbengine_page_alloc(void); +extern void dbengine_page_free(void *page); + extern int init_rrd_files(struct rrdengine_instance *ctx); extern void finalize_rrd_files(struct rrdengine_instance *ctx); extern void rrdeng_test_quota(struct rrdengine_worker_config* wc); diff --git a/database/engine/rrdengineapi.c b/database/engine/rrdengineapi.c index 6ebee1459..76010a7c2 100755 --- a/database/engine/rrdengineapi.c +++ b/database/engine/rrdengineapi.c @@ -126,12 +126,13 @@ void rrdeng_store_metric_init(RRDDIM *rd) struct pg_cache_page_index *page_index; ctx = get_rrdeng_ctx_from_host(rd->rrdset->rrdhost); - handle = &rd->state->handle.rrdeng; - handle->ctx = ctx; + handle = 
callocz(1, sizeof(struct rrdeng_collect_handle)); + handle->ctx = ctx; handle->descr = NULL; handle->prev_descr = NULL; handle->unaligned_page = 0; + rd->state->handle = (STORAGE_COLLECT_HANDLE *)handle; page_index = rd->state->page_index; uv_rwlock_wrlock(&page_index->lock); @@ -162,7 +163,7 @@ void rrdeng_store_metric_flush_current_page(RRDDIM *rd) struct rrdengine_instance *ctx; struct rrdeng_page_descr *descr; - handle = &rd->state->handle.rrdeng; + handle = (struct rrdeng_collect_handle *)rd->state->handle; ctx = handle->ctx; if (unlikely(!ctx)) return; @@ -202,7 +203,7 @@ void rrdeng_store_metric_flush_current_page(RRDDIM *rd) /* handle->prev_descr = descr;*/ } } else { - freez(descr->pg_cache_descr->page); + dbengine_page_free(descr->pg_cache_descr->page); rrdeng_destroy_pg_cache_descr(ctx, descr->pg_cache_descr); freez(descr); } @@ -211,14 +212,13 @@ void rrdeng_store_metric_flush_current_page(RRDDIM *rd) void rrdeng_store_metric_next(RRDDIM *rd, usec_t point_in_time, storage_number number) { - struct rrdeng_collect_handle *handle; + struct rrdeng_collect_handle *handle = (struct rrdeng_collect_handle *)rd->state->handle; struct rrdengine_instance *ctx; struct page_cache *pg_cache; struct rrdeng_page_descr *descr; storage_number *page; uint8_t must_flush_unaligned_page = 0, perfect_page_alignment = 0; - handle = &rd->state->handle.rrdeng; ctx = handle->ctx; pg_cache = &ctx->pg_cache; descr = handle->descr; @@ -301,7 +301,7 @@ int rrdeng_store_metric_finalize(RRDDIM *rd) struct pg_cache_page_index *page_index; uint8_t can_delete_metric = 0; - handle = &rd->state->handle.rrdeng; + handle = (struct rrdeng_collect_handle *)rd->state->handle; ctx = handle->ctx; page_index = rd->state->page_index; rrdeng_store_metric_flush_current_page(rd); @@ -314,6 +314,7 @@ int rrdeng_store_metric_finalize(RRDDIM *rd) can_delete_metric = 1; } uv_rwlock_wrunlock(&page_index->lock); + freez(handle); return can_delete_metric; } @@ -406,6 +407,7 @@ unsigned 
rrdeng_variable_step_boundaries(RRDSET *st, time_t start_time, time_t e is_first_region_initialized = 0; region_points = 0; + int is_out_of_order_reported = 0; /* pages loop */ for (i = 0, curr = NULL, prev = NULL ; i < pages_nr ; ++i) { old_prev = prev; @@ -446,7 +448,7 @@ unsigned rrdeng_variable_step_boundaries(RRDSET *st, time_t start_time, time_t e is_metric_out_of_order = 1; if (is_metric_earlier_than_range || unlikely(is_metric_out_of_order)) { if (unlikely(is_metric_out_of_order)) - info("Ignoring metric with out of order timestamp."); + is_out_of_order_reported++; continue; /* next entry */ } /* here is a valid metric */ @@ -519,6 +521,8 @@ unsigned rrdeng_variable_step_boundaries(RRDSET *st, time_t start_time, time_t e freez(region_info_array); } } + if (is_out_of_order_reported) + info("Ignored %d metrics with out of order timestamp in %u regions.", is_out_of_order_reported, regions); return regions; } @@ -535,12 +539,14 @@ void rrdeng_load_metric_init(RRDDIM *rd, struct rrddim_query_handle *rrdimm_hand ctx = get_rrdeng_ctx_from_host(rd->rrdset->rrdhost); rrdimm_handle->start_time = start_time; rrdimm_handle->end_time = end_time; - handle = &rrdimm_handle->rrdeng; + + handle = callocz(1, sizeof(struct rrdeng_query_handle)); handle->next_page_time = start_time; handle->now = start_time; handle->position = 0; handle->ctx = ctx; handle->descr = NULL; + rrdimm_handle->handle = (STORAGE_QUERY_HANDLE *)handle; pages_nr = pg_cache_preload(ctx, rd->state->rrdeng_uuid, start_time * USEC_PER_SEC, end_time * USEC_PER_SEC, NULL, &handle->page_index); if (unlikely(NULL == handle->page_index || 0 == pages_nr)) @@ -548,102 +554,109 @@ void rrdeng_load_metric_init(RRDDIM *rd, struct rrddim_query_handle *rrdimm_hand handle->next_page_time = INVALID_TIME; } -/* Returns the metric and sets its timestamp into current_time */ -storage_number rrdeng_load_metric_next(struct rrddim_query_handle *rrdimm_handle, time_t *current_time) -{ - struct rrdeng_query_handle *handle; - 
struct rrdengine_instance *ctx; - struct rrdeng_page_descr *descr; - storage_number *page, ret; - unsigned position, entries; - usec_t next_page_time = 0, current_position_time, page_end_time = 0; +static int rrdeng_load_page_next(struct rrddim_query_handle *rrdimm_handle) { + struct rrdeng_query_handle *handle = (struct rrdeng_query_handle *)rrdimm_handle->handle; + + struct rrdengine_instance *ctx = handle->ctx; + struct rrdeng_page_descr *descr = handle->descr; + uint32_t page_length; + usec_t page_end_time; + unsigned position; - handle = &rrdimm_handle->rrdeng; - if (unlikely(INVALID_TIME == handle->next_page_time)) { - return SN_EMPTY_SLOT; - } - ctx = handle->ctx; - if (unlikely(NULL == (descr = handle->descr))) { - /* it's the first call */ - next_page_time = handle->next_page_time * USEC_PER_SEC; - } else { - pg_cache_atomic_get_pg_info(descr, &page_end_time, &page_length); - } - position = handle->position + 1; + if (likely(descr)) { + // Drop old page's reference - if (unlikely(NULL == descr || - position >= (page_length / sizeof(storage_number)))) { - /* We need to get a new page */ - if (descr) { - /* Drop old page's reference */ #ifdef NETDATA_INTERNAL_CHECKS - rrd_stat_atomic_add(&ctx->stats.metric_API_consumers, -1); + rrd_stat_atomic_add(&ctx->stats.metric_API_consumers, -1); #endif - pg_cache_put(ctx, descr); - handle->descr = NULL; - handle->next_page_time = (page_end_time / USEC_PER_SEC) + 1; - if (unlikely(handle->next_page_time > rrdimm_handle->end_time)) { - goto no_more_metrics; - } - next_page_time = handle->next_page_time * USEC_PER_SEC; - } - descr = pg_cache_lookup_next(ctx, handle->page_index, &handle->page_index->id, - next_page_time, rrdimm_handle->end_time * USEC_PER_SEC); - if (NULL == descr) { - goto no_more_metrics; - } + pg_cache_put(ctx, descr); + handle->descr = NULL; + handle->next_page_time = (handle->page_end_time / USEC_PER_SEC) + 1; + + if (unlikely(handle->next_page_time > rrdimm_handle->end_time)) + return 1; + } + + 
usec_t next_page_time = handle->next_page_time * USEC_PER_SEC; + descr = pg_cache_lookup_next(ctx, handle->page_index, &handle->page_index->id, next_page_time, rrdimm_handle->end_time * USEC_PER_SEC); + if (NULL == descr) + return 1; + #ifdef NETDATA_INTERNAL_CHECKS - rrd_stat_atomic_add(&ctx->stats.metric_API_consumers, 1); + rrd_stat_atomic_add(&ctx->stats.metric_API_consumers, 1); #endif - handle->descr = descr; - pg_cache_atomic_get_pg_info(descr, &page_end_time, &page_length); - if (unlikely(INVALID_TIME == descr->start_time || - INVALID_TIME == page_end_time)) { - goto no_more_metrics; - } - if (unlikely(descr->start_time != page_end_time && next_page_time > descr->start_time)) { - /* we're in the middle of the page somewhere */ - entries = page_length / sizeof(storage_number); - position = ((uint64_t)(next_page_time - descr->start_time)) * (entries - 1) / - (page_end_time - descr->start_time); - } else { - position = 0; - } + + handle->descr = descr; + pg_cache_atomic_get_pg_info(descr, &page_end_time, &page_length); + if (unlikely(INVALID_TIME == descr->start_time || INVALID_TIME == page_end_time)) + return 1; + + if (unlikely(descr->start_time != page_end_time && next_page_time > descr->start_time)) { + // we're in the middle of the page somewhere + unsigned entries = page_length / sizeof(storage_number); + position = ((uint64_t)(next_page_time - descr->start_time)) * (entries - 1) / + (page_end_time - descr->start_time); } - page = descr->pg_cache_descr->page; - ret = page[position]; - entries = page_length / sizeof(storage_number); - if (entries > 1) { - usec_t dt; + else + position = 0; + + handle->page_end_time = page_end_time; + handle->page_length = page_length; + handle->page = descr->pg_cache_descr->page; + usec_t entries = handle->entries = page_length / sizeof(storage_number); + if (likely(entries > 1)) + handle->dt = (page_end_time - descr->start_time) / (entries - 1); + else + handle->dt = 0; - dt = (page_end_time - descr->start_time) / 
(entries - 1); - current_position_time = descr->start_time + position * dt; - } else { - current_position_time = descr->start_time; + handle->dt_sec = handle->dt / USEC_PER_SEC; + handle->position = position; + + return 0; +} + +/* Returns the metric and sets its timestamp into current_time */ +storage_number rrdeng_load_metric_next(struct rrddim_query_handle *rrdimm_handle, time_t *current_time) { + struct rrdeng_query_handle *handle = (struct rrdeng_query_handle *)rrdimm_handle->handle; + + if (unlikely(INVALID_TIME == handle->next_page_time)) + return SN_EMPTY_SLOT; + + struct rrdeng_page_descr *descr = handle->descr; + unsigned position = handle->position + 1; + time_t now = handle->now + handle->dt_sec; + + if (unlikely(!descr || position >= handle->entries)) { + // We need to get a new page + if(rrdeng_load_page_next(rrdimm_handle)) { + // next calls will not load any more metrics + handle->next_page_time = INVALID_TIME; + return SN_EMPTY_SLOT; + } + + descr = handle->descr; + position = handle->position; + now = (descr->start_time + position * handle->dt) / USEC_PER_SEC; } + + storage_number ret = handle->page[position]; handle->position = position; - handle->now = current_position_time / USEC_PER_SEC; -/* fatal_assert(handle->now >= rrdimm_handle->start_time && handle->now <= rrdimm_handle->end_time); - The above assertion is an approximation and needs to take update_every into account */ - if (unlikely(handle->now >= rrdimm_handle->end_time)) { - /* next calls will not load any more metrics */ + handle->now = now; + + if (unlikely(now >= rrdimm_handle->end_time)) { + // next calls will not load any more metrics handle->next_page_time = INVALID_TIME; } - *current_time = handle->now; - return ret; -no_more_metrics: - handle->next_page_time = INVALID_TIME; - return SN_EMPTY_SLOT; + *current_time = now; + return ret; } int rrdeng_load_metric_is_finished(struct rrddim_query_handle *rrdimm_handle) { - struct rrdeng_query_handle *handle; - - handle = 
&rrdimm_handle->rrdeng; + struct rrdeng_query_handle *handle = (struct rrdeng_query_handle *)rrdimm_handle->handle; return (INVALID_TIME == handle->next_page_time); } @@ -652,19 +665,20 @@ int rrdeng_load_metric_is_finished(struct rrddim_query_handle *rrdimm_handle) */ void rrdeng_load_metric_finalize(struct rrddim_query_handle *rrdimm_handle) { - struct rrdeng_query_handle *handle; - struct rrdengine_instance *ctx; - struct rrdeng_page_descr *descr; + struct rrdeng_query_handle *handle = (struct rrdeng_query_handle *)rrdimm_handle->handle; + struct rrdengine_instance *ctx = handle->ctx; + struct rrdeng_page_descr *descr = handle->descr; - handle = &rrdimm_handle->rrdeng; - ctx = handle->ctx; - descr = handle->descr; if (descr) { #ifdef NETDATA_INTERNAL_CHECKS rrd_stat_atomic_add(&ctx->stats.metric_API_consumers, -1); #endif pg_cache_put(ctx, descr); } + + // whatever is allocated at rrdeng_load_metric_init() should be freed here + freez(handle); + rrdimm_handle->handle = NULL; } time_t rrdeng_metric_latest_time(RRDDIM *rd) @@ -724,7 +738,7 @@ void *rrdeng_create_page(struct rrdengine_instance *ctx, uuid_t *id, struct rrde descr = pg_cache_create_descr(); descr->id = id; /* TODO: add page type: metric, log, something? 
*/ - page = mallocz(RRDENG_BLOCK_SIZE); /*TODO: add page size */ + page = dbengine_page_alloc(); /*TODO: add page size */ rrdeng_page_descr_mutex_lock(ctx, descr); pg_cache_descr = descr->pg_cache_descr; pg_cache_descr->page = page; @@ -949,7 +963,7 @@ int rrdeng_init(RRDHOST *host, struct rrdengine_instance **ctxp, char *dbfiles_p /* wait for worker thread to initialize */ completion_wait_for(&ctx->rrdengine_completion); completion_destroy(&ctx->rrdengine_completion); - uv_thread_set_name_np(ctx->worker_config.thread, "DBENGINE"); + uv_thread_set_name_np(ctx->worker_config.thread, "LIBUV_WORKER"); if (ctx->worker_config.error) { goto error_after_rrdeng_worker; } diff --git a/database/metric_correlations.c b/database/metric_correlations.c new file mode 100644 index 000000000..3b8968c99 --- /dev/null +++ b/database/metric_correlations.c @@ -0,0 +1,300 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "daemon/common.h" +#include "KolmogorovSmirnovDist.h" + +#define MAX_POINTS 10000 +int enable_metric_correlations = CONFIG_BOOLEAN_NO; +int metric_correlations_version = 1; + +struct charts { + RRDSET *st; + struct charts *next; +}; + +struct per_dim { + char *dimension; + calculated_number baseline[MAX_POINTS]; + calculated_number highlight[MAX_POINTS]; + + double baseline_diffs[MAX_POINTS]; + double highlight_diffs[MAX_POINTS]; +}; + +int find_index(double arr[], long int n, double K, long int start) +{ + for (long int i = start; i < n; i++) { + if (K<arr[i]){ + return i; + } + } + return n; +} + +int compare(const void *left, const void *right) { + double lt = *(double *)left; + double rt = *(double *)right; + + if(unlikely(lt < rt)) return -1; + if(unlikely(lt > rt)) return 1; + return 0; +} + +void kstwo(double data1[], long int n1, double data2[], long int n2, double *d, double *prob) +{ + double en1, en2, en, data_all[MAX_POINTS*2], cdf1[MAX_POINTS], cdf2[MAX_POINTS], cddiffs[MAX_POINTS]; + double min = 0.0, max = 0.0; + qsort(data1, n1, 
sizeof(double), compare); + qsort(data2, n2, sizeof(double), compare); + + for (int i = 0; i < n1; i++) + data_all[i] = data1[i]; + for (int i = 0; i < n2; i++) + data_all[n1 + i] = data2[i]; + + en1 = (double)n1; + en2 = (double)n2; + *d = 0.0; + cddiffs[0]=0; //for uninitialized warning + + for (int i=0; i<n1+n2;i++) + cdf1[i] = find_index(data1, n1, data_all[i], 0) / en1; //TODO, use the start to reduce loops + + for (int i=0; i<n1+n2;i++) + cdf2[i] = find_index(data2, n2, data_all[i], 0) / en2; + + for ( int i=0;i<n2+n1;i++) + cddiffs[i] = cdf1[i] - cdf2[i]; + + min = cddiffs[0]; + for ( int i=0;i<n2+n1;i++) { + if (cddiffs[i] < min) + min = cddiffs[i]; + } + + //clip min + if (fabs(min) < 0) min = 0; + else if (fabs(min) > 1) min = 1; + + max = fabs(cddiffs[0]); + for ( int i=0;i<n2+n1;i++) + if (cddiffs[i] >= max) max = cddiffs[i]; + + if (fabs(min) < max) + *d = max; + else + *d = fabs(min); + + + + en = (en1*en2 / (en1 + en2)); + *prob = KSfbar(round(en), *d); +} + +void fill_nan (struct per_dim *d, long int hp, long int bp) +{ + int k; + + for (k = 0; k < bp; k++) { + if (isnan(d->baseline[k])) { + d->baseline[k] = 0.0; + } + } + + for (k = 0; k < hp; k++) { + if (isnan(d->highlight[k])) { + d->highlight[k] = 0.0; + } + } +} + +//TODO check counters +void run_diffs_and_rev (struct per_dim *d, long int hp, long int bp) +{ + int k, j; + + for (k = 0, j = bp; k < bp - 1; k++, j--) + d->baseline_diffs[k] = (double)d->baseline[j - 2] - (double)d->baseline[j - 1]; + for (k = 0, j = hp; k < hp - 1; k++, j--) { + d->highlight_diffs[k] = (double)d->highlight[j - 2] - (double)d->highlight[j - 1]; + } +} + +int run_metric_correlations (BUFFER *wb, RRDSET *st, long long baseline_after, long long baseline_before, long long highlight_after, long long highlight_before, long long max_points) +{ + uint32_t options = 0x00000000; + int group_method = RRDR_GROUPING_AVERAGE; + long group_time = 0; + struct context_param *context_param_list = NULL; + long c; + int i=0, j=0; + 
int b_dims = 0; + long int baseline_points = 0, highlight_points = 0; + + struct per_dim *pd = NULL; + + //TODO get everything in one go, when baseline is right before highlight + //get baseline + ONEWAYALLOC *owa = onewayalloc_create(0); + RRDR *rb = rrd2rrdr(owa, st, max_points, baseline_after, baseline_before, group_method, group_time, options, NULL, context_param_list, 0); + if(!rb) { + info("Cannot generate metric correlations output with these parameters on this chart."); + onewayalloc_destroy(owa); + return 0; + } else { + baseline_points = rrdr_rows(rb); + pd = mallocz(sizeof(struct per_dim) * rb->d); + b_dims = rb->d; + for (c = 0; c != rrdr_rows(rb) ; ++c) { + RRDDIM *d; + for (j = 0, d = rb->st->dimensions ; d && j < rb->d ; ++j, d = d->next) { + calculated_number *cn = &rb->v[ c * rb->d ]; + if (!c) { + //TODO use points from query + pd[j].dimension = strdupz (d->name); + pd[j].baseline[c] = cn[j]; + } else { + pd[j].baseline[c] = cn[j]; + } + } + } + } + rrdr_free(owa, rb); + onewayalloc_destroy(owa); + if (!pd) + return 0; + + //get highlight + owa = onewayalloc_create(0); + RRDR *rh = rrd2rrdr(owa, st, max_points, highlight_after, highlight_before, group_method, group_time, options, NULL, context_param_list, 0); + if(!rh) { + info("Cannot generate metric correlations output with these parameters on this chart."); + freez(pd); + onewayalloc_destroy(owa); + return 0; + } else { + if (rh->d != b_dims) { + //TODO handle different dims + rrdr_free(owa, rh); + onewayalloc_destroy(owa); + freez(pd); + return 0; + } + highlight_points = rrdr_rows(rh); + for (c = 0; c != rrdr_rows(rh) ; ++c) { + RRDDIM *d; + for (j = 0, d = rh->st->dimensions ; d && j < rh->d ; ++j, d = d->next) { + calculated_number *cn = &rh->v[ c * rh->d ]; + pd[j].highlight[c] = cn[j]; + } + } + } + rrdr_free(owa, rh); + onewayalloc_destroy(owa); + + for (i = 0; i < b_dims; i++) { + fill_nan(&pd[i], highlight_points, baseline_points); + } + + for (i = 0; i < b_dims; i++) { + 
run_diffs_and_rev(&pd[i], highlight_points, baseline_points); + } + + double d=0, prob=0; + for (i=0;i < j ;i++) { + if (baseline_points && highlight_points) { + kstwo(pd[i].baseline_diffs, baseline_points-1, pd[i].highlight_diffs, highlight_points-1, &d, &prob); + buffer_sprintf(wb, "\t\t\t\t\"%s\": %f", pd[i].dimension, prob); + if (i != j-1) + buffer_sprintf(wb, ",\n"); + else + buffer_sprintf(wb, "\n"); + } + } + + freez(pd); + return j; +} + +void metric_correlations (RRDHOST *host, BUFFER *wb, long long baseline_after, long long baseline_before, long long highlight_after, long long highlight_before, long long max_points) +{ + info ("Running metric correlations, highlight_after: %lld, highlight_before: %lld, baseline_after: %lld, baseline_before: %lld, max_points: %lld", highlight_after, highlight_before, baseline_after, baseline_before, max_points); + + if (!enable_metric_correlations) { + error("Metric correlations functionality is not enabled."); + buffer_strcat(wb, "{\"error\": \"Metric correlations functionality is not enabled.\" }"); + return; + } + + if (highlight_before <= highlight_after || baseline_before <= baseline_after) { + error("Invalid baseline or highlight ranges."); + buffer_strcat(wb, "{\"error\": \"Invalid baseline or highlight ranges.\" }"); + return; + } + + long long dims = 0, total_dims = 0; + RRDSET *st; + size_t c = 0; + BUFFER *wdims = buffer_create(1000); + + if (!max_points || max_points > MAX_POINTS) + max_points = MAX_POINTS; + + //dont lock here and wait for results + //get the charts and run mc after + //should not be a problem for the query + struct charts *charts = NULL; + rrdhost_rdlock(host); + rrdset_foreach_read(st, host) { + if (rrdset_is_available_for_viewers(st)) { + rrdset_rdlock(st); + struct charts *chart = callocz(1, sizeof(struct charts)); + chart->st = st; + chart->next = NULL; + if (charts) { + chart->next = charts; + } + charts = chart; + } + } + rrdhost_unlock(host); + + buffer_strcat(wb, 
"{\n\t\"correlated_charts\": {"); + + for (struct charts *ch = charts; ch; ch = ch->next) { + buffer_flush(wdims); + dims = run_metric_correlations(wdims, ch->st, baseline_after, baseline_before, highlight_after, highlight_before, max_points); + if (dims) { + if (c) + buffer_strcat(wb, "\t\t},"); + buffer_strcat(wb, "\n\t\t\""); + buffer_strcat(wb, ch->st->id); + buffer_strcat(wb, "\": {\n"); + buffer_strcat(wb, "\t\t\t\"context\": \""); + buffer_strcat(wb, ch->st->context); + buffer_strcat(wb, "\",\n\t\t\t\"dimensions\": {\n"); + buffer_sprintf(wb, "%s", buffer_tostring(wdims)); + buffer_strcat(wb, "\t\t\t}\n"); + total_dims += dims; + c++; + } + } + buffer_strcat(wb, "\t\t}\n"); + buffer_sprintf(wb, "\t},\n\t\"total_dimensions_count\": %lld\n}", total_dims); + + if (!total_dims) { + buffer_flush(wb); + buffer_strcat(wb, "{\"error\": \"No results from metric correlations.\" }"); + } + + struct charts* ch; + while(charts){ + ch = charts; + charts = charts->next; + rrdset_unlock(ch->st); + free(ch); + } + + buffer_free(wdims); + info ("Done running metric correlations"); +} diff --git a/database/metric_correlations.h b/database/metric_correlations.h new file mode 100644 index 000000000..83ea9b74d --- /dev/null +++ b/database/metric_correlations.h @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_METRIC_CORRELATIONS_H +#define NETDATA_METRIC_CORRELATIONS_H 1 + +extern int enable_metric_correlations; +extern int metric_correlations_version; + +void metric_correlations (RRDHOST *host, BUFFER *wb, long long selected_after, long long selected_before, long long reference_after, long long reference_before, long long max_points); + +#endif //NETDATA_METRIC_CORRELATIONS_H diff --git a/database/ram/rrddim_mem.c b/database/ram/rrddim_mem.c new file mode 100644 index 000000000..b17f03ca5 --- /dev/null +++ b/database/ram/rrddim_mem.c @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "rrddim_mem.h" + +// 
---------------------------------------------------------------------------- +// RRDDIM legacy data collection functions + +void rrddim_collect_init(RRDDIM *rd) { + rd->values[rd->rrdset->current_entry] = SN_EMPTY_SLOT; + rd->state->handle = calloc(1, sizeof(struct mem_collect_handle)); +} +void rrddim_collect_store_metric(RRDDIM *rd, usec_t point_in_time, storage_number number) { + (void)point_in_time; + rd->values[rd->rrdset->current_entry] = number; +} +int rrddim_collect_finalize(RRDDIM *rd) { + free((struct mem_collect_handle*)rd->state->handle); + return 0; +} + +// ---------------------------------------------------------------------------- +// RRDDIM legacy database query functions + +void rrddim_query_init(RRDDIM *rd, struct rrddim_query_handle *handle, time_t start_time, time_t end_time) { + handle->rd = rd; + handle->start_time = start_time; + handle->end_time = end_time; + struct mem_query_handle* h = calloc(1, sizeof(struct mem_query_handle)); + h->slot = rrdset_time2slot(rd->rrdset, start_time); + h->last_slot = rrdset_time2slot(rd->rrdset, end_time); + h->finished = 0; + handle->handle = (STORAGE_QUERY_HANDLE *)h; +} + +storage_number rrddim_query_next_metric(struct rrddim_query_handle *handle, time_t *current_time) { + RRDDIM *rd = handle->rd; + struct mem_query_handle* h = (struct mem_query_handle*)handle->handle; + long entries = rd->rrdset->entries; + long slot = h->slot; + + (void)current_time; + if (unlikely(h->slot == h->last_slot)) + h->finished = 1; + storage_number n = rd->values[slot++]; + + if(unlikely(slot >= entries)) slot = 0; + h->slot = slot; + + return n; +} + +int rrddim_query_is_finished(struct rrddim_query_handle *handle) { + struct mem_query_handle* h = (struct mem_query_handle*)handle->handle; + return h->finished; +} + +void rrddim_query_finalize(struct rrddim_query_handle *handle) { + freez(handle->handle); +} + +time_t rrddim_query_latest_time(RRDDIM *rd) { + return rrdset_last_entry_t_nolock(rd->rrdset); +} + +time_t 
rrddim_query_oldest_time(RRDDIM *rd) { + return rrdset_first_entry_t_nolock(rd->rrdset); +} diff --git a/database/ram/rrddim_mem.h b/database/ram/rrddim_mem.h new file mode 100644 index 000000000..9a215387a --- /dev/null +++ b/database/ram/rrddim_mem.h @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_RRDDIMMEM_H +#define NETDATA_RRDDIMMEM_H + +#include "database/rrd.h" + +struct mem_collect_handle { + long slot; + long entries; +}; +struct mem_query_handle { + long slot; + long last_slot; + uint8_t finished; +}; + +extern void rrddim_collect_init(RRDDIM *rd); +extern void rrddim_collect_store_metric(RRDDIM *rd, usec_t point_in_time, storage_number number); +extern int rrddim_collect_finalize(RRDDIM *rd); + +extern void rrddim_query_init(RRDDIM *rd, struct rrddim_query_handle *handle, time_t start_time, time_t end_time); +extern storage_number rrddim_query_next_metric(struct rrddim_query_handle *handle, time_t *current_time); +extern int rrddim_query_is_finished(struct rrddim_query_handle *handle); +extern void rrddim_query_finalize(struct rrddim_query_handle *handle); +extern time_t rrddim_query_latest_time(RRDDIM *rd); +extern time_t rrddim_query_oldest_time(RRDDIM *rd); + +#endif diff --git a/database/rrd.c b/database/rrd.c index 321d35615..f91039ea5 100644 --- a/database/rrd.c +++ b/database/rrd.c @@ -2,6 +2,7 @@ #define NETDATA_RRD_INTERNALS 1 #include "rrd.h" +#include "storage_engine.h" // ---------------------------------------------------------------------------- // globals @@ -47,24 +48,19 @@ inline const char *rrd_memory_mode_name(RRD_MEMORY_MODE id) { return RRD_MEMORY_MODE_DBENGINE_NAME; } + STORAGE_ENGINE* eng = storage_engine_get(id); + if (eng) { + return eng->name; + } + return RRD_MEMORY_MODE_SAVE_NAME; } RRD_MEMORY_MODE rrd_memory_mode_id(const char *name) { - if(unlikely(!strcmp(name, RRD_MEMORY_MODE_RAM_NAME))) - return RRD_MEMORY_MODE_RAM; - - else if(unlikely(!strcmp(name, RRD_MEMORY_MODE_MAP_NAME))) - return 
RRD_MEMORY_MODE_MAP; - - else if(unlikely(!strcmp(name, RRD_MEMORY_MODE_NONE_NAME))) - return RRD_MEMORY_MODE_NONE; - - else if(unlikely(!strcmp(name, RRD_MEMORY_MODE_ALLOC_NAME))) - return RRD_MEMORY_MODE_ALLOC; - - else if(unlikely(!strcmp(name, RRD_MEMORY_MODE_DBENGINE_NAME))) - return RRD_MEMORY_MODE_DBENGINE; + STORAGE_ENGINE* eng = storage_engine_find(name); + if (eng) { + return eng->id; + } return RRD_MEMORY_MODE_SAVE; } @@ -139,8 +135,7 @@ const char *rrdset_type_name(RRDSET_TYPE chart_type) { // ---------------------------------------------------------------------------- // RRD - cache directory -char *rrdset_cache_dir(RRDHOST *host, const char *id, const char *config_section) { - UNUSED(config_section); +char *rrdset_cache_dir(RRDHOST *host, const char *id) { char *ret = NULL; char b[FILENAME_MAX + 1]; diff --git a/database/rrd.h b/database/rrd.h index 071e1d038..dc32b2a2d 100644 --- a/database/rrd.h +++ b/database/rrd.h @@ -170,17 +170,12 @@ typedef enum rrddim_flags { RRDDIM_FLAG_ACLK = (1 << 4), RRDDIM_FLAG_PENDING_FOREACH_ALARM = (1 << 5), // set when foreach alarm has not been initialized yet + RRDDIM_FLAG_META_HIDDEN = (1 << 6), // Status of hidden option in the metadata database } RRDDIM_FLAGS; -#ifdef HAVE_C___ATOMIC #define rrddim_flag_check(rd, flag) (__atomic_load_n(&((rd)->flags), __ATOMIC_SEQ_CST) & (flag)) #define rrddim_flag_set(rd, flag) __atomic_or_fetch(&((rd)->flags), (flag), __ATOMIC_SEQ_CST) #define rrddim_flag_clear(rd, flag) __atomic_and_fetch(&((rd)->flags), ~(flag), __ATOMIC_SEQ_CST) -#else -#define rrddim_flag_check(rd, flag) ((rd)->flags & (flag)) -#define rrddim_flag_set(rd, flag) (rd)->flags |= (flag) -#define rrddim_flag_clear(rd, flag) (rd)->flags &= ~(flag) -#endif typedef enum label_source { LABEL_SOURCE_AUTO = 0, @@ -332,53 +327,56 @@ struct rrddim { }; // ---------------------------------------------------------------------------- -// iterator state for RRD dimension data collection -union rrddim_collect_handle { - 
struct { - long slot; - long entries; - } slotted; // state the legacy code uses -#ifdef ENABLE_DBENGINE - struct rrdeng_collect_handle { - struct rrdeng_page_descr *descr, *prev_descr; - unsigned long page_correlation_id; - struct rrdengine_instance *ctx; - // set to 1 when this dimension is not page aligned with the other dimensions in the chart - uint8_t unaligned_page; - } rrdeng; // state the database engine uses -#endif -}; +// engine-specific iterator state for dimension data collection +typedef struct storage_collect_handle STORAGE_COLLECT_HANDLE; // ---------------------------------------------------------------------------- -// iterator state for RRD dimension data queries - -#ifdef ENABLE_DBENGINE -struct rrdeng_query_handle { - struct rrdeng_page_descr *descr; - struct rrdengine_instance *ctx; - struct pg_cache_page_index *page_index; - time_t next_page_time; - time_t now; - unsigned position; -}; -#endif +// engine-specific iterator state for dimension data queries +typedef struct storage_query_handle STORAGE_QUERY_HANDLE; +// ---------------------------------------------------------------------------- +// iterator state for RRD dimension data queries struct rrddim_query_handle { RRDDIM *rd; time_t start_time; time_t end_time; - union { - struct { - long slot; - long last_slot; - uint8_t finished; - } slotted; // state the legacy code uses -#ifdef ENABLE_DBENGINE - struct rrdeng_query_handle rrdeng; // state the database engine uses -#endif - }; + STORAGE_QUERY_HANDLE* handle; }; +// ------------------------------------------------------------------------ +// function pointers that handle data collection +struct rrddim_collect_ops { + // an initialization function to run before starting collection + void (*init)(RRDDIM *rd); + + // run this to store each metric into the database + void (*store_metric)(RRDDIM *rd, usec_t point_in_time, storage_number number); + + // an finalization function to run after collection is over + // returns 1 if it's safe to 
delete the dimension + int (*finalize)(RRDDIM *rd); +}; + +// function pointers that handle database queries +struct rrddim_query_ops { + // run this before starting a series of next_metric() database queries + void (*init)(RRDDIM *rd, struct rrddim_query_handle *handle, time_t start_time, time_t end_time); + + // run this to load each metric number from the database + storage_number (*next_metric)(struct rrddim_query_handle *handle, time_t *current_time); + + // run this to test if the series of next_metric() database queries is finished + int (*is_finished)(struct rrddim_query_handle *handle); + + // run this after finishing a series of load_metric() database queries + void (*finalize)(struct rrddim_query_handle *handle); + + // get the timestamp of the last entry of this metric + time_t (*latest_time)(RRDDIM *rd); + + // get the timestamp of the first entry of this metric + time_t (*oldest_time)(RRDDIM *rd); +}; // ---------------------------------------------------------------------------- // volatile state per RRD dimension @@ -391,42 +389,9 @@ struct rrddim_volatile { int aclk_live_status; #endif uuid_t metric_uuid; // global UUID for this metric (unique_across hosts) - union rrddim_collect_handle handle; - // ------------------------------------------------------------------------ - // function pointers that handle data collection - struct rrddim_collect_ops { - // an initialization function to run before starting collection - void (*init)(RRDDIM *rd); - - // run this to store each metric into the database - void (*store_metric)(RRDDIM *rd, usec_t point_in_time, storage_number number); - - // an finalization function to run after collection is over - // returns 1 if it's safe to delete the dimension - int (*finalize)(RRDDIM *rd); - } collect_ops; - - // function pointers that handle database queries - struct rrddim_query_ops { - // run this before starting a series of next_metric() database queries - void (*init)(RRDDIM *rd, struct rrddim_query_handle 
*handle, time_t start_time, time_t end_time); - - // run this to load each metric number from the database - storage_number (*next_metric)(struct rrddim_query_handle *handle, time_t *current_time); - - // run this to test if the series of next_metric() database queries is finished - int (*is_finished)(struct rrddim_query_handle *handle); - - // run this after finishing a series of load_metric() database queries - void (*finalize)(struct rrddim_query_handle *handle); - - // get the timestamp of the last entry of this metric - time_t (*latest_time)(RRDDIM *rd); - - // get the timestamp of the first entry of this metric - time_t (*oldest_time)(RRDDIM *rd); - } query_ops; - + STORAGE_COLLECT_HANDLE* handle; + struct rrddim_collect_ops collect_ops; + struct rrddim_query_ops query_ops; ml_dimension_t ml_dimension; }; @@ -434,6 +399,7 @@ struct rrddim_volatile { // volatile state per chart struct rrdset_volatile { char *old_title; + char *old_units; char *old_context; uuid_t hash_id; struct label *new_labels; @@ -459,7 +425,6 @@ struct rrdset_volatile { // and may lead to missing information. typedef enum rrdset_flags { - RRDSET_FLAG_ENABLED = 1 << 0, // enables or disables a chart RRDSET_FLAG_DETAIL = 1 << 1, // if set, the data set should be considered as a detail of another // (the master data set should be the one that has the same family and is not detail) RRDSET_FLAG_DEBUG = 1 << 2, // enables or disables debugging for a chart @@ -483,16 +448,9 @@ typedef enum rrdset_flags { RRDSET_FLAG_ANOMALY_DETECTION = 1 << 18 // flag to identify anomaly detection charts. 
} RRDSET_FLAGS; -#ifdef HAVE_C___ATOMIC #define rrdset_flag_check(st, flag) (__atomic_load_n(&((st)->flags), __ATOMIC_SEQ_CST) & (flag)) #define rrdset_flag_set(st, flag) __atomic_or_fetch(&((st)->flags), flag, __ATOMIC_SEQ_CST) #define rrdset_flag_clear(st, flag) __atomic_and_fetch(&((st)->flags), ~flag, __ATOMIC_SEQ_CST) -#else -#define rrdset_flag_check(st, flag) ((st)->flags & (flag)) -#define rrdset_flag_set(st, flag) (st)->flags |= (flag) -#define rrdset_flag_clear(st, flag) (st)->flags &= ~(flag) -#endif -#define rrdset_flag_check_noatomic(st, flag) ((st)->flags & (flag)) struct rrdset { // ------------------------------------------------------------------------ @@ -511,7 +469,7 @@ struct rrdset { // since the config always has a higher priority // (the user overwrites the name of the charts) - char *config_section; // the config section for the chart + void *unused_ptr; // Unused field (previously it held the config section of the chart) char *type; // the type of graph RRD_TYPE_* (a category, for determining graphing options) char *family; // grouping sets under the same family @@ -642,15 +600,9 @@ typedef enum rrdhost_flags { RRDHOST_FLAG_PENDING_FOREACH_ALARMS = 1 << 7, // contains dims with uninitialized foreach alarms } RRDHOST_FLAGS; -#ifdef HAVE_C___ATOMIC #define rrdhost_flag_check(host, flag) (__atomic_load_n(&((host)->flags), __ATOMIC_SEQ_CST) & (flag)) #define rrdhost_flag_set(host, flag) __atomic_or_fetch(&((host)->flags), flag, __ATOMIC_SEQ_CST) #define rrdhost_flag_clear(host, flag) __atomic_and_fetch(&((host)->flags), ~flag, __ATOMIC_SEQ_CST) -#else -#define rrdhost_flag_check(host, flag) ((host)->flags & (flag)) -#define rrdhost_flag_set(host, flag) (host)->flags |= (flag) -#define rrdhost_flag_clear(host, flag) (host)->flags &= ~(flag) -#endif #ifdef NETDATA_INTERNAL_CHECKS #define rrdset_debug(st, fmt, args...) 
do { if(unlikely(debug_flags & D_RRD_STATS && rrdset_flag_check(st, RRDSET_FLAG_DEBUG))) \ @@ -767,6 +719,7 @@ struct rrdhost_system_info { char *install_type; char *prebuilt_arch; char *prebuilt_dist; + int mc_version; }; struct rrdhost { @@ -815,6 +768,8 @@ struct rrdhost { unsigned int rrdpush_send_enabled; // 1 when this host sends metrics to another netdata char *rrdpush_send_destination; // where to send metrics to char *rrdpush_send_api_key; // the api key at the receiving netdata + struct rrdpush_destinations *destinations; // a linked list of possible destinations + struct rrdpush_destinations *destination; // the current destination from the above list // the following are state information for the threading // streaming metrics from this netdata to an upstream netdata @@ -842,10 +797,14 @@ struct rrdhost { volatile size_t connected_senders; // when remote hosts are streaming to this // host, this is the counter of connected clients + time_t senders_connect_time; // the time the last sender was connected + time_t senders_last_chart_command; // the time of the last CHART streaming command time_t senders_disconnected_time; // the time the last sender was disconnected struct receiver_state *receiver; netdata_mutex_t receiver_lock; + int trigger_chart_obsoletion_check; // set when child connects, will instruct parent to + // trigger a check for obsoleted charts since previous connect // ------------------------------------------------------------------------ // health monitoring options @@ -1120,8 +1079,8 @@ extern void rrdset_is_obsolete(RRDSET *st); extern void rrdset_isnot_obsolete(RRDSET *st); // checks if the RRDSET should be offered to viewers -#define rrdset_is_available_for_viewers(st) (rrdset_flag_check(st, RRDSET_FLAG_ENABLED) && !rrdset_flag_check(st, RRDSET_FLAG_HIDDEN) && !rrdset_flag_check(st, RRDSET_FLAG_OBSOLETE) && !rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED) && (st)->dimensions && (st)->rrd_memory_mode != RRD_MEMORY_MODE_NONE) -#define 
rrdset_is_available_for_exporting_and_alarms(st) (rrdset_flag_check(st, RRDSET_FLAG_ENABLED) && !rrdset_flag_check(st, RRDSET_FLAG_OBSOLETE) && !rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED) && (st)->dimensions) +#define rrdset_is_available_for_viewers(st) (!rrdset_flag_check(st, RRDSET_FLAG_HIDDEN) && !rrdset_flag_check(st, RRDSET_FLAG_OBSOLETE) && !rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED) && (st)->dimensions && (st)->rrd_memory_mode != RRD_MEMORY_MODE_NONE) +#define rrdset_is_available_for_exporting_and_alarms(st) (!rrdset_flag_check(st, RRDSET_FLAG_OBSOLETE) && !rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED) && (st)->dimensions) #define rrdset_is_archived(st) (rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED) && (st)->dimensions) // get the total duration in seconds of the round robin database @@ -1322,7 +1281,9 @@ extern void rrddim_isnot_obsolete(RRDSET *st, RRDDIM *rd); extern collected_number rrddim_set_by_pointer(RRDSET *st, RRDDIM *rd, collected_number value); extern collected_number rrddim_set(RRDSET *st, const char *id, collected_number value); - +#if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL) +extern time_t calc_dimension_liveness(RRDDIM *rd, time_t now); +#endif extern long align_entries_to_pagesize(RRD_MEMORY_MODE mode, long entries); // ---------------------------------------------------------------------------- @@ -1339,10 +1300,9 @@ extern int alarm_compare_name(void *a, void *b); extern avl_tree_lock rrdhost_root_index; extern char *rrdset_strncpyz_name(char *to, const char *from, size_t length); -extern char *rrdset_cache_dir(RRDHOST *host, const char *id, const char *config_section); +extern char *rrdset_cache_dir(RRDHOST *host, const char *id); -#define rrddim_free(st, rd) rrddim_free_custom(st, rd, 0) -extern void rrddim_free_custom(RRDSET *st, RRDDIM *rd, int db_rotated); +extern void rrddim_free(RRDSET *st, RRDDIM *rd); extern int rrddim_compare(void* a, void* b); extern int rrdset_compare(void* a, void* b); diff --git 
a/database/rrdcalc.c b/database/rrdcalc.c index 1b1a14960..b29a0ffc0 100644 --- a/database/rrdcalc.c +++ b/database/rrdcalc.c @@ -81,35 +81,32 @@ static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) { if(!rc->units) rc->units = strdupz(st->units); - if(!rrdcalc_isrepeating(rc)) { - time_t now = now_realtime_sec(); - ALARM_ENTRY *ae = health_create_alarm_entry( - host, - rc->id, - rc->next_event_id++, - rc->config_hash_id, - now, - rc->name, - rc->rrdset->id, - rc->rrdset->family, - rc->classification, - rc->component, - rc->type, - rc->exec, - rc->recipient, - now - rc->last_status_change, - rc->old_value, - rc->value, - rc->status, - RRDCALC_STATUS_UNINITIALIZED, - rc->source, - rc->units, - rc->info, - 0, - 0 - ); - health_alarm_log(host, ae); - } + time_t now = now_realtime_sec(); + ALARM_ENTRY *ae = health_create_alarm_entry( + host, + rc->id, + rc->next_event_id++, + rc->config_hash_id, + now, + rc->name, + rc->rrdset->id, + rc->rrdset->family, + rc->classification, + rc->component, + rc->type, + rc->exec, + rc->recipient, + now - rc->last_status_change, + rc->old_value, + rc->value, + rc->status, + RRDCALC_STATUS_UNINITIALIZED, + rc->source, + rc->units, + rc->info, + 0, + 0); + health_alarm_log(host, ae); } static inline int rrdcalc_test_additional_restriction(RRDCALC *rc, RRDSET *st){ @@ -119,6 +116,31 @@ static inline int rrdcalc_test_additional_restriction(RRDCALC *rc, RRDSET *st){ if (rc->plugin_match && !simple_pattern_matches(rc->plugin_pattern, st->plugin_name)) return 0; + if (rc->labels) { + int labels_count=1; + int labels_match=0; + char *s = rc->labels; + while (*s) { + if (*s==' ') + labels_count++; + s++; + } + RRDHOST *host = st->rrdhost; + char cmp[CONFIG_FILE_LINE_MAX+1]; + struct label *move = host->labels.head; + while(move) { + snprintf(cmp, CONFIG_FILE_LINE_MAX, "%s=%s", move->key, move->value); + if (simple_pattern_matches(rc->splabels, move->key) || + simple_pattern_matches(rc->splabels, cmp)) { + labels_match++; + } + move = 
move->next; + } + + if (labels_match != labels_count) + return 0; + } + return 1; } @@ -159,35 +181,32 @@ inline void rrdsetcalc_unlink(RRDCALC *rc) { RRDHOST *host = st->rrdhost; - if(!rrdcalc_isrepeating(rc)) { - time_t now = now_realtime_sec(); - ALARM_ENTRY *ae = health_create_alarm_entry( - host, - rc->id, - rc->next_event_id++, - rc->config_hash_id, - now, - rc->name, - rc->rrdset->id, - rc->rrdset->family, - rc->classification, - rc->component, - rc->type, - rc->exec, - rc->recipient, - now - rc->last_status_change, - rc->old_value, - rc->value, - rc->status, - RRDCALC_STATUS_REMOVED, - rc->source, - rc->units, - rc->info, - 0, - 0 - ); - health_alarm_log(host, ae); - } + time_t now = now_realtime_sec(); + ALARM_ENTRY *ae = health_create_alarm_entry( + host, + rc->id, + rc->next_event_id++, + rc->config_hash_id, + now, + rc->name, + rc->rrdset->id, + rc->rrdset->family, + rc->classification, + rc->component, + rc->type, + rc->exec, + rc->recipient, + now - rc->last_status_change, + rc->old_value, + rc->value, + rc->status, + RRDCALC_STATUS_REMOVED, + rc->source, + rc->units, + rc->info, + 0, + 0); + health_alarm_log(host, ae); debug(D_HEALTH, "Health unlinking alarm '%s.%s' from chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, host->hostname); @@ -293,19 +312,15 @@ inline uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const ch char *alarm_name_with_dim(char *name, size_t namelen, const char *dim, size_t dimlen) { char *newname,*move; - newname = malloc(namelen + dimlen + 2); - if(newname) { - move = newname; - memcpy(move, name, namelen); - move += namelen; + newname = mallocz(namelen + dimlen + 2); + move = newname; + memcpy(move, name, namelen); + move += namelen; - *move++ = '_'; - memcpy(move, dim, dimlen); - move += dimlen; - *move = '\0'; - } else { - newname = name; - } + *move++ = '_'; + memcpy(move, dim, dimlen); + move += dimlen; + *move = '\0'; return newname; } @@ -422,6 +437,7 @@ inline RRDCALC 
*rrdcalc_create_from_template(RRDHOST *host, RRDCALCTEMPLATE *rt, rc->delay_multiplier = rt->delay_multiplier; rc->last_repeat = 0; + rc->times_repeat = 0; rc->warn_repeat_every = rt->warn_repeat_every; rc->crit_repeat_every = rt->crit_repeat_every; @@ -534,6 +550,7 @@ inline RRDCALC *rrdcalc_create_from_rrdcalc(RRDCALC *rc, RRDHOST *host, const ch newrc->delay_multiplier = rc->delay_multiplier; newrc->last_repeat = 0; + newrc->times_repeat = 0; newrc->warn_repeat_every = rc->warn_repeat_every; newrc->crit_repeat_every = rc->crit_repeat_every; diff --git a/database/rrdcalc.h b/database/rrdcalc.h index d7446f63a..2ae47788e 100644 --- a/database/rrdcalc.h +++ b/database/rrdcalc.h @@ -121,6 +121,7 @@ struct rrdcalc { time_t next_update; // the next update timestamp of the alarm time_t last_status_change; // the timestamp of the last time this alarm changed status time_t last_repeat; // the last time the alarm got repeated + uint32_t times_repeat; // number of times the alarm got repeated time_t db_after; // the first timestamp evaluated by the db lookup time_t db_before; // the last timestamp evaluated by the db lookup diff --git a/database/rrddim.c b/database/rrddim.c index df45363bc..e488d8b0b 100644 --- a/database/rrddim.c +++ b/database/rrddim.c @@ -2,6 +2,10 @@ #define NETDATA_RRD_INTERNALS #include "rrd.h" +#ifdef ENABLE_DBENGINE +#include "database/engine/rrdengineapi.h" +#endif +#include "storage_engine.h" // ---------------------------------------------------------------------------- // RRDDIM index @@ -38,14 +42,15 @@ inline RRDDIM *rrddim_find(RRDSET *st, const char *id) { // RRDDIM rename a dimension inline int rrddim_set_name(RRDSET *st, RRDDIM *rd, const char *name) { - if(unlikely(!name || !*name || !strcmp(rd->name, name))) + if(unlikely(!name || !*name || (rd->name && !strcmp(rd->name, name)))) return 0; debug(D_RRD_CALLS, "rrddim_set_name() from %s.%s to %s.%s", st->name, rd->name, st->name, name); - char varname[CONFIG_MAX_NAME + 1]; - 
snprintfz(varname, CONFIG_MAX_NAME, "dim %s name", rd->id); - rd->name = config_set_default(st->config_section, varname, name); + if (rd->name) + freez((void *) rd->name); + + rd->name = strdupz(name); rd->hash_name = simple_hash(rd->name); if (!st->state->is_ar_chart) @@ -96,113 +101,65 @@ inline int rrddim_set_divisor(RRDSET *st, RRDDIM *rd, collected_number divisor) } // ---------------------------------------------------------------------------- -// RRDDIM legacy data collection functions - -static void rrddim_collect_init(RRDDIM *rd) { - rd->values[rd->rrdset->current_entry] = SN_EMPTY_SLOT; -} -static void rrddim_collect_store_metric(RRDDIM *rd, usec_t point_in_time, storage_number number) { - (void)point_in_time; - - rd->values[rd->rrdset->current_entry] = number; -} -static int rrddim_collect_finalize(RRDDIM *rd) { - (void)rd; - - return 0; -} - -// ---------------------------------------------------------------------------- -// RRDDIM legacy database query functions - -static void rrddim_query_init(RRDDIM *rd, struct rrddim_query_handle *handle, time_t start_time, time_t end_time) { - handle->rd = rd; - handle->start_time = start_time; - handle->end_time = end_time; - handle->slotted.slot = rrdset_time2slot(rd->rrdset, start_time); - handle->slotted.last_slot = rrdset_time2slot(rd->rrdset, end_time); - handle->slotted.finished = 0; -} - -static storage_number rrddim_query_next_metric(struct rrddim_query_handle *handle, time_t *current_time) { - RRDDIM *rd = handle->rd; - long entries = rd->rrdset->entries; - long slot = handle->slotted.slot; - - (void)current_time; - if (unlikely(handle->slotted.slot == handle->slotted.last_slot)) - handle->slotted.finished = 1; - storage_number n = rd->values[slot++]; - - if(unlikely(slot >= entries)) slot = 0; - handle->slotted.slot = slot; - - return n; -} - -static int rrddim_query_is_finished(struct rrddim_query_handle *handle) { - return handle->slotted.finished; -} - -static void rrddim_query_finalize(struct 
rrddim_query_handle *handle) { - (void)handle; - - return; -} - -static time_t rrddim_query_latest_time(RRDDIM *rd) { - return rrdset_last_entry_t_nolock(rd->rrdset); -} - -static time_t rrddim_query_oldest_time(RRDDIM *rd) { - return rrdset_first_entry_t_nolock(rd->rrdset); -} - - -// ---------------------------------------------------------------------------- // RRDDIM create a dimension void rrdcalc_link_to_rrddim(RRDDIM *rd, RRDSET *st, RRDHOST *host) { RRDCALC *rrdc; + for (rrdc = host->alarms_with_foreach; rrdc ; rrdc = rrdc->next) { if (simple_pattern_matches(rrdc->spdim, rd->id) || simple_pattern_matches(rrdc->spdim, rd->name)) { if (rrdc->hash_chart == st->hash_name || !strcmp(rrdc->chart, st->name) || !strcmp(rrdc->chart, st->id)) { char *name = alarm_name_with_dim(rrdc->name, strlen(rrdc->name), rd->name, strlen(rd->name)); - if (name) { - if(rrdcalc_exists(host, st->name, name, 0, 0)){ - freez(name); - continue; - } + if(rrdcalc_exists(host, st->name, name, 0, 0)) { + freez(name); + continue; + } + + netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock); + RRDCALC *child = rrdcalc_create_from_rrdcalc(rrdc, host, name, rd->name); + netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); - RRDCALC *child = rrdcalc_create_from_rrdcalc(rrdc, host, name, rd->name); - if (child) { - rrdcalc_add_to_host(host, child); - RRDCALC *rdcmp = (RRDCALC *) avl_insert_lock(&(host)->alarms_idx_health_log,(avl_t *)child); - if (rdcmp != child) { - error("Cannot insert the alarm index ID %s",child->name); - } - } else { - error("Cannot allocate a new alarm."); - rrdc->foreachcounter--; + if (child) { + rrdcalc_add_to_host(host, child); + RRDCALC *rdcmp = (RRDCALC *) avl_insert_lock(&(host)->alarms_idx_health_log,(avl_t *)child); + if (rdcmp != child) { + error("Cannot insert the alarm index ID %s",child->name); } } + else { + error("Cannot allocate a new alarm."); + rrdc->foreachcounter--; + } } } } -#ifdef ENABLE_ACLK - rrdset_flag_clear(st, RRDSET_FLAG_ACLK); 
-#endif } +// Return either +// 0 : Dimension is live +// last collected time : Dimension is not live + +#if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL) +time_t calc_dimension_liveness(RRDDIM *rd, time_t now) +{ + time_t last_updated = rd->last_collected_time.tv_sec; + int live; + if (rd->state->aclk_live_status == 1) + live = + ((now - last_updated) < + MIN(rrdset_free_obsolete_time, RRDSET_MINIMUM_DIM_OFFLINE_MULTIPLIER * rd->update_every)); + else + live = ((now - last_updated) < RRDSET_MINIMUM_DIM_LIVE_MULTIPLIER * rd->update_every); + return live ? 0 : last_updated; +} +#endif + RRDDIM *rrddim_add_custom(RRDSET *st, const char *id, const char *name, collected_number multiplier, collected_number divisor, RRD_ALGORITHM algorithm, RRD_MEMORY_MODE memory_mode) { RRDHOST *host = st->rrdhost; rrdset_wrlock(st); - rrdset_flag_set(st, RRDSET_FLAG_SYNC_CLOCK); - rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED); - RRDDIM *rd = rrddim_find(st, id); if(unlikely(rd)) { debug(D_RRD_CALLS, "Cannot create rrd dimension '%s/%s', it already exists.", st->id, name?name:"<NONAME>"); @@ -227,11 +184,19 @@ RRDDIM *rrddim_add_custom(RRDSET *st, const char *id, const char *name, collecte debug(D_METADATALOG, "DIMENSION [%s] metadata updated", rd->id); (void)sql_store_dimension(&rd->state->metric_uuid, rd->rrdset->chart_uuid, rd->id, rd->name, rd->multiplier, rd->divisor, rd->algorithm); +#if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL) + queue_dimension_to_aclk(rd, calc_dimension_liveness(rd, now_realtime_sec())); +#endif + rrdset_flag_set(st, RRDSET_FLAG_SYNC_CLOCK); + rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED); } rrdset_unlock(st); return rd; } + rrdset_flag_set(st, RRDSET_FLAG_SYNC_CLOCK); + rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED); + char filename[FILENAME_MAX + 1]; char fullfilename[FILENAME_MAX + 1]; @@ -244,12 +209,11 @@ RRDDIM *rrddim_add_custom(RRDSET *st, const char *id, const char *name, collecte if(memory_mode == 
RRD_MEMORY_MODE_SAVE || memory_mode == RRD_MEMORY_MODE_MAP || memory_mode == RRD_MEMORY_MODE_RAM) { - rd = (RRDDIM *)mymmap( - (memory_mode == RRD_MEMORY_MODE_RAM) ? NULL : fullfilename - , size - , ((memory_mode == RRD_MEMORY_MODE_MAP) ? MAP_SHARED : MAP_PRIVATE) - , 1 - ); + rd = (RRDDIM *)netdata_mmap( + (memory_mode == RRD_MEMORY_MODE_RAM) ? NULL : fullfilename, + size, + ((memory_mode == RRD_MEMORY_MODE_MAP) ? MAP_SHARED : MAP_PRIVATE), + 1); if(likely(rd)) { // we have a file mapped for rd @@ -369,30 +333,16 @@ RRDDIM *rrddim_add_custom(RRDSET *st, const char *id, const char *name, collecte rd->state->aclk_live_status = -1; #endif (void) find_dimension_uuid(st, rd, &(rd->state->metric_uuid)); - if(memory_mode == RRD_MEMORY_MODE_DBENGINE) { + + STORAGE_ENGINE* eng = storage_engine_get(memory_mode); + rd->state->collect_ops = eng->api.collect_ops; + rd->state->query_ops = eng->api.query_ops; + #ifdef ENABLE_DBENGINE + if(memory_mode == RRD_MEMORY_MODE_DBENGINE) { rrdeng_metric_init(rd); - rd->state->collect_ops.init = rrdeng_store_metric_init; - rd->state->collect_ops.store_metric = rrdeng_store_metric_next; - rd->state->collect_ops.finalize = rrdeng_store_metric_finalize; - rd->state->query_ops.init = rrdeng_load_metric_init; - rd->state->query_ops.next_metric = rrdeng_load_metric_next; - rd->state->query_ops.is_finished = rrdeng_load_metric_is_finished; - rd->state->query_ops.finalize = rrdeng_load_metric_finalize; - rd->state->query_ops.latest_time = rrdeng_metric_latest_time; - rd->state->query_ops.oldest_time = rrdeng_metric_oldest_time; -#endif - } else { - rd->state->collect_ops.init = rrddim_collect_init; - rd->state->collect_ops.store_metric = rrddim_collect_store_metric; - rd->state->collect_ops.finalize = rrddim_collect_finalize; - rd->state->query_ops.init = rrddim_query_init; - rd->state->query_ops.next_metric = rrddim_query_next_metric; - rd->state->query_ops.is_finished = rrddim_query_is_finished; - rd->state->query_ops.finalize = 
rrddim_query_finalize; - rd->state->query_ops.latest_time = rrddim_query_latest_time; - rd->state->query_ops.oldest_time = rrddim_query_oldest_time; } +#endif store_active_dimension(&rd->state->metric_uuid); rd->state->collect_ops.init(rd); // append this dimension @@ -437,22 +387,16 @@ RRDDIM *rrddim_add_custom(RRDSET *st, const char *id, const char *name, collecte ml_new_dimension(rd); rrdset_unlock(st); -#ifdef ENABLE_ACLK - rrdset_flag_clear(st, RRDSET_FLAG_ACLK); -#endif return(rd); } // ---------------------------------------------------------------------------- // RRDDIM remove / free a dimension -void rrddim_free_custom(RRDSET *st, RRDDIM *rd, int db_rotated) +void rrddim_free(RRDSET *st, RRDDIM *rd) { ml_delete_dimension(rd); - -#ifndef ENABLE_ACLK - UNUSED(db_rotated); -#endif + debug(D_RRD_CALLS, "rrddim_free() %s.%s", st->name, rd->name); if (!rrddim_flag_check(rd, RRDDIM_FLAG_ARCHIVED)) { @@ -483,10 +427,10 @@ void rrddim_free_custom(RRDSET *st, RRDDIM *rd, int db_rotated) error("RRDDIM: INTERNAL ERROR: attempt to remove from index dimension '%s' on chart '%s', removed a different dimension.", rd->id, st->id); // free(rd->annotations); -#if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL) - if (!netdata_exit) - aclk_send_dimension_update(rd); -#endif +//#if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL) +// if (!netdata_exit) +// aclk_send_dimension_update(rd); +//#endif RRD_MEMORY_MODE rrd_memory_mode = rd->rrd_memory_mode; switch(rrd_memory_mode) { @@ -512,10 +456,6 @@ void rrddim_free_custom(RRDSET *st, RRDDIM *rd, int db_rotated) freez(rd); break; } -#ifdef ENABLE_ACLK - if (db_rotated || RRD_MEMORY_MODE_DBENGINE != rrd_memory_mode) - rrdset_flag_clear(st, RRDSET_FLAG_ACLK); -#endif } @@ -532,12 +472,11 @@ int rrddim_hide(RRDSET *st, const char *id) { error("Cannot find dimension with id '%s' on stats '%s' (%s) on host '%s'.", id, st->name, st->id, host->hostname); return 1; } - (void) 
sql_set_dimension_option(&rd->state->metric_uuid, "hidden"); + if (!rrddim_flag_check(rd, RRDDIM_FLAG_META_HIDDEN)) + (void)sql_set_dimension_option(&rd->state->metric_uuid, "hidden"); rrddim_flag_set(rd, RRDDIM_FLAG_HIDDEN); -#ifdef ENABLE_ACLK - rrdset_flag_clear(st, RRDSET_FLAG_ACLK); -#endif + rrddim_flag_set(rd, RRDDIM_FLAG_META_HIDDEN); return 0; } @@ -550,12 +489,11 @@ int rrddim_unhide(RRDSET *st, const char *id) { error("Cannot find dimension with id '%s' on stats '%s' (%s) on host '%s'.", id, st->name, st->id, host->hostname); return 1; } - (void) sql_set_dimension_option(&rd->state->metric_uuid, NULL); + if (rrddim_flag_check(rd, RRDDIM_FLAG_META_HIDDEN)) + (void)sql_set_dimension_option(&rd->state->metric_uuid, NULL); rrddim_flag_clear(rd, RRDDIM_FLAG_HIDDEN); -#ifdef ENABLE_ACLK - rrdset_flag_clear(st, RRDSET_FLAG_ACLK); -#endif + rrddim_flag_clear(rd, RRDDIM_FLAG_META_HIDDEN); return 0; } @@ -568,18 +506,12 @@ inline void rrddim_is_obsolete(RRDSET *st, RRDDIM *rd) { } rrddim_flag_set(rd, RRDDIM_FLAG_OBSOLETE); rrdset_flag_set(st, RRDSET_FLAG_OBSOLETE_DIMENSIONS); -#ifdef ENABLE_ACLK - rrdset_flag_clear(st, RRDSET_FLAG_ACLK); -#endif } inline void rrddim_isnot_obsolete(RRDSET *st __maybe_unused, RRDDIM *rd) { debug(D_RRD_CALLS, "rrddim_isnot_obsolete() for chart %s, dimension %s", st->name, rd->name); rrddim_flag_clear(rd, RRDDIM_FLAG_OBSOLETE); -#ifdef ENABLE_ACLK - rrdset_flag_clear(st, RRDSET_FLAG_ACLK); -#endif } // ---------------------------------------------------------------------------- diff --git a/database/rrdhost.c b/database/rrdhost.c index 649736ca4..cb56bf353 100644 --- a/database/rrdhost.c +++ b/database/rrdhost.c @@ -181,6 +181,8 @@ RRDHOST *rrdhost_create(const char *hostname, host->rrdpush_send_enabled = (rrdpush_enabled && rrdpush_destination && *rrdpush_destination && rrdpush_api_key && *rrdpush_api_key) ? 
1 : 0; host->rrdpush_send_destination = (host->rrdpush_send_enabled)?strdupz(rrdpush_destination):NULL; + if (host->rrdpush_send_destination) + host->destinations = destinations_init(host->rrdpush_send_destination); host->rrdpush_send_api_key = (host->rrdpush_send_enabled)?strdupz(rrdpush_api_key):NULL; host->rrdpush_send_charts_matching = simple_pattern_create(rrdpush_send_charts_matching, NULL, SIMPLE_PATTERN_EXACT); @@ -390,6 +392,7 @@ RRDHOST *rrdhost_create(const char *hostname, if (is_localhost && host->system_info) { host->system_info->ml_capable = ml_capable(); host->system_info->ml_enabled = ml_enabled(host); + host->system_info->mc_version = enable_metric_correlations ? metric_correlations_version : 0; } ml_new_host(host); @@ -698,7 +701,7 @@ int rrd_init(char *hostname, struct rrdhost_system_info *system_info) { if (gap_when_lost_iterations_above < 1) gap_when_lost_iterations_above = 1; - if (unlikely(sql_init_database(DB_CHECK_NONE))) { + if (unlikely(sql_init_database(DB_CHECK_NONE, 0))) { if (default_rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) fatal("Failed to initialize SQLite"); info("Skipping SQLITE metadata initialization since memory mode is not db engine"); @@ -1488,9 +1491,8 @@ restart_after_removal: continue; } #if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL) - else { - aclk_send_dimension_update(rd); - } + else + queue_dimension_to_aclk(rd, rd->last_collected_time.tv_sec); #endif } last = rd; @@ -1528,6 +1530,18 @@ restart_after_removal: } } +void rrdset_check_obsoletion(RRDHOST *host) +{ + RRDSET *st; + time_t last_entry_t; + rrdset_foreach_read(st, host) { + last_entry_t = rrdset_last_entry_t(st); + if (last_entry_t && last_entry_t < host->senders_connect_time) { + rrdset_is_obsolete(st); + } + } +} + void rrd_cleanup_obsolete_charts() { rrd_rdlock(); @@ -1547,6 +1561,16 @@ void rrd_cleanup_obsolete_charts() #endif rrdhost_unlock(host); } + + if (host != localhost && + host->trigger_chart_obsoletion_check && + 
host->senders_last_chart_command && + host->senders_last_chart_command + 120 < now_realtime_sec()) { + rrdhost_rdlock(host); + rrdset_check_obsoletion(host); + rrdhost_unlock(host); + host->trigger_chart_obsoletion_check = 0; + } } rrd_unlock(); diff --git a/database/rrdset.c b/database/rrdset.c index f8e471be7..e7cb89df0 100644 --- a/database/rrdset.c +++ b/database/rrdset.c @@ -125,7 +125,7 @@ char *rrdset_strncpyz_name(char *to, const char *from, size_t length) { char c, *p = to; while (length-- && (c = *from++)) { - if(c != '.' && !isalnum(c)) + if(c != '.' && c != '-' && !isalnum(c)) c = '_'; *p++ = c; @@ -366,11 +366,6 @@ void rrdset_free(RRDSET *st) { rrdvar_free_remaining_variables(host, &st->rrdvar_root_index); // ------------------------------------------------------------------------ - // remove it from the configuration - - appconfig_section_destroy_non_loaded(&netdata_config, st->config_section); - - // ------------------------------------------------------------------------ // unlink it from the host if(st == host->rrdset_root) { @@ -402,10 +397,10 @@ void rrdset_free(RRDSET *st) { freez(st->units); freez(st->context); freez(st->cache_dir); - freez(st->config_section); freez(st->plugin_name); freez(st->module_name); freez(st->state->old_title); + freez(st->state->old_units); freez(st->state->old_context); free_label_list(st->state->labels.head); freez(st->state); @@ -557,6 +552,10 @@ RRDSET *rrdset_create_custom( return NULL; } + if (host != localhost) { + host->senders_last_chart_command = now_realtime_sec(); + } + // ------------------------------------------------------------------------ // check if it already exists @@ -567,15 +566,13 @@ RRDSET *rrdset_create_custom( RRDSET *st = rrdset_find_on_create(host, fullid); if (st) { int mark_rebuild = 0; - rrdset_flag_set(st, RRDSET_FLAG_SYNC_CLOCK); - rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED); if (rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED)) { rrdset_flag_clear(st, RRDSET_FLAG_ARCHIVED); 
changed_from_archived_to_active = 1; mark_rebuild |= META_CHART_ACTIVATED; } char *old_plugin = NULL, *old_module = NULL, *old_title = NULL, *old_context = NULL, - *old_title_v = NULL, *old_context_v = NULL; + *old_title_v = NULL, *old_context_v = NULL, *old_units_v = NULL, *old_units = NULL; int rc; if(unlikely(name)) @@ -635,6 +632,17 @@ RRDSET *rrdset_create_custom( mark_rebuild |= META_CHART_UPDATED; } + if (unlikely(units && st->state->old_units && strcmp(st->state->old_units, units))) { + char *new_units = strdupz(units); + old_units_v = st->state->old_units; + st->state->old_units = strdupz(units); + json_fix_string(new_units); + old_units= st->units; + st->units = new_units; + mark_rebuild |= META_CHART_UPDATED; + } + + if (st->chart_type != chart_type) { st->chart_type = chart_type; mark_rebuild |= META_CHART_UPDATED; @@ -671,8 +679,10 @@ RRDSET *rrdset_create_custom( freez(old_plugin); freez(old_module); freez(old_title); + freez(old_units); freez(old_context); freez(old_title_v); + freez(old_units_v); freez(old_context_v); if (mark_rebuild != META_CHART_ACTIVATED) { info("Collector updated metadata for chart %s", st->id); @@ -684,6 +694,11 @@ RRDSET *rrdset_create_custom( int rc = update_chart_metadata(st->chart_uuid, st, id, name); if (unlikely(rc)) error_report("Failed to update chart metadata in the database"); + + if (!changed_from_archived_to_active) { + rrdset_flag_set(st, RRDSET_FLAG_SYNC_CLOCK); + rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED); + } } /* Fall-through during switch from archived to active so that the host lock is taken and health is linked */ if (!changed_from_archived_to_active) @@ -713,26 +728,14 @@ RRDSET *rrdset_create_custom( char fullfilename[FILENAME_MAX + 1]; // ------------------------------------------------------------------------ - // compose the config_section for this chart - - char config_section[RRD_ID_LENGTH_MAX + GUID_LEN + 2]; - if(host == localhost) - strcpy(config_section, fullid); - else - 
snprintfz(config_section, RRD_ID_LENGTH_MAX + GUID_LEN + 1, "%s/%s", host->machine_guid, fullid); - - // ------------------------------------------------------------------------ // get the options from the config, we need to create it - long entries; - int enabled = config_get_boolean(config_section, "enabled", 1); - if(!enabled || memory_mode == RRD_MEMORY_MODE_DBENGINE) - entries = 5; - else + long entries = 5; + if (memory_mode != RRD_MEMORY_MODE_DBENGINE) entries = align_entries_to_pagesize(memory_mode, history_entries); unsigned long size = sizeof(RRDSET); - char *cache_dir = rrdset_cache_dir(host, fullid, config_section); + char *cache_dir = rrdset_cache_dir(host, fullid); time_t now = now_realtime_sec(); @@ -744,12 +747,11 @@ RRDSET *rrdset_create_custom( snprintfz(fullfilename, FILENAME_MAX, "%s/main.db", cache_dir); if(memory_mode == RRD_MEMORY_MODE_SAVE || memory_mode == RRD_MEMORY_MODE_MAP || memory_mode == RRD_MEMORY_MODE_RAM) { - st = (RRDSET *) mymmap( - (memory_mode == RRD_MEMORY_MODE_RAM) ? NULL : fullfilename - , size - , ((memory_mode == RRD_MEMORY_MODE_MAP) ? MAP_SHARED : MAP_PRIVATE) - , 0 - ); + st = (RRDSET *)netdata_mmap( + (memory_mode == RRD_MEMORY_MODE_RAM) ? NULL : fullfilename, + size, + ((memory_mode == RRD_MEMORY_MODE_MAP) ? MAP_SHARED : MAP_PRIVATE), + 0); if(st) { memset(&st->avl, 0, sizeof(avl_t)); @@ -759,7 +761,6 @@ RRDSET *rrdset_create_custom( memset(&st->rrdset_rwlock, 0, sizeof(netdata_rwlock_t)); st->name = NULL; - st->config_section = NULL; st->type = NULL; st->family = NULL; st->title = NULL; @@ -832,7 +833,6 @@ RRDSET *rrdset_create_custom( st->plugin_name = plugin?strdupz(plugin):NULL; st->module_name = module?strdupz(module):NULL; - st->config_section = strdupz(config_section); st->rrdhost = host; st->memsize = size; st->entries = entries; @@ -859,6 +859,7 @@ RRDSET *rrdset_create_custom( st->state->is_ar_chart = strcmp(st->id, ML_ANOMALY_RATES_CHART_ID) == 0; st->units = units ? 
strdupz(units) : strdupz(""); + st->state->old_units = strdupz(st->units); json_fix_string(st->units); st->context = context ? strdupz(context) : strdupz(st->id); @@ -867,27 +868,9 @@ RRDSET *rrdset_create_custom( st->hash_context = simple_hash(st->context); st->priority = priority; - if(enabled) - rrdset_flag_set(st, RRDSET_FLAG_ENABLED); - else - rrdset_flag_clear(st, RRDSET_FLAG_ENABLED); - rrdset_flag_clear(st, RRDSET_FLAG_DETAIL); - rrdset_flag_clear(st, RRDSET_FLAG_DEBUG); - rrdset_flag_clear(st, RRDSET_FLAG_OBSOLETE); - rrdset_flag_clear(st, RRDSET_FLAG_EXPORTING_SEND); - rrdset_flag_clear(st, RRDSET_FLAG_EXPORTING_IGNORE); - rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_SEND); - rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_IGNORE); - rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED); rrdset_flag_set(st, RRDSET_FLAG_SYNC_CLOCK); - // if(!strcmp(st->id, "disk_util.dm-0")) { - // st->debug = 1; - // error("enabled debugging for '%s'", st->id); - // } - // else error("not enabled debugging for '%s'", st->id); - st->green = NAN; st->red = NAN; @@ -960,7 +943,7 @@ RRDSET *rrdset_create_custom( // RRDSET - data collection iteration control inline void rrdset_next_usec_unfiltered(RRDSET *st, usec_t microseconds) { - if(unlikely(!st->last_collected_time.tv_sec || !microseconds || (rrdset_flag_check_noatomic(st, RRDSET_FLAG_SYNC_CLOCK)))) { + if(unlikely(!st->last_collected_time.tv_sec || !microseconds || (rrdset_flag_check(st, RRDSET_FLAG_SYNC_CLOCK)))) { // call the full next_usec() function rrdset_next_usec(st, microseconds); return; @@ -978,7 +961,7 @@ inline void rrdset_next_usec(RRDSET *st, usec_t microseconds) { usec_t discarded = microseconds; #endif - if(unlikely(rrdset_flag_check_noatomic(st, RRDSET_FLAG_SYNC_CLOCK))) { + if(unlikely(rrdset_flag_check(st, RRDSET_FLAG_SYNC_CLOCK))) { // the chart needs to be re-synced to current time rrdset_flag_clear(st, RRDSET_FLAG_SYNC_CLOCK); @@ -1010,7 +993,9 @@ inline void rrdset_next_usec(RRDSET *st, usec_t microseconds) 
{ if(unlikely(since_last_usec < 0)) { // oops! the database is in the future + #ifdef NETDATA_INTERNAL_CHECKS info("RRD database for chart '%s' on host '%s' is %0.5" LONG_DOUBLE_MODIFIER " secs in the future (counter #%zu, update #%zu). Adjusting it to current time.", st->id, st->rrdhost->hostname, (LONG_DOUBLE)-since_last_usec / USEC_PER_SEC, st->counter, st->counter_done); + #endif st->last_collected_time.tv_sec = now.tv_sec - st->update_every; st->last_collected_time.tv_usec = now.tv_usec; @@ -1027,7 +1012,9 @@ inline void rrdset_next_usec(RRDSET *st, usec_t microseconds) { } else if(unlikely((usec_t)since_last_usec > (usec_t)(st->update_every * 5 * USEC_PER_SEC))) { // oops! the database is too far behind + #ifdef NETDATA_INTERNAL_CHECKS info("RRD database for chart '%s' on host '%s' is %0.5" LONG_DOUBLE_MODIFIER " secs in the past (counter #%zu, update #%zu). Adjusting it to current time.", st->id, st->rrdhost->hostname, (LONG_DOUBLE)since_last_usec / USEC_PER_SEC, st->counter, st->counter_done); + #endif microseconds = (usec_t)since_last_usec; #ifdef NETDATA_INTERNAL_CHECKS @@ -1403,8 +1390,9 @@ void rrdset_done(RRDSET *st) { #ifdef ENABLE_ACLK if (likely(!st->state->is_ar_chart)) { if (unlikely(!rrdset_flag_check(st, RRDSET_FLAG_ACLK))) { - if (likely(st->dimensions && st->counter_done && !queue_chart_to_aclk(st))) + if (likely(st->dimensions && st->counter_done && !queue_chart_to_aclk(st))) { rrdset_flag_set(st, RRDSET_FLAG_ACLK); + } } } #endif @@ -1823,7 +1811,7 @@ after_first_database_work: after_second_database_work: st->last_collected_total = st->collected_total; -#ifdef ENABLE_NEW_CLOUD_PROTOCOL +#if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL) time_t mark = now_realtime_sec(); #endif rrddim_foreach_read(rd, st) { @@ -1831,20 +1819,10 @@ after_second_database_work: continue; #if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL) - if (likely(!st->state->is_ar_chart)) { - if (!rrddim_flag_check(rd, RRDDIM_FLAG_HIDDEN)) { - int 
live = - ((mark - rd->last_collected_time.tv_sec) < RRDSET_MINIMUM_DIM_LIVE_MULTIPLIER * rd->update_every); - if (unlikely(live != rd->state->aclk_live_status)) { - if (likely(rrdset_flag_check(st, RRDSET_FLAG_ACLK))) { - if (likely(!queue_dimension_to_aclk(rd))) { - rd->state->aclk_live_status = live; - rrddim_flag_set(rd, RRDDIM_FLAG_ACLK); - } - } - } + if (likely(!st->state->is_ar_chart)) { + if (!rrddim_flag_check(rd, RRDDIM_FLAG_HIDDEN) && likely(rrdset_flag_check(st, RRDSET_FLAG_ACLK))) + queue_dimension_to_aclk(rd, calc_dimension_liveness(rd, mark)); } - } #endif if(unlikely(!rd->updated)) continue; @@ -1946,7 +1924,7 @@ after_second_database_work: } else { /* Do not delete this dimension */ #if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL) - aclk_send_dimension_update(rd); + queue_dimension_to_aclk(rd, calc_dimension_liveness(rd, mark)); #endif last = rd; rd = rd->next; @@ -1996,7 +1974,7 @@ void rrdset_finalize_labels(RRDSET *st) replace_label_list(labels, new_labels); } - netdata_rwlock_wrlock(&labels->labels_rwlock); + netdata_rwlock_rdlock(&labels->labels_rwlock); struct label *lbl = labels->head; while (lbl) { sql_store_chart_label(st->chart_uuid, (int)lbl->label_source, lbl->key, lbl->value); diff --git a/database/sqlite/sqlite_aclk.c b/database/sqlite/sqlite_aclk.c index 989328097..950856d9a 100644 --- a/database/sqlite/sqlite_aclk.c +++ b/database/sqlite/sqlite_aclk.c @@ -10,6 +10,11 @@ #include "../../aclk/aclk.h" #endif +void sanity_check(void) { + // make sure the compiler will stop on misconfigurations + BUILD_BUG_ON(WORKER_UTILIZATION_MAX_JOB_TYPES < ACLK_MAX_ENUMERATIONS_DEFINED); +} + const char *aclk_sync_config[] = { "CREATE TABLE IF NOT EXISTS dimension_delete (dimension_id blob, dimension_name text, chart_type_id text, " "dim_id blob, chart_id blob, host_id blob, date_created);", @@ -29,6 +34,28 @@ const char *aclk_sync_config[] = { uv_mutex_t aclk_async_lock; struct aclk_database_worker_config *aclk_thread_head = NULL; +int 
retention_running = 0; + +#ifdef ENABLE_NEW_CLOUD_PROTOCOL +static void stop_retention_run() +{ + uv_mutex_lock(&aclk_async_lock); + retention_running = 0; + uv_mutex_unlock(&aclk_async_lock); +} + +static int request_retention_run() +{ + int rc = 0; + uv_mutex_lock(&aclk_async_lock); + if (unlikely(retention_running)) + rc = 1; + else + retention_running = 1; + uv_mutex_unlock(&aclk_async_lock); + return rc; +} +#endif int claimed() { @@ -313,9 +340,6 @@ static void timer_cb(uv_timer_t* handle) if (aclk_use_new_cloud_arch && aclk_connected) { if (wc->rotation_after && wc->rotation_after < now) { - cmd.opcode = ACLK_DATABASE_NODE_INFO; - aclk_database_enq_cmd_noblock(wc, &cmd); - cmd.opcode = ACLK_DATABASE_UPD_RETENTION; if (!aclk_database_enq_cmd_noblock(wc, &cmd)) wc->rotation_after += ACLK_DATABASE_ROTATION_INTERVAL; @@ -339,7 +363,7 @@ static void timer_cb(uv_timer_t* handle) } } - if (wc->alert_updates) { + if (wc->alert_updates && !wc->pause_alert_updates) { cmd.opcode = ACLK_DATABASE_PUSH_ALERT; cmd.count = ACLK_MAX_ALERT_UPDATES; aclk_database_enq_cmd_noblock(wc, &cmd); @@ -348,10 +372,65 @@ static void timer_cb(uv_timer_t* handle) #endif } + +#ifdef ENABLE_NEW_CLOUD_PROTOCOL +void after_send_retention(uv_work_t *req, int status) +{ + struct aclk_database_worker_config *wc = req->data; + (void)status; + stop_retention_run(); + wc->retention_running = 0; + + struct aclk_database_cmd cmd; + memset(&cmd, 0, sizeof(cmd)); + cmd.opcode = ACLK_DATABASE_DIM_DELETION; + if (aclk_database_enq_cmd_noblock(wc, &cmd)) + info("Failed to queue a dimension deletion message"); + + cmd.opcode = ACLK_DATABASE_NODE_INFO; + if (aclk_database_enq_cmd_noblock(wc, &cmd)) + info("Failed to queue a node update info message"); +} + + +static void send_retention(uv_work_t *req) +{ + struct aclk_database_worker_config *wc = req->data; + + if (unlikely(wc->is_shutting_down)) + return; + + aclk_update_retention(wc); +} +#endif + #define MAX_CMD_BATCH_SIZE (256) void 
aclk_database_worker(void *arg) { + worker_register("ACLKSYNC"); + worker_register_job_name(ACLK_DATABASE_NOOP, "noop"); +#ifdef ENABLE_NEW_CLOUD_PROTOCOL + worker_register_job_name(ACLK_DATABASE_ADD_CHART, "chart add"); + worker_register_job_name(ACLK_DATABASE_ADD_DIMENSION, "dimension add"); + worker_register_job_name(ACLK_DATABASE_PUSH_CHART, "chart push"); + worker_register_job_name(ACLK_DATABASE_PUSH_CHART_CONFIG, "chart conf push"); + worker_register_job_name(ACLK_DATABASE_RESET_CHART, "chart reset"); + worker_register_job_name(ACLK_DATABASE_CHART_ACK, "chart ack"); + worker_register_job_name(ACLK_DATABASE_UPD_RETENTION, "retention check"); + worker_register_job_name(ACLK_DATABASE_DIM_DELETION, "dimension delete"); + worker_register_job_name(ACLK_DATABASE_ORPHAN_HOST, "node orphan"); +#endif + worker_register_job_name(ACLK_DATABASE_ALARM_HEALTH_LOG, "alert log"); + worker_register_job_name(ACLK_DATABASE_CLEANUP, "cleanup"); + worker_register_job_name(ACLK_DATABASE_DELETE_HOST, "node delete"); + worker_register_job_name(ACLK_DATABASE_NODE_INFO, "node info"); + worker_register_job_name(ACLK_DATABASE_PUSH_ALERT, "alert push"); + worker_register_job_name(ACLK_DATABASE_PUSH_ALERT_CONFIG, "alert conf push"); + worker_register_job_name(ACLK_DATABASE_PUSH_ALERT_SNAPSHOT, "alert snapshot"); + worker_register_job_name(ACLK_DATABASE_QUEUE_REMOVED_ALERTS, "alerts check"); + worker_register_job_name(ACLK_DATABASE_TIMER, "timer"); + struct aclk_database_worker_config *wc = arg; uv_loop_t *loop; int ret; @@ -401,6 +480,7 @@ void aclk_database_worker(void *arg) memset(&cmd, 0, sizeof(cmd)); #ifdef ENABLE_NEW_CLOUD_PROTOCOL + uv_work_t retention_work; sql_get_last_chart_sequence(wc); wc->chart_payload_count = sql_get_pending_count(wc); if (!wc->chart_payload_count) @@ -412,7 +492,9 @@ void aclk_database_worker(void *arg) wc->rotation_after = wc->startup_time + ACLK_DATABASE_ROTATION_DELAY; debug(D_ACLK_SYNC,"Node %s reports pending message count = %u", wc->node_id, 
wc->chart_payload_count); + while (likely(!netdata_exit)) { + worker_is_idle(); uv_run(loop, UV_RUN_DEFAULT); /* wait for commands */ @@ -427,6 +509,10 @@ void aclk_database_worker(void *arg) opcode = cmd.opcode; ++cmd_batch_size; + + if(likely(opcode != ACLK_DATABASE_NOOP)) + worker_is_busy(opcode); + switch (opcode) { case ACLK_DATABASE_NOOP: /* the command queue was empty, do nothing */ @@ -439,6 +525,7 @@ void aclk_database_worker(void *arg) if (wc->host == localhost) sql_check_aclk_table_list(wc); break; + case ACLK_DATABASE_DELETE_HOST: debug(D_ACLK_SYNC,"Cleaning ACLK tables for %s", (char *) cmd.data); sql_delete_aclk_table_list(wc, cmd); @@ -504,9 +591,21 @@ void aclk_database_worker(void *arg) aclk_process_dimension_deletion(wc, cmd); break; case ACLK_DATABASE_UPD_RETENTION: + if (unlikely(wc->retention_running)) + break; + + if (unlikely(request_retention_run())) { + wc->rotation_after = now_realtime_sec() + ACLK_DATABASE_RETENTION_RETRY; + break; + } + debug(D_ACLK_SYNC,"Sending retention info for %s", wc->uuid_str); - aclk_update_retention(wc, cmd); - aclk_process_dimension_deletion(wc, cmd); + retention_work.data = wc; + wc->retention_running = 1; + if (unlikely(uv_queue_work(loop, &retention_work, send_retention, after_send_retention))) { + wc->retention_running = 0; + stop_retention_run(); + } break; // NODE_INSTANCE DETECTION @@ -535,6 +634,8 @@ void aclk_database_worker(void *arg) cmd.completion = NULL; wc->node_info_send = aclk_database_enq_cmd_noblock(wc, &cmd); } + if (localhost == wc->host) + (void) sqlite3_wal_checkpoint(db_meta, NULL); break; default: debug(D_ACLK_SYNC, "%s: default.", __func__); @@ -577,6 +678,8 @@ void aclk_database_worker(void *arg) wc->host->dbsync_worker = NULL; freez(wc); rrd_unlock(); + + worker_unregister(); return; error_after_timer_init: @@ -585,6 +688,7 @@ error_after_async_init: fatal_assert(0 == uv_loop_close(loop)); error_after_loop_init: freez(loop); + worker_unregister(); } // 
------------------------------------------------------------- @@ -628,7 +732,7 @@ void sql_create_aclk_table(RRDHOST *host, uuid_t *host_uuid, uuid_t *node_id) db_execute(buffer_tostring(sql)); buffer_flush(sql); - buffer_sprintf(sql, TABLE_ACLK_ALERT, uuid_str, uuid_str, uuid_str); + buffer_sprintf(sql, TABLE_ACLK_ALERT, uuid_str); db_execute(buffer_tostring(sql)); buffer_flush(sql); diff --git a/database/sqlite/sqlite_aclk.h b/database/sqlite/sqlite_aclk.h index 894d93489..37e3d4530 100644 --- a/database/sqlite/sqlite_aclk.h +++ b/database/sqlite/sqlite_aclk.h @@ -16,7 +16,8 @@ #endif #define ACLK_MAX_ALERT_UPDATES (5) #define ACLK_DATABASE_CLEANUP_FIRST (60) -#define ACLK_DATABASE_ROTATION_DELAY (60) +#define ACLK_DATABASE_ROTATION_DELAY (180) +#define ACLK_DATABASE_RETENTION_RETRY (60) #define ACLK_DATABASE_CLEANUP_INTERVAL (3600) #define ACLK_DATABASE_ROTATION_INTERVAL (3600) #define ACLK_DELETE_ACK_INTERNAL (600) @@ -103,9 +104,7 @@ static inline char *get_str_from_uuid(uuid_t *uuid) #define TABLE_ACLK_ALERT "CREATE TABLE IF NOT EXISTS aclk_alert_%s (sequence_id INTEGER PRIMARY KEY, " \ "alert_unique_id, date_created, date_submitted, date_cloud_ack, " \ - "unique(alert_unique_id)); " \ - "insert into aclk_alert_%s (alert_unique_id, date_created) " \ - "select unique_id alert_unique_id, strftime('%%s') date_created from health_log_%s where new_status <> 0 and new_status <> -2 order by unique_id asc on conflict (alert_unique_id) do nothing;" + "unique(alert_unique_id));" #define INDEX_ACLK_CHART "CREATE INDEX IF NOT EXISTS aclk_chart_index_%s ON aclk_chart_%s (unique_id);" @@ -135,7 +134,11 @@ enum aclk_database_opcode { ACLK_DATABASE_PUSH_ALERT_CONFIG, ACLK_DATABASE_PUSH_ALERT_SNAPSHOT, ACLK_DATABASE_QUEUE_REMOVED_ALERTS, - ACLK_DATABASE_TIMER + ACLK_DATABASE_TIMER, + + // leave this last + // we need it to check for worker utilization + ACLK_MAX_ENUMERATIONS_DEFINED }; struct aclk_chart_payload_t { @@ -176,6 +179,7 @@ struct aclk_database_worker_config { 
uint64_t alerts_batch_id; // batch id for alerts to use uint64_t alerts_start_seq_id; // cloud has asked to start streaming from uint64_t alert_sequence_id; // last alert sequence_id + int pause_alert_updates; uint32_t chart_payload_count; uint64_t alerts_snapshot_id; //will contain the snapshot_id value if snapshot was requested uint64_t alerts_ack_sequence_id; //last sequence_id ack'ed from cloud via sendsnapshot message @@ -194,6 +198,7 @@ struct aclk_database_worker_config { int node_info_send; int chart_pending; int chart_reset_count; + int retention_running; volatile unsigned is_shutting_down; volatile unsigned is_orphan; struct aclk_database_worker_config *next; diff --git a/database/sqlite/sqlite_aclk_alert.c b/database/sqlite/sqlite_aclk_alert.c index 54e8be4a7..53c6c2a65 100644 --- a/database/sqlite/sqlite_aclk_alert.c +++ b/database/sqlite/sqlite_aclk_alert.c @@ -8,9 +8,120 @@ #include "../../aclk/aclk.h" #endif +time_t removed_when(uint32_t alarm_id, uint32_t before_unique_id, uint32_t after_unique_id, char *uuid_str) { + sqlite3_stmt *res = NULL; + int rc = 0; + time_t when = 0; + char sql[ACLK_SYNC_QUERY_SIZE]; + + snprintfz(sql,ACLK_SYNC_QUERY_SIZE-1, "select when_key from health_log_%s where alarm_id = %u " \ + "and unique_id > %u and unique_id < %u " \ + "and new_status = -2;", uuid_str, alarm_id, after_unique_id, before_unique_id); + + rc = sqlite3_prepare_v2(db_meta, sql, -1, &res, 0); + if (rc != SQLITE_OK) { + error_report("Failed to prepare statement when trying to find removed gap."); + return 0; + } + + rc = sqlite3_step(res); + if (likely(rc == SQLITE_ROW)) { + when = (time_t) sqlite3_column_int64(res, 0); + } + + rc = sqlite3_finalize(res); + if (unlikely(rc != SQLITE_OK)) + error_report("Failed to finalize statement when trying to find removed gap, rc = %d", rc); + + return when; +} + +#define MAX_REMOVED_PERIOD 900 +//decide if some events should be sent or not +int should_send_to_cloud(RRDHOST *host, ALARM_ENTRY *ae) +{ + sqlite3_stmt 
*res = NULL; + char uuid_str[GUID_LEN + 1]; + uuid_unparse_lower_fix(&host->host_uuid, uuid_str); + int send = 1, rc = 0; + + if (ae->new_status == RRDCALC_STATUS_REMOVED || ae->new_status == RRDCALC_STATUS_UNINITIALIZED) { + return 0; + } + + if (unlikely(uuid_is_null(ae->config_hash_id))) + return 0; + + char sql[ACLK_SYNC_QUERY_SIZE]; + uuid_t config_hash_id; + RRDCALC_STATUS status; + uint32_t unique_id; + + //get the previous sent event of this alarm_id + snprintfz(sql,ACLK_SYNC_QUERY_SIZE-1, "select hl.new_status, hl.config_hash_id, hl.unique_id from health_log_%s hl, aclk_alert_%s aa \ + where hl.unique_id = aa.alert_unique_id \ + and hl.alarm_id = %u and hl.unique_id <> %u \ + order by alarm_event_id desc LIMIT 1;", uuid_str, uuid_str, ae->alarm_id, ae->unique_id); + + rc = sqlite3_prepare_v2(db_meta, sql, -1, &res, 0); + if (rc != SQLITE_OK) { + error_report("Failed to prepare statement when trying to filter alert events."); + send = 1; + return send; + } + + rc = sqlite3_step(res); + if (likely(rc == SQLITE_ROW)) { + status = (RRDCALC_STATUS) sqlite3_column_int(res, 0); + if (sqlite3_column_type(res, 1) != SQLITE_NULL) + uuid_copy(config_hash_id, *((uuid_t *) sqlite3_column_blob(res, 1))); + unique_id = (uint32_t) sqlite3_column_int64(res, 2); + + } else { + send = 1; + goto done; + } + + if (ae->new_status != (RRDCALC_STATUS)status) { + send = 1; + goto done; + } + + if (uuid_compare(ae->config_hash_id, config_hash_id)) { + send = 1; + goto done; + } + + //same status, same config + if (ae->new_status == RRDCALC_STATUS_CLEAR) { + send = 0; + goto done; + } + + //detect a long off period of the agent, TODO make global + if (ae->new_status == RRDCALC_STATUS_WARNING || ae->new_status == RRDCALC_STATUS_CRITICAL) { + time_t when = removed_when(ae->alarm_id, ae->unique_id, unique_id, uuid_str); + + if (when && (when + (time_t)MAX_REMOVED_PERIOD) < ae->when) { + send = 1; + goto done; + } else { + send = 0; + goto done; + } + } + +done: + rc = 
sqlite3_finalize(res); + if (unlikely(rc != SQLITE_OK)) + error_report("Failed to finalize statement when trying to filter alert events, rc = %d", rc); + + return send; +} + // will replace call to aclk_update_alarm in health/health_log.c // and handle both cases -int sql_queue_alarm_to_aclk(RRDHOST *host, ALARM_ENTRY *ae) +int sql_queue_alarm_to_aclk(RRDHOST *host, ALARM_ENTRY *ae, int skip_filter) { //check aclk architecture and handle old json alarm update to cloud //include also the valid statuses for this case @@ -30,17 +141,15 @@ int sql_queue_alarm_to_aclk(RRDHOST *host, ALARM_ENTRY *ae) if (!claimed()) return 0; - if (ae->flags & HEALTH_ENTRY_FLAG_ACLK_QUEUED) + if (ae->flags & HEALTH_ENTRY_FLAG_ACLK_QUEUED) { return 0; + } - if (ae->new_status == RRDCALC_STATUS_REMOVED || ae->new_status == RRDCALC_STATUS_UNINITIALIZED) - return 0; - - if (unlikely(!host->dbsync_worker)) - return 1; - - if (unlikely(uuid_is_null(ae->config_hash_id))) - return 0; + if (!skip_filter) { + if (!should_send_to_cloud(host, ae)) { + return 0; + } + } int rc = 0; @@ -76,6 +185,10 @@ int sql_queue_alarm_to_aclk(RRDHOST *host, ALARM_ENTRY *ae) } ae->flags |= HEALTH_ENTRY_FLAG_ACLK_QUEUED; + struct aclk_database_worker_config *wc = (struct aclk_database_worker_config *)host->dbsync_worker; + if (wc) { + wc->pause_alert_updates = 0; + } bind_fail: if (unlikely(sqlite3_finalize(res_alert) != SQLITE_OK)) @@ -86,6 +199,7 @@ bind_fail: #else UNUSED(host); UNUSED(ae); + UNUSED(skip_filter); #endif return 0; } @@ -283,6 +397,7 @@ void aclk_push_alert_event(struct aclk_database_worker_config *wc, struct aclk_d wc->alerts_batch_id); log_first_sequence_id = 0; log_last_sequence_id = 0; + wc->pause_alert_updates = 1; } rc = sqlite3_finalize(res); @@ -296,6 +411,27 @@ void aclk_push_alert_event(struct aclk_database_worker_config *wc, struct aclk_d return; } +void sql_queue_existing_alerts_to_aclk(RRDHOST *host) +{ + char uuid_str[GUID_LEN + 1]; + uuid_unparse_lower_fix(&host->host_uuid, 
uuid_str); + BUFFER *sql = buffer_create(1024); + + buffer_sprintf(sql,"insert into aclk_alert_%s (alert_unique_id, date_created) " \ + "select unique_id alert_unique_id, strftime('%%s') date_created from health_log_%s " \ + "where new_status <> 0 and new_status <> -2 and config_hash_id is not null and updated_by_id = 0 " \ + "order by unique_id asc on conflict (alert_unique_id) do nothing;", uuid_str, uuid_str); + + db_execute(buffer_tostring(sql)); + + buffer_free(sql); + + struct aclk_database_worker_config *wc = (struct aclk_database_worker_config *)host->dbsync_worker; + if (wc) { + wc->pause_alert_updates = 0; + } +} + void aclk_send_alarm_health_log(char *node_id) { if (unlikely(!node_id)) @@ -421,6 +557,8 @@ void aclk_push_alarm_health_log(struct aclk_database_worker_config *wc, struct a freez(claim_id); buffer_free(sql); + + aclk_alert_reloaded = 1; #endif return; @@ -593,6 +731,9 @@ void aclk_start_alert_streaming(char *node_id, uint64_t batch_id, uint64_t start log_access("ACLK STA [%s (N/A)]: Ignoring request to stream alert state changes, health is disabled.", node_id); return; } + + if (unlikely(batch_id == 1) && unlikely(start_seq_id == 1)) + sql_queue_existing_alerts_to_aclk(host); } else wc = (struct aclk_database_worker_config *)find_inactive_wc_by_node_id(node_id); @@ -602,6 +743,7 @@ void aclk_start_alert_streaming(char *node_id, uint64_t batch_id, uint64_t start wc->alerts_batch_id = batch_id; wc->alerts_start_seq_id = start_seq_id; wc->alert_updates = 1; + wc->pause_alert_updates = 0; __sync_synchronize(); } else @@ -631,9 +773,11 @@ void sql_process_queue_removed_alerts_to_aclk(struct aclk_database_worker_config db_execute(buffer_tostring(sql)); - log_access("ACLK STA [%s (%s)]: Queued removed alerts.", wc->node_id, wc->host ? wc->host->hostname : "N/A"); + log_access("ACLK STA [%s (%s)]: QUEUED REMOVED ALERTS", wc->node_id, wc->host ? 
wc->host->hostname : "N/A"); buffer_free(sql); + + wc->pause_alert_updates = 0; #endif return; } @@ -644,6 +788,9 @@ void sql_queue_removed_alerts_to_aclk(RRDHOST *host) if (unlikely(!host->dbsync_worker)) return; + if (!claimed()) + return; + struct aclk_database_cmd cmd; memset(&cmd, 0, sizeof(cmd)); cmd.opcode = ACLK_DATABASE_QUEUE_REMOVED_ALERTS; @@ -912,9 +1059,6 @@ void sql_aclk_alert_clean_dead_entries(RRDHOST *host) if (!claimed()) return; - if (unlikely(!host->dbsync_worker)) - return; - char uuid_str[GUID_LEN + 1]; uuid_unparse_lower_fix(&host->host_uuid, uuid_str); diff --git a/database/sqlite/sqlite_aclk_alert.h b/database/sqlite/sqlite_aclk_alert.h index 957cb94ac..0181b4842 100644 --- a/database/sqlite/sqlite_aclk_alert.h +++ b/database/sqlite/sqlite_aclk_alert.h @@ -26,5 +26,6 @@ void sql_process_queue_removed_alerts_to_aclk(struct aclk_database_worker_config void aclk_push_alert_snapshot_event(struct aclk_database_worker_config *wc, struct aclk_database_cmd cmd); void aclk_process_send_alarm_snapshot(char *node_id, char *claim_id, uint64_t snapshot_id, uint64_t sequence_id); int get_proto_alert_status(RRDHOST *host, struct proto_alert_status *proto_alert_status); +extern int sql_queue_alarm_to_aclk(RRDHOST *host, ALARM_ENTRY *ae, int skip_filter); #endif //NETDATA_SQLITE_ACLK_ALERT_H diff --git a/database/sqlite/sqlite_aclk_chart.c b/database/sqlite/sqlite_aclk_chart.c index 7afa1d451..a9db5282a 100644 --- a/database/sqlite/sqlite_aclk_chart.c +++ b/database/sqlite/sqlite_aclk_chart.c @@ -22,20 +22,20 @@ sql_queue_chart_payload(struct aclk_database_worker_config *wc, void *data, enum return rc; } -static int payload_sent(char *uuid_str, uuid_t *uuid, void *payload, size_t payload_size) +static time_t payload_sent(char *uuid_str, uuid_t *uuid, void *payload, size_t payload_size) { static __thread sqlite3_stmt *res = NULL; int rc; - int send_status = 0; + time_t send_status = 0; if (unlikely(!res)) { char sql[ACLK_SYNC_QUERY_SIZE]; - 
snprintfz(sql,ACLK_SYNC_QUERY_SIZE-1, "SELECT 1 FROM aclk_chart_latest_%s acl, aclk_chart_payload_%s acp " - "WHERE acl.unique_id = acp.unique_id AND acl.uuid = @uuid AND acp.payload = @payload;", - uuid_str, uuid_str); + snprintfz(sql,ACLK_SYNC_QUERY_SIZE-1, "SELECT acl.date_submitted FROM aclk_chart_latest_%s acl, aclk_chart_payload_%s acp " + "WHERE acl.unique_id = acp.unique_id AND acl.uuid = @uuid AND acp.payload = @payload;", + uuid_str, uuid_str); rc = prepare_statement(db_meta, sql, &res); if (rc != SQLITE_OK) { - error_report("Failed to prepare statement to check payload data"); + error_report("Failed to prepare statement to check payload data on %s", sql); return 0; } } @@ -49,7 +49,7 @@ static int payload_sent(char *uuid_str, uuid_t *uuid, void *payload, size_t payl goto bind_fail; while (sqlite3_step(res) == SQLITE_ROW) { - send_status = sqlite3_column_int(res, 0); + send_status = (time_t) sqlite3_column_int64(res, 0); } bind_fail: @@ -58,23 +58,36 @@ bind_fail: return send_status; } -static int aclk_add_chart_payload(struct aclk_database_worker_config *wc, uuid_t *uuid, char *claim_id, - ACLK_PAYLOAD_TYPE payload_type, void *payload, size_t payload_size, int *send_status) +static int aclk_add_chart_payload( + struct aclk_database_worker_config *wc, + uuid_t *uuid, + char *claim_id, + ACLK_PAYLOAD_TYPE payload_type, + void *payload, + size_t payload_size, + time_t *send_status, + int check_sent) { static __thread sqlite3_stmt *res_chart = NULL; int rc; + time_t date_submitted; - rc = payload_sent(wc->uuid_str, uuid, payload, payload_size); - if (send_status) - *send_status = rc; - if (rc == 1) + if (unlikely(!payload)) return 0; + if (check_sent) { + date_submitted = payload_sent(wc->uuid_str, uuid, payload, payload_size); + if (send_status) + *send_status = date_submitted; + if (date_submitted) + return 0; + } + if (unlikely(!res_chart)) { char sql[ACLK_SYNC_QUERY_SIZE]; snprintfz(sql,ACLK_SYNC_QUERY_SIZE-1, - "INSERT INTO aclk_chart_payload_%s 
(unique_id, uuid, claim_id, date_created, type, payload) " \ - "VALUES (@unique_id, @uuid, @claim_id, strftime('%%s','now'), @type, @payload);", wc->uuid_str); + "INSERT INTO aclk_chart_payload_%s (unique_id, uuid, claim_id, date_created, type, payload) " \ + "VALUES (@unique_id, @uuid, @claim_id, strftime('%%s','now'), @type, @payload);", wc->uuid_str); rc = prepare_statement(db_meta, sql, &res_chart); if (rc != SQLITE_OK) { error_report("Failed to prepare statement to store chart payload data"); @@ -146,7 +159,7 @@ int aclk_add_chart_event(struct aclk_database_worker_config *wc, struct aclk_dat chart_payload.id = strdupz(st->id); struct label_index *labels = &st->state->labels; - netdata_rwlock_wrlock(&labels->labels_rwlock); + netdata_rwlock_rdlock(&labels->labels_rwlock); struct label *label_list = labels->head; struct label *chart_label = NULL; while (label_list) { @@ -159,7 +172,7 @@ int aclk_add_chart_event(struct aclk_database_worker_config *wc, struct aclk_dat size_t size; char *payload = generate_chart_instance_updated(&size, &chart_payload); if (likely(payload)) - rc = aclk_add_chart_payload(wc, st->chart_uuid, claim_id, ACLK_PAYLOAD_CHART, (void *) payload, size, NULL); + rc = aclk_add_chart_payload(wc, st->chart_uuid, claim_id, ACLK_PAYLOAD_CHART, (void *) payload, size, NULL, 1); freez(payload); chart_instance_updated_destroy(&chart_payload); } @@ -168,7 +181,7 @@ int aclk_add_chart_event(struct aclk_database_worker_config *wc, struct aclk_dat static inline int aclk_upd_dimension_event(struct aclk_database_worker_config *wc, char *claim_id, uuid_t *dim_uuid, const char *dim_id, const char *dim_name, const char *chart_type_id, time_t first_time, time_t last_time, - int *send_status) + time_t *send_status) { int rc = 0; size_t size; @@ -197,7 +210,7 @@ static inline int aclk_upd_dimension_event(struct aclk_database_worker_config *w dim_payload.last_timestamp.tv_sec = last_time; char *payload = generate_chart_dimension_updated(&size, &dim_payload); if 
(likely(payload)) - rc = aclk_add_chart_payload(wc, dim_uuid, claim_id, ACLK_PAYLOAD_DIMENSION, (void *)payload, size, send_status); + rc = aclk_add_chart_payload(wc, dim_uuid, claim_id, ACLK_PAYLOAD_DIMENSION, (void *)payload, size, send_status, 1); freez(payload); return rc; } @@ -271,39 +284,22 @@ bind_fail: int aclk_add_dimension_event(struct aclk_database_worker_config *wc, struct aclk_database_cmd cmd) { - int rc = 0; + int rc = 1; CHECK_SQLITE_CONNECTION(db_meta); - char *claim_id = is_agent_claimed(); - - RRDDIM *rd = cmd.data; - - if (likely(claim_id)) { - int send_status = 0; - time_t now = now_realtime_sec(); - - time_t first_t = rd->state->query_ops.oldest_time(rd); - time_t last_t = rd->state->query_ops.latest_time(rd); - - int live = ((now - last_t) < (RRDSET_MINIMUM_DIM_LIVE_MULTIPLIER * rd->update_every)); + struct aclk_chart_dimension_data *aclk_cd_data = cmd.data; - rc = aclk_upd_dimension_event( - wc, - claim_id, - &rd->state->metric_uuid, - rd->id, - rd->name, - rd->rrdset->id, - first_t, - live ? 
0 : last_t, - &send_status); + char *claim_id = is_agent_claimed(); + if (!claim_id) + goto cleanup; - if (!send_status) - rd->state->aclk_live_status = live; + rc = aclk_add_chart_payload(wc, &aclk_cd_data->uuid, claim_id, ACLK_PAYLOAD_DIMENSION, + (void *) aclk_cd_data->payload, aclk_cd_data->payload_size, NULL, aclk_cd_data->check_payload); - freez(claim_id); - } - rrddim_flag_clear(rd, RRDDIM_FLAG_ACLK); + freez(claim_id); +cleanup: + freez(aclk_cd_data->payload); + freez(aclk_cd_data); return rc; } @@ -337,6 +333,12 @@ void aclk_send_chart_event(struct aclk_database_worker_config *wc, struct aclk_d char sql[ACLK_SYNC_QUERY_SIZE]; static __thread sqlite3_stmt *res = NULL; + char *hostname = NULL; + if (wc->host) + hostname = strdupz(wc->host->hostname); + else + hostname = get_hostname_by_node_id(wc->node_id); + if (unlikely(!res)) { snprintfz(sql,ACLK_SYNC_QUERY_SIZE-1,"SELECT ac.sequence_id, acp.payload, ac.date_created, ac.type, ac.uuid " \ "FROM aclk_chart_%s ac, aclk_chart_payload_%s acp " \ @@ -346,6 +348,7 @@ void aclk_send_chart_event(struct aclk_database_worker_config *wc, struct aclk_d if (rc != SQLITE_OK) { error_report("Failed to prepare statement when trying to send a chart update via ACLK"); freez(claim_id); + freez(hostname); return; } } @@ -419,7 +422,7 @@ void aclk_send_chart_event(struct aclk_database_worker_config *wc, struct aclk_d log_access( "ACLK RES [%s (%s)]: CHARTS SENT from %" PRIu64 " to %" PRIu64 " batch=%" PRIu64, wc->node_id, - wc->host ? wc->host->hostname : "N/A", + hostname ? hostname : "N/A", first_sequence, last_sequence, wc->batch_id); @@ -440,7 +443,7 @@ void aclk_send_chart_event(struct aclk_database_worker_config *wc, struct aclk_d log_access( "ACLK STA [%s (%s)]: Sync of charts and dimensions done in %ld seconds.", wc->node_id, - wc->host ? wc->host->hostname : "N/A", + hostname ? 
hostname : "N/A", now_realtime_sec() - wc->startup_time); } @@ -459,6 +462,7 @@ bind_fail: error_report("Failed to reset statement when pushing chart events, rc = %d", rc); freez(claim_id); + freez(hostname); return; } @@ -562,7 +566,7 @@ void aclk_receive_chart_ack(struct aclk_database_worker_config *wc, struct aclk_ error_report("Failed to ACK sequence id, rc = %d", rc); else log_access( - "ACLK STA [%s (%s)]: CHARTS ACKNOWLEDGED in the database upto %" PRIu64, + "ACLK STA [%s (%s)]: CHARTS ACKNOWLEDGED IN THE DATABASE UP TO %" PRIu64, wc->node_id, wc->host ? wc->host->hostname : "N/A", cmd.param1); @@ -583,8 +587,13 @@ void aclk_receive_chart_reset(struct aclk_database_worker_config *wc, struct acl cmd.param1); db_execute(buffer_tostring(sql)); if (cmd.param1 == 1) { + char *hostname = NULL; + if (wc->host) + hostname = strdupz(wc->host->hostname); + else + hostname = get_hostname_by_node_id(wc->node_id); buffer_flush(sql); - log_access("ACLK REQ [%s (%s)]: Received chart full resync.", wc->node_id, wc->host ? wc->host->hostname : "N/A"); + log_access("ACLK REQ [%s (%s)]: Received chart full resync.", wc->node_id, hostname? 
hostname : "N/A"); buffer_sprintf(sql, "DELETE FROM aclk_chart_payload_%s; DELETE FROM aclk_chart_%s; " \ "DELETE FROM aclk_chart_latest_%s;", wc->uuid_str, wc->uuid_str, wc->uuid_str); db_lock(); @@ -609,6 +618,7 @@ void aclk_receive_chart_reset(struct aclk_database_worker_config *wc, struct acl RRDDIM *rd; rrddim_foreach_read(rd, st) { + rrddim_flag_clear(rd, RRDDIM_FLAG_ACLK); rd->state->aclk_live_status = (rd->state->aclk_live_status == 0); } rrdset_unlock(st); @@ -616,9 +626,10 @@ void aclk_receive_chart_reset(struct aclk_database_worker_config *wc, struct acl rrdhost_unlock(host); } else error_report("ACLK synchronization thread for %s is not linked to HOST", wc->host_guid); + freez(hostname); } else { log_access( - "ACLK STA [%s (%s)]: Restarting chart sync from sequence %" PRIu64, + "ACLK STA [%s (%s)]: RESTARTING CHART SYNC FROM SEQUENCE %" PRIu64, wc->node_id, wc->host ? wc->host->hostname : "N/A", cmd.param1); @@ -705,25 +716,28 @@ void aclk_start_streaming(char *node_id, uint64_t sequence_id, time_t created_at if (unlikely(!node_id)) return; - // log_access("ACLK REQ [%s (N/A)]: CHARTS STREAM from %"PRIu64" t=%ld batch=%"PRIu64, node_id, - // sequence_id, created_at, batch_id); - uuid_t node_uuid; if (uuid_parse(node_id, node_uuid)) { log_access("ACLK REQ [%s (N/A)]: CHARTS STREAM ignored, invalid node id", node_id); return; } - struct aclk_database_worker_config *wc = NULL; + struct aclk_database_worker_config *wc = find_inactive_wc_by_node_id(node_id); rrd_rdlock(); RRDHOST *host = localhost; while(host) { - if (host->node_id && !(uuid_compare(*host->node_id, node_uuid))) { + if (wc || (host->node_id && !(uuid_compare(*host->node_id, node_uuid)))) { rrd_unlock(); - wc = (struct aclk_database_worker_config *)host->dbsync_worker ? - (struct aclk_database_worker_config *)host->dbsync_worker : - (struct aclk_database_worker_config *)find_inactive_wc_by_node_id(node_id); + if (!wc) + wc = (struct aclk_database_worker_config *)host->dbsync_worker ? 
+ (struct aclk_database_worker_config *)host->dbsync_worker : + (struct aclk_database_worker_config *)find_inactive_wc_by_node_id(node_id); + char *hostname = NULL; if (likely(wc)) { + if (wc->host) + hostname = strdupz(wc->host->hostname); + else + hostname = get_hostname_by_node_id(node_id); wc->chart_reset_count++; __sync_synchronize(); wc->chart_updates = 0; @@ -731,9 +745,10 @@ void aclk_start_streaming(char *node_id, uint64_t sequence_id, time_t created_at __sync_synchronize(); wc->batch_created = now_realtime_sec(); log_access( - "ACLK REQ [%s (%s)]: CHARTS STREAM from %" PRIu64 " t=%ld resets=%d", + "ACLK REQ [%s (%s)]: CHARTS STREAM from %"PRIu64" (LOCAL %"PRIu64") t=%ld resets=%d" , wc->node_id, - wc->host ? wc->host->hostname : "N/A", + hostname ? hostname : "N/A", + sequence_id + 1, wc->chart_sequence_id, wc->chart_timestamp, wc->chart_reset_count); @@ -742,7 +757,7 @@ void aclk_start_streaming(char *node_id, uint64_t sequence_id, time_t created_at "ACLK RES [%s (%s)]: CHARTS FULL RESYNC REQUEST " "remote_seq=%" PRIu64 " local_seq=%" PRIu64 " resets=%d ", wc->node_id, - wc->host ? wc->host->hostname : "N/A", + hostname ? hostname : "N/A", sequence_id, wc->chart_sequence_id, wc->chart_reset_count); @@ -756,7 +771,6 @@ void aclk_start_streaming(char *node_id, uint64_t sequence_id, time_t created_at freez(chart_reset.claim_id); wc->chart_reset_count = -1; } - return; } else { struct aclk_database_cmd cmd; memset(&cmd, 0, sizeof(cmd)); @@ -766,8 +780,8 @@ void aclk_start_streaming(char *node_id, uint64_t sequence_id, time_t created_at log_access( "ACLK REQ [%s (%s)]: CHART RESET from %" PRIu64 " t=%ld batch=%" PRIu64, wc->node_id, - wc->host ? wc->host->hostname : "N/A", - wc->chart_sequence_id, + hostname ? 
hostname : "N/A", + sequence_id + 1, wc->chart_timestamp, wc->batch_id); cmd.opcode = ACLK_DATABASE_RESET_CHART; @@ -775,20 +789,15 @@ void aclk_start_streaming(char *node_id, uint64_t sequence_id, time_t created_at cmd.completion = NULL; aclk_database_enq_cmd(wc, &cmd); } else { -// log_access( -// "ACLK RES [%s (%s)]: CHARTS STREAM from %" PRIu64 -// " t=%ld resets=%d", -// wc->node_id, -// wc->host ? wc->host->hostname : "N/A", -// wc->chart_sequence_id, -// wc->chart_timestamp, -// wc->chart_reset_count); wc->chart_reset_count = 0; wc->chart_updates = 1; } } - } else - log_access("ACLK STA [%s (N/A)]: ACLK synchronization thread is not active.", node_id); + } else { + hostname = get_hostname_by_node_id(node_id); + log_access("ACLK STA [%s (%s)]: ACLK synchronization thread is not active.", node_id, hostname ? hostname : "N/A"); + } + freez(hostname); return; } host = host->next; @@ -838,9 +847,8 @@ failed: "SELECT distinct h.host_id, c.update_every, c.type||'.'||c.id FROM chart c, host h " \ "WHERE c.host_id = h.host_id AND c.host_id = @host_id ORDER BY c.update_every ASC;" -void aclk_update_retention(struct aclk_database_worker_config *wc, struct aclk_database_cmd cmd) +void aclk_update_retention(struct aclk_database_worker_config *wc) { - UNUSED(cmd); int rc; if (!aclk_use_new_cloud_arch || !aclk_connected) @@ -887,7 +895,10 @@ void aclk_update_retention(struct aclk_database_worker_config *wc, struct aclk_d time_t last_entry_t; uint32_t update_every = 0; uint32_t dimension_update_count = 0; - int send_status; + uint32_t total_checked = 0; + uint32_t total_deleted= 0; + uint32_t total_stopped= 0; + time_t send_status; struct retention_updated rotate_data; @@ -904,7 +915,9 @@ void aclk_update_retention(struct aclk_database_worker_config *wc, struct aclk_d rotate_data.node_id = strdupz(wc->node_id); time_t now = now_realtime_sec(); - while (sqlite3_step(res) == SQLITE_ROW) { + while (sqlite3_step(res) == SQLITE_ROW && dimension_update_count < 
ACLK_MAX_DIMENSION_CLEANUP) { + if (unlikely(netdata_exit)) + break; if (!update_every || update_every != (uint32_t)sqlite3_column_int(res, 1)) { if (update_every) { debug(D_ACLK_SYNC, "Update %s for %u oldest time = %ld", wc->host_guid, update_every, start_time); @@ -942,23 +955,40 @@ void aclk_update_retention(struct aclk_database_worker_config *wc, struct aclk_d if (likely(!rc && first_entry_t)) start_time = MIN(start_time, first_entry_t); - if (memory_mode == RRD_MEMORY_MODE_DBENGINE && wc->chart_updates) { + if (memory_mode == RRD_MEMORY_MODE_DBENGINE && wc->chart_updates && (dimension_update_count < ACLK_MAX_DIMENSION_CLEANUP)) { int live = ((now - last_entry_t) < (RRDSET_MINIMUM_DIM_LIVE_MULTIPLIER * update_every)); - if ((!live || !first_entry_t) && (dimension_update_count < ACLK_MAX_DIMENSION_CLEANUP)) { - (void)aclk_upd_dimension_event( - wc, - claim_id, - (uuid_t *)sqlite3_column_blob(res, 0), - (const char *)(const char *)sqlite3_column_text(res, 3), - (const char *)(const char *)sqlite3_column_text(res, 4), - (const char *)(const char *)sqlite3_column_text(res, 2), - first_entry_t, - live ? 0 : last_entry_t, - &send_status); - if (!send_status) + if (rc) { + first_entry_t = 0; + last_entry_t = 0; + live = 0; + } + if (!wc->host || !first_entry_t) { + if (!first_entry_t) { + delete_dimension_uuid((uuid_t *)sqlite3_column_blob(res, 0)); + total_deleted++; dimension_update_count++; + } + else { + (void)aclk_upd_dimension_event( + wc, + claim_id, + (uuid_t *)sqlite3_column_blob(res, 0), + (const char *)(const char *)sqlite3_column_text(res, 3), + (const char *)(const char *)sqlite3_column_text(res, 4), + (const char *)(const char *)sqlite3_column_text(res, 2), + first_entry_t, + live ? 
0 : last_entry_t, + &send_status); + + if (!send_status) { + if (last_entry_t) + total_stopped++; + dimension_update_count++; + } + } } } + total_checked++; } if (update_every) { debug(D_ACLK_SYNC, "Update %s for %u oldest time = %ld", wc->host_guid, update_every, start_time); @@ -970,7 +1000,20 @@ void aclk_update_retention(struct aclk_database_worker_config *wc, struct aclk_d rotate_data.interval_duration_count++; } + char *hostname = NULL; + if (!wc->host) + hostname = get_hostname_by_node_id(wc->node_id); + + if (dimension_update_count < ACLK_MAX_DIMENSION_CLEANUP && !netdata_exit) + log_access("ACLK STA [%s (%s)]: UPDATES %d RETENTION MESSAGE SENT. CHECKED %u DIMENSIONS. %u DELETED, %u STOPPED COLLECTING", + wc->node_id, wc->host ? wc->host->hostname : hostname ? hostname : "N/A", wc->chart_updates, total_checked, total_deleted, total_stopped); + else + log_access("ACLK STA [%s (%s)]: UPDATES %d RETENTION MESSAGE NOT SENT. CHECKED %u DIMENSIONS. %u DELETED, %u STOPPED COLLECTING", + wc->node_id, wc->host ? wc->host->hostname : hostname ? 
hostname : "N/A", wc->chart_updates, total_checked, total_deleted, total_stopped); + freez(hostname); + #ifdef NETDATA_INTERNAL_CHECKS + info("Retention update for %s (chart updates = %d)", wc->host_guid, wc->chart_updates); for (int i = 0; i < rotate_data.interval_duration_count; ++i) info( "Update for host %s (node %s) for %u Retention = %u", @@ -979,7 +1022,8 @@ void aclk_update_retention(struct aclk_database_worker_config *wc, struct aclk_d rotate_data.interval_durations[i].update_every, rotate_data.interval_durations[i].retention); #endif - aclk_retention_updated(&rotate_data); + if (dimension_update_count < ACLK_MAX_DIMENSION_CLEANUP && !netdata_exit) + aclk_retention_updated(&rotate_data); freez(rotate_data.node_id); freez(rotate_data.interval_durations); @@ -1048,11 +1092,64 @@ void sql_get_last_chart_sequence(struct aclk_database_worker_config *wc) return; } -int queue_dimension_to_aclk(RRDDIM *rd) +void queue_dimension_to_aclk(RRDDIM *rd, time_t last_updated) { - int rc = sql_queue_chart_payload((struct aclk_database_worker_config *) rd->rrdset->rrdhost->dbsync_worker, - rd, ACLK_DATABASE_ADD_DIMENSION); - return rc; + int live = !last_updated; + + if (likely(rd->state->aclk_live_status == live)) + return; + + time_t created_at = rd->state->query_ops.oldest_time(rd); + + if (unlikely(!created_at && rd->updated)) + created_at = rd->last_collected_time.tv_sec; + + rd->state->aclk_live_status = live; + + struct aclk_database_worker_config *wc = rd->rrdset->rrdhost->dbsync_worker; + if (unlikely(!wc)) + return; + + char *claim_id = is_agent_claimed(); + if (unlikely(!claim_id)) + return; + + struct chart_dimension_updated dim_payload; + memset(&dim_payload, 0, sizeof(dim_payload)); + dim_payload.node_id = wc->node_id; + dim_payload.claim_id = claim_id; + dim_payload.name = rd->name; + dim_payload.id = rd->id; + dim_payload.chart_id = rd->rrdset->id; + dim_payload.created_at.tv_sec = created_at; + dim_payload.last_timestamp.tv_sec = last_updated; + + size_t 
size = 0; + char *payload = generate_chart_dimension_updated(&size, &dim_payload); + + freez(claim_id); + if (unlikely(!payload)) + return; + + struct aclk_chart_dimension_data *aclk_cd_data = mallocz(sizeof(*aclk_cd_data)); + uuid_copy(aclk_cd_data->uuid, rd->state->metric_uuid); + aclk_cd_data->payload = payload; + aclk_cd_data->payload_size = size; + aclk_cd_data->check_payload = 1; + + struct aclk_database_cmd cmd; + memset(&cmd, 0, sizeof(cmd)); + + cmd.opcode = ACLK_DATABASE_ADD_DIMENSION; + cmd.data = aclk_cd_data; + int rc = aclk_database_enq_cmd_noblock(wc, &cmd); + + if (unlikely(rc)) { + freez(aclk_cd_data->payload); + freez(aclk_cd_data); + rd->state->aclk_live_status = !live; + } + return; } void aclk_send_dimension_update(RRDDIM *rd) @@ -1203,6 +1300,12 @@ void sql_check_chart_liveness(RRDSET *st) { return; rrdset_rdlock(st); + + if (unlikely(!rrdset_flag_check(st, RRDSET_FLAG_ACLK))) { + rrdset_unlock(st); + return; + } + if (unlikely(!rrdset_flag_check(st, RRDSET_FLAG_ACLK))) { if (likely(st->dimensions && st->counter_done && !queue_chart_to_aclk(st))) { debug(D_ACLK_SYNC,"Check chart liveness [%s] submit chart definition", st->name); @@ -1215,20 +1318,8 @@ void sql_check_chart_liveness(RRDSET *st) { debug(D_ACLK_SYNC,"Check chart liveness [%s] scanning dimensions", st->name); rrddim_foreach_read(rd, st) { - if (!rrddim_flag_check(rd, RRDDIM_FLAG_HIDDEN)) { - int live = (mark - rd->last_collected_time.tv_sec) < RRDSET_MINIMUM_DIM_LIVE_MULTIPLIER * rd->update_every; - if (unlikely(live != rd->state->aclk_live_status)) { - if (likely(rrdset_flag_check(st, RRDSET_FLAG_ACLK))) { - if (likely(!queue_dimension_to_aclk(rd))) { - debug(D_ACLK_SYNC,"Dimension change [%s] on [%s] from live %d --> %d", rd->id, rd->rrdset->name, rd->state->aclk_live_status, live); - rd->state->aclk_live_status = live; - rrddim_flag_set(rd, RRDDIM_FLAG_ACLK); - } - } - } - else - debug(D_ACLK_SYNC,"Dimension check [%s] on [%s] liveness matches", rd->id, st->name); - } + if 
(!rrddim_flag_check(rd, RRDDIM_FLAG_HIDDEN)) + queue_dimension_to_aclk(rd, calc_dimension_liveness(rd, mark)); } rrdset_unlock(st); } diff --git a/database/sqlite/sqlite_aclk_chart.h b/database/sqlite/sqlite_aclk_chart.h index 1d25de24e..84325bf6c 100644 --- a/database/sqlite/sqlite_aclk_chart.h +++ b/database/sqlite/sqlite_aclk_chart.h @@ -16,10 +16,21 @@ extern sqlite3 *db_meta; #define RRDSET_MINIMUM_DIM_LIVE_MULTIPLIER (3) #endif +#ifndef RRDSET_MINIMUM_DIM_OFFLINE_MULTIPLIER +#define RRDSET_MINIMUM_DIM_OFFLINE_MULTIPLIER (30) +#endif + #ifndef ACLK_MAX_DIMENSION_CLEANUP #define ACLK_MAX_DIMENSION_CLEANUP (500) #endif +struct aclk_chart_dimension_data { + uuid_t uuid; + char *payload; + size_t payload_size; + uint8_t check_payload; +}; + struct aclk_chart_sync_stats { int updates; uint64_t batch_id; @@ -37,9 +48,8 @@ struct aclk_chart_sync_stats { }; extern int queue_chart_to_aclk(RRDSET *st); -extern int queue_dimension_to_aclk(RRDDIM *rd); +extern void queue_dimension_to_aclk(RRDDIM *rd, time_t last_updated); extern void sql_create_aclk_table(RRDHOST *host, uuid_t *host_uuid, uuid_t *node_id); -extern int sql_queue_alarm_to_aclk(RRDHOST *host, ALARM_ENTRY *ae); int aclk_add_chart_event(struct aclk_database_worker_config *wc, struct aclk_database_cmd cmd); int aclk_add_dimension_event(struct aclk_database_worker_config *wc, struct aclk_database_cmd cmd); int aclk_send_chart_config(struct aclk_database_worker_config *wc, struct aclk_database_cmd cmd); @@ -57,4 +67,5 @@ uint32_t sql_get_pending_count(struct aclk_database_worker_config *wc); void aclk_send_dimension_update(RRDDIM *rd); struct aclk_chart_sync_stats *aclk_get_chart_sync_stats(RRDHOST *host); void sql_check_chart_liveness(RRDSET *st); +void aclk_update_retention(struct aclk_database_worker_config *wc); #endif //NETDATA_SQLITE_ACLK_CHART_H diff --git a/database/sqlite/sqlite_aclk_node.c b/database/sqlite/sqlite_aclk_node.c index 97e6bebd1..239a24b8c 100644 --- a/database/sqlite/sqlite_aclk_node.c +++ 
b/database/sqlite/sqlite_aclk_node.c @@ -24,6 +24,15 @@ void sql_build_node_info(struct aclk_database_worker_config *wc, struct aclk_dat node_info.child = (wc->host != localhost); node_info.ml_info.ml_capable = ml_capable(localhost); node_info.ml_info.ml_enabled = ml_enabled(wc->host); + + struct capability instance_caps[] = { + { .name = "proto", .version = 1, .enabled = 1 }, + { .name = "ml", .version = ml_capable(localhost), .enabled = ml_enabled(wc->host) }, + { .name = "mc", .version = enable_metric_correlations ? metric_correlations_version : 0, .enabled = enable_metric_correlations }, + { .name = NULL, .version = 0, .enabled = 0 } + }; + node_info.node_instance_capabilities = instance_caps; + now_realtime_timeval(&node_info.updated_at); RRDHOST *host = wc->host; @@ -47,7 +56,7 @@ void sql_build_node_info(struct aclk_database_worker_config *wc, struct aclk_dat node_info.data.memory = host->system_info->host_ram_total ? host->system_info->host_ram_total : "0"; node_info.data.disk_space = host->system_info->host_disk_space ? host->system_info->host_disk_space : "0"; node_info.data.version = host_version ? host_version : VERSION; - node_info.data.release_channel = "nightly"; + node_info.data.release_channel = (char *) get_release_channel(); node_info.data.timezone = (char *) host->abbrev_timezone; node_info.data.virtualization_type = host->system_info->virtualization ? host->system_info->virtualization : "unknown"; node_info.data.container_type = host->system_info->container ? host->system_info->container : "unknown"; @@ -55,11 +64,19 @@ void sql_build_node_info(struct aclk_database_worker_config *wc, struct aclk_dat node_info.data.services = NULL; // char ** node_info.data.service_count = 0; node_info.data.machine_guid = wc->host_guid; + + struct capability node_caps[] = { + { .name = "ml", .version = host->system_info->ml_capable, .enabled = host->system_info->ml_enabled }, + { .name = "mc", .version = host->system_info->mc_version ? 
host->system_info->mc_version : 0, .enabled = host->system_info->mc_version ? 1 : 0 }, + { .name = NULL, .version = 0, .enabled = 0 } + }; + node_info.node_capabilities = node_caps; + node_info.data.ml_info.ml_capable = host->system_info->ml_capable; node_info.data.ml_info.ml_enabled = host->system_info->ml_enabled; struct label_index *labels = &host->labels; - netdata_rwlock_wrlock(&labels->labels_rwlock); + netdata_rwlock_rdlock(&labels->labels_rwlock); node_info.data.host_labels_head = labels->head; aclk_update_node_info(&node_info); diff --git a/database/sqlite/sqlite_aclk_node.h b/database/sqlite/sqlite_aclk_node.h index 9cb411586..b8f8c6bbf 100644 --- a/database/sqlite/sqlite_aclk_node.h +++ b/database/sqlite/sqlite_aclk_node.h @@ -4,5 +4,4 @@ #define NETDATA_SQLITE_ACLK_NODE_H void sql_build_node_info(struct aclk_database_worker_config *wc, struct aclk_database_cmd cmd); -void aclk_update_retention(struct aclk_database_worker_config *wc, struct aclk_database_cmd cmd); #endif //NETDATA_SQLITE_ACLK_NODE_H diff --git a/database/sqlite/sqlite_functions.c b/database/sqlite/sqlite_functions.c index 1e1d2a741..502633c67 100644 --- a/database/sqlite/sqlite_functions.c +++ b/database/sqlite/sqlite_functions.c @@ -5,8 +5,6 @@ #define DB_METADATA_VERSION "1" const char *database_config[] = { - "PRAGMA auto_vacuum=incremental; PRAGMA synchronous=1 ; PRAGMA journal_mode=WAL; PRAGMA temp_store=MEMORY;", - "PRAGMA journal_size_limit=16777216;", "CREATE TABLE IF NOT EXISTS host(host_id blob PRIMARY KEY, hostname text, " "registry_hostname text, update_every int, os text, timezone text, tags text);", "CREATE TABLE IF NOT EXISTS chart(chart_id blob PRIMARY KEY, host_id blob, type text, id text, name text, " @@ -62,6 +60,9 @@ const char *database_cleanup[] = { "delete from chart where chart_id not in (select chart_id from dimension);", "delete from host where host_id not in (select host_id from chart);", "delete from chart_label where chart_id not in (select chart_id from 
chart);", + "DELETE FROM chart_hash_map WHERE chart_id NOT IN (SELECT chart_id FROM chart);", + "DELETE FROM chart_hash WHERE hash_id NOT IN (SELECT hash_id FROM chart_hash_map);", + "DELETE FROM node_instance WHERE host_id NOT IN (SELECT host_id FROM host);", NULL }; @@ -72,10 +73,12 @@ static uv_mutex_t sqlite_transaction_lock; int execute_insert(sqlite3_stmt *res) { int rc; - - while ((rc = sqlite3_step(res)) != SQLITE_DONE && unlikely(netdata_exit)) { - if (likely(rc == SQLITE_BUSY || rc == SQLITE_LOCKED)) + int cnt = 0; + while ((rc = sqlite3_step(res)) != SQLITE_DONE && ++cnt < SQL_MAX_RETRY && likely(!netdata_exit)) { + if (likely(rc == SQLITE_BUSY || rc == SQLITE_LOCKED)) { usleep(SQLITE_INSERT_DELAY * USEC_PER_MS); + error_report("Failed to insert/update, rc = %d -- attempt %d", rc, cnt); + } else { error_report("SQLite error %d", rc); break; @@ -93,8 +96,12 @@ static void add_stmt_to_list(sqlite3_stmt *res) static sqlite3_stmt *statements[MAX_OPEN_STATEMENTS]; if (unlikely(!res)) { - while (idx > 0) - sqlite3_finalize(statements[--idx]); + while (idx > 0) { + int rc; + rc = sqlite3_finalize(statements[--idx]); + if (unlikely(rc != SQLITE_OK)) + error_report("Failed to finalize statement during shutdown, rc = %d", rc); + } return; } @@ -302,7 +309,7 @@ static int attempt_database_fix() error_report("Failed to close database, rc = %d", rc); info("Attempting to fix database"); db_meta = NULL; - return sql_init_database(DB_CHECK_FIX_DB | DB_CHECK_CONT); + return sql_init_database(DB_CHECK_FIX_DB | DB_CHECK_CONT, 0); } static int init_database_batch(int rebuild, int init_type, const char *batch[]) @@ -333,13 +340,17 @@ static int init_database_batch(int rebuild, int init_type, const char *batch[]) * Initialize the SQLite database * Return 0 on success */ -int sql_init_database(db_check_action_type_t rebuild) +int sql_init_database(db_check_action_type_t rebuild, int memory) { char *err_msg = NULL; char sqlite_database[FILENAME_MAX + 1]; int rc; - 
snprintfz(sqlite_database, FILENAME_MAX, "%s/netdata-meta.db", netdata_configured_cache_dir); + if (likely(!memory)) + snprintfz(sqlite_database, FILENAME_MAX, "%s/netdata-meta.db", netdata_configured_cache_dir); + else + strcpy(sqlite_database, ":memory:"); + rc = sqlite3_open(sqlite_database, &db_meta); if (rc != SQLITE_OK) { error_report("Failed to initialize database at %s, due to \"%s\"", sqlite_database, sqlite3_errstr(rc)); @@ -390,6 +401,40 @@ int sql_init_database(db_check_action_type_t rebuild) info("SQLite database %s initialization", sqlite_database); + char buf[1024 + 1] = ""; + const char *list[2] = { buf, NULL }; + + // https://www.sqlite.org/pragma.html#pragma_auto_vacuum + // PRAGMA schema.auto_vacuum = 0 | NONE | 1 | FULL | 2 | INCREMENTAL; + snprintfz(buf, 1024, "PRAGMA auto_vacuum=%s;", config_get(CONFIG_SECTION_SQLITE, "auto vacuum", "INCREMENTAL")); + if(init_database_batch(rebuild, 0, list)) return 1; + + // https://www.sqlite.org/pragma.html#pragma_synchronous + // PRAGMA schema.synchronous = 0 | OFF | 1 | NORMAL | 2 | FULL | 3 | EXTRA; + snprintfz(buf, 1024, "PRAGMA synchronous=%s;", config_get(CONFIG_SECTION_SQLITE, "synchronous", "NORMAL")); + if(init_database_batch(rebuild, 0, list)) return 1; + + // https://www.sqlite.org/pragma.html#pragma_journal_mode + // PRAGMA schema.journal_mode = DELETE | TRUNCATE | PERSIST | MEMORY | WAL | OFF + snprintfz(buf, 1024, "PRAGMA journal_mode=%s;", config_get(CONFIG_SECTION_SQLITE, "journal mode", "WAL")); + if(init_database_batch(rebuild, 0, list)) return 1; + + // https://www.sqlite.org/pragma.html#pragma_temp_store + // PRAGMA temp_store = 0 | DEFAULT | 1 | FILE | 2 | MEMORY; + snprintfz(buf, 1024, "PRAGMA temp_store=%s;", config_get(CONFIG_SECTION_SQLITE, "temp store", "MEMORY")); + if(init_database_batch(rebuild, 0, list)) return 1; + + // https://www.sqlite.org/pragma.html#pragma_journal_size_limit + // PRAGMA schema.journal_size_limit = N ; + snprintfz(buf, 1024, "PRAGMA 
journal_size_limit=%lld;", config_get_number(CONFIG_SECTION_SQLITE, "journal size limit", 16777216)); + if(init_database_batch(rebuild, 0, list)) return 1; + + // https://www.sqlite.org/pragma.html#pragma_cache_size + // PRAGMA schema.cache_size = pages; + // PRAGMA schema.cache_size = -kibibytes; + snprintfz(buf, 1024, "PRAGMA cache_size=%lld;", config_get_number(CONFIG_SECTION_SQLITE, "cache size", -2000)); + if(init_database_batch(rebuild, 0, list)) return 1; + if (init_database_batch(rebuild, 0, &database_config[0])) return 1; @@ -1160,8 +1205,24 @@ failed: return; } +void free_temporary_host(RRDHOST *host) +{ + if (host) { + freez(host->hostname); + freez((char *)host->os); + freez((char *)host->tags); + freez((char *)host->timezone); + freez(host->program_name); + freez(host->program_version); + freez(host->registry_hostname); + freez(host->system_info); + freez(host); + } +} + #define SELECT_HOST "select host_id, registry_hostname, update_every, os, timezone, tags from host where hostname = @hostname order by rowid desc;" -#define SELECT_HOST_BY_UUID "select host_id, registry_hostname, update_every, os, timezone, tags from host where host_id = @host_id ;" +#define SELECT_HOST_BY_UUID "select h.host_id, h.registry_hostname, h.update_every, h.os, h.timezone, h.tags from host h, node_instance ni " \ + "where (ni.host_id = @host_id or ni.node_id = @host_id) AND ni.host_id = h.host_id;" RRDHOST *sql_create_host_by_uuid(char *hostname) { @@ -1229,8 +1290,6 @@ failed: return host; } -#define SQL_MAX_RETRY 100 - void db_execute(const char *cmd) { int rc; @@ -1430,13 +1489,13 @@ int find_dimension_first_last_t(char *machine_guid, char *chart_id, char *dim_id } #ifdef ENABLE_DBENGINE -static RRDDIM *create_rrdim_entry(RRDSET *st, char *id, char *name, uuid_t *metric_uuid) +static RRDDIM *create_rrdim_entry(ONEWAYALLOC *owa, RRDSET *st, char *id, char *name, uuid_t *metric_uuid) { - RRDDIM *rd = callocz(1, sizeof(*rd)); + RRDDIM *rd = onewayalloc_callocz(owa, 1, 
sizeof(*rd)); rd->rrdset = st; rd->last_stored_value = NAN; rrddim_flag_set(rd, RRDDIM_FLAG_NONE); - rd->state = mallocz(sizeof(*rd->state)); + rd->state = onewayalloc_mallocz(owa, sizeof(*rd->state)); rd->rrd_memory_mode = RRD_MEMORY_MODE_DBENGINE; rd->state->query_ops.init = rrdeng_load_metric_init; rd->state->query_ops.next_metric = rrdeng_load_metric_next; @@ -1444,11 +1503,11 @@ static RRDDIM *create_rrdim_entry(RRDSET *st, char *id, char *name, uuid_t *metr rd->state->query_ops.finalize = rrdeng_load_metric_finalize; rd->state->query_ops.latest_time = rrdeng_metric_latest_time; rd->state->query_ops.oldest_time = rrdeng_metric_oldest_time; - rd->state->rrdeng_uuid = mallocz(sizeof(uuid_t)); + rd->state->rrdeng_uuid = onewayalloc_mallocz(owa, sizeof(uuid_t)); uuid_copy(*rd->state->rrdeng_uuid, *metric_uuid); uuid_copy(rd->state->metric_uuid, *metric_uuid); - rd->id = strdupz(id); - rd->name = strdupz(name); + rd->id = onewayalloc_strdupz(owa, id); + rd->name = onewayalloc_strdupz(owa, name); return rd; } #endif @@ -1465,7 +1524,7 @@ static RRDDIM *create_rrdim_entry(RRDSET *st, char *id, char *name, uuid_t *metr "where d.chart_id = c.chart_id and c.host_id = h.host_id and c.host_id = @host_id and c.type||'.'||c.id = @chart " \ "order by c.chart_id asc, c.type||'.'||c.id desc;" -void sql_build_context_param_list(struct context_param **param_list, RRDHOST *host, char *context, char *chart) +void sql_build_context_param_list(ONEWAYALLOC *owa, struct context_param **param_list, RRDHOST *host, char *context, char *chart) { #ifdef ENABLE_DBENGINE int rc; @@ -1474,7 +1533,7 @@ void sql_build_context_param_list(struct context_param **param_list, RRDHOST *ho return; if (unlikely(!(*param_list))) { - *param_list = mallocz(sizeof(struct context_param)); + *param_list = onewayalloc_mallocz(owa, sizeof(struct context_param)); (*param_list)->first_entry_t = LONG_MAX; (*param_list)->last_entry_t = 0; (*param_list)->rd = NULL; @@ -1523,21 +1582,21 @@ void 
sql_build_context_param_list(struct context_param **param_list, RRDHOST *ho if (!st || uuid_compare(*(uuid_t *)sqlite3_column_blob(res, 7), chart_id)) { if (unlikely(st && !st->counter)) { - freez(st->context); - freez((char *) st->name); - freez(st); + onewayalloc_freez(owa, st->context); + onewayalloc_freez(owa, (char *) st->name); + onewayalloc_freez(owa, st); } - st = callocz(1, sizeof(*st)); + st = onewayalloc_callocz(owa, 1, sizeof(*st)); char n[RRD_ID_LENGTH_MAX + 1]; snprintfz( n, RRD_ID_LENGTH_MAX, "%s.%s", (char *)sqlite3_column_text(res, 4), (char *)sqlite3_column_text(res, 3)); - st->name = strdupz(n); + st->name = onewayalloc_strdupz(owa, n); st->update_every = sqlite3_column_int(res, 6); st->counter = 0; if (chart) { - st->context = strdupz((char *)sqlite3_column_text(res, 8)); + st->context = onewayalloc_strdupz(owa, (char *)sqlite3_column_text(res, 8)); strncpyz(st->id, chart, RRD_ID_LENGTH_MAX); } uuid_copy(chart_id, *(uuid_t *)sqlite3_column_blob(res, 7)); @@ -1553,7 +1612,7 @@ void sql_build_context_param_list(struct context_param **param_list, RRDHOST *ho st->counter++; st->last_entry_t = MAX(st->last_entry_t, (*param_list)->last_entry_t); - RRDDIM *rd = create_rrdim_entry(st, (char *)sqlite3_column_text(res, 1), (char *)sqlite3_column_text(res, 2), &rrdeng_uuid); + RRDDIM *rd = create_rrdim_entry(owa, st, (char *)sqlite3_column_text(res, 1), (char *)sqlite3_column_text(res, 2), &rrdeng_uuid); if (sqlite3_column_int(res, 9) == 1) rrddim_flag_set(rd, RRDDIM_FLAG_HIDDEN); rd->next = (*param_list)->rd; @@ -1561,13 +1620,13 @@ void sql_build_context_param_list(struct context_param **param_list, RRDHOST *ho } if (st) { if (!st->counter) { - freez(st->context); - freez((char *)st->name); - freez(st); + onewayalloc_freez(owa,st->context); + onewayalloc_freez(owa,(char *)st->name); + onewayalloc_freez(owa,st); } else if (!st->context && context) - st->context = strdupz(context); + st->context = onewayalloc_strdupz(owa,context); } failed: diff --git 
a/database/sqlite/sqlite_functions.h b/database/sqlite/sqlite_functions.h index 30b8dee6c..d24484774 100644 --- a/database/sqlite/sqlite_functions.h +++ b/database/sqlite/sqlite_functions.h @@ -24,6 +24,7 @@ typedef enum db_check_action_type { DB_CHECK_CONT = 0x00008 } db_check_action_type_t; +#define SQL_MAX_RETRY (100) #define SQLITE_INSERT_DELAY (50) // Insert delay in case of lock #define SQL_STORE_HOST "insert or replace into host (host_id,hostname,registry_hostname,update_every,os,timezone,tags) values (?1,?2,?3,?4,?5,?6,?7);" @@ -56,7 +57,7 @@ typedef enum db_check_action_type { return 1; \ } -extern int sql_init_database(db_check_action_type_t rebuild); +extern int sql_init_database(db_check_action_type_t rebuild, int memory); extern void sql_close_database(void); extern int sql_store_host(uuid_t *guid, const char *hostname, const char *registry_hostname, int update_every, const char *os, const char *timezone, const char *tags); @@ -89,7 +90,7 @@ extern void db_unlock(void); extern void db_lock(void); extern void delete_dimension_uuid(uuid_t *dimension_uuid); extern void sql_store_chart_label(uuid_t *chart_uuid, int source_type, char *label, char *value); -extern void sql_build_context_param_list(struct context_param **param_list, RRDHOST *host, char *context, char *chart); +extern void sql_build_context_param_list(ONEWAYALLOC *owa, struct context_param **param_list, RRDHOST *host, char *context, char *chart); extern void store_claim_id(uuid_t *host_id, uuid_t *claim_id); extern int update_node_id(uuid_t *host_id, uuid_t *node_id); extern int get_node_id(uuid_t *host_id, uuid_t *node_id); @@ -100,4 +101,5 @@ extern void sql_load_node_id(RRDHOST *host); extern void compute_chart_hash(RRDSET *st); extern int sql_set_dimension_option(uuid_t *dim_uuid, char *option); char *get_hostname_by_node_id(char *node_id); +void free_temporary_host(RRDHOST *host); #endif //NETDATA_SQLITE_FUNCTIONS_H diff --git a/database/sqlite/sqlite_health.c 
b/database/sqlite/sqlite_health.c index 8ba95628f..53742a1a6 100644 --- a/database/sqlite/sqlite_health.c +++ b/database/sqlite/sqlite_health.c @@ -433,6 +433,168 @@ void sql_health_alarm_log_count(RRDHOST *host) { info("HEALTH [%s]: Table health_log_%s, contains %lu entries.", host->hostname, uuid_str, host->health_log_entries_written); } +#define SQL_INJECT_REMOVED(guid, guid2) "insert into health_log_%s (hostname, unique_id, alarm_id, alarm_event_id, config_hash_id, updated_by_id, updates_id, when_key, duration, non_clear_duration, flags, exec_run_timestamp, " \ +"delay_up_to_timestamp, name, chart, family, exec, recipient, source, units, info, exec_code, new_status, old_status, delay, new_value, old_value, last_repeat, class, component, type) " \ +"select hostname, ?1, ?2, ?3, config_hash_id, 0, ?4, strftime('%%s'), 0, 0, flags, exec_run_timestamp, " \ +"strftime('%%s'), name, chart, family, exec, recipient, source, units, info, exec_code, -2, new_status, delay, NULL, new_value, 0, class, component, type " \ +"from health_log_%s where unique_id = ?5", guid, guid2 +#define SQL_INJECT_REMOVED_UPDATE(guid) "update health_log_%s set flags = flags | ?1, updated_by_id = ?2 where unique_id = ?3; ", guid +void sql_inject_removed_status(char *uuid_str, uint32_t alarm_id, uint32_t alarm_event_id, uint32_t unique_id, uint32_t max_unique_id) +{ + int rc = 0; + char command[MAX_HEALTH_SQL_SIZE + 1]; + + if (!alarm_id || !alarm_event_id || !unique_id || !max_unique_id) + return; + + sqlite3_stmt *res = NULL; + + snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_INJECT_REMOVED(uuid_str, uuid_str)); + rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); + if (rc != SQLITE_OK) { + error_report("Failed to prepare statement when trying to inject removed event"); + return; + } + + rc = sqlite3_bind_int64(res, 1, (sqlite3_int64) max_unique_id); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind max_unique_id parameter for SQL_INJECT_REMOVED"); + goto failed; + } + + rc 
= sqlite3_bind_int64(res, 2, (sqlite3_int64) alarm_id); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind alarm_id parameter for SQL_INJECT_REMOVED"); + goto failed; + } + + rc = sqlite3_bind_int64(res, 3, (sqlite3_int64) alarm_event_id + 1); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind alarm_event_id parameter for SQL_INJECT_REMOVED"); + goto failed; + } + + rc = sqlite3_bind_int64(res, 4, (sqlite3_int64) unique_id); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind unique_id parameter for SQL_INJECT_REMOVED"); + goto failed; + } + + rc = sqlite3_bind_int64(res, 5, (sqlite3_int64) unique_id); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind unique_id parameter for SQL_INJECT_REMOVED"); + goto failed; + } + + rc = execute_insert(res); + if (unlikely(rc != SQLITE_DONE)) { + error_report("HEALTH [N/A]: Failed to execute SQL_INJECT_REMOVED, rc = %d", rc); + goto failed; + } + + if (unlikely(sqlite3_finalize(res) != SQLITE_OK)) + error_report("HEALTH [N/A]: Failed to finalize the prepared statement for injecting removed event."); + + //update the old entry + snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_INJECT_REMOVED_UPDATE(uuid_str)); + rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); + if (rc != SQLITE_OK) { + error_report("Failed to prepare statement when trying to update during inject removed event"); + return; + } + + rc = sqlite3_bind_int64(res, 1, (sqlite3_int64) HEALTH_ENTRY_FLAG_UPDATED); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind flags parameter for SQL_INJECT_REMOVED (update)"); + goto failed; + } + + rc = sqlite3_bind_int64(res, 2, (sqlite3_int64) max_unique_id); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind max_unique_id parameter for SQL_INJECT_REMOVED (update)"); + goto failed; + } + + rc = sqlite3_bind_int64(res, 3, (sqlite3_int64) unique_id); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to bind unique_id parameter for 
SQL_INJECT_REMOVED (update)"); + goto failed; + } + + rc = execute_insert(res); + if (unlikely(rc != SQLITE_DONE)) { + error_report("HEALTH [N/A]: Failed to execute SQL_INJECT_REMOVED_UPDATE, rc = %d", rc); + goto failed; + } + +failed: + if (unlikely(sqlite3_finalize(res) != SQLITE_OK)) + error_report("HEALTH [N/A]: Failed to finalize the prepared statement for injecting removed event."); + return; + +} + +#define SQL_SELECT_MAX_UNIQUE_ID(guid) "SELECT MAX(unique_id) from health_log_%s", guid +uint32_t sql_get_max_unique_id (char *uuid_str) +{ + int rc = 0; + char command[MAX_HEALTH_SQL_SIZE + 1]; + uint32_t max_unique_id = 0; + + sqlite3_stmt *res = NULL; + + snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_SELECT_MAX_UNIQUE_ID(uuid_str)); + rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); + if (rc != SQLITE_OK) { + error_report("Failed to prepare statement when trying to get max unique id"); + return 0; + } + + while (sqlite3_step(res) == SQLITE_ROW) { + max_unique_id = (uint32_t) sqlite3_column_int64(res, 0); + } + + rc = sqlite3_finalize(res); + if (unlikely(rc != SQLITE_OK)) + error_report("Failed to finalize the statement"); + + return max_unique_id; +} + +#define SQL_SELECT_LAST_STATUSES(guid) "SELECT new_status, unique_id, alarm_id, alarm_event_id from health_log_%s group by alarm_id having max(alarm_event_id)", guid +void sql_check_removed_alerts_state(char *uuid_str) +{ + int rc = 0; + char command[MAX_HEALTH_SQL_SIZE + 1]; + RRDCALC_STATUS status; + uint32_t alarm_id = 0, alarm_event_id = 0, unique_id = 0, max_unique_id = 0; + + sqlite3_stmt *res = NULL; + + snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_SELECT_LAST_STATUSES(uuid_str)); + rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); + if (rc != SQLITE_OK) { + error_report("Failed to prepare statement when trying to check removed statuses"); + return; + } + + while (sqlite3_step(res) == SQLITE_ROW) { + status = (RRDCALC_STATUS) sqlite3_column_int(res, 0); + unique_id = (uint32_t) 
sqlite3_column_int64(res, 1); + alarm_id = (uint32_t) sqlite3_column_int64(res, 2); + alarm_event_id = (uint32_t) sqlite3_column_int64(res, 3); + if (unlikely(status != RRDCALC_STATUS_REMOVED)) { + if (unlikely(!max_unique_id)) + max_unique_id = sql_get_max_unique_id (uuid_str); + sql_inject_removed_status (uuid_str, alarm_id, alarm_event_id, unique_id, ++max_unique_id); + } + } + + rc = sqlite3_finalize(res); + if (unlikely(rc != SQLITE_OK)) + error_report("Failed to finalize the statement"); +} + /* Health related SQL queries Load from the health log table */ @@ -454,6 +616,8 @@ void sql_health_alarm_log_load(RRDHOST *host) { char uuid_str[GUID_LEN + 1]; uuid_unparse_lower_fix(&host->host_uuid, uuid_str); + sql_check_removed_alerts_state(uuid_str); + snprintfz(command, MAX_HEALTH_SQL_SIZE, SQL_LOAD_HEALTH_LOG(uuid_str, host->health_log.max)); rc = sqlite3_prepare_v2(db_meta, command, -1, &res, 0); diff --git a/database/storage_engine.c b/database/storage_engine.c new file mode 100644 index 000000000..36f01de16 --- /dev/null +++ b/database/storage_engine.c @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "storage_engine.h" +#include "ram/rrddim_mem.h" +#ifdef ENABLE_DBENGINE +#include "engine/rrdengineapi.h" +#endif + +#define im_collect_ops { \ + .init = rrddim_collect_init,\ + .store_metric = rrddim_collect_store_metric,\ + .finalize = rrddim_collect_finalize\ +} + +#define im_query_ops { \ + .init = rrddim_query_init, \ + .next_metric = rrddim_query_next_metric, \ + .is_finished = rrddim_query_is_finished, \ + .finalize = rrddim_query_finalize, \ + .latest_time = rrddim_query_latest_time, \ + .oldest_time = rrddim_query_oldest_time \ +} + +static STORAGE_ENGINE engines[] = { + { + .id = RRD_MEMORY_MODE_NONE, + .name = RRD_MEMORY_MODE_NONE_NAME, + .api = { + .collect_ops = im_collect_ops, + .query_ops = im_query_ops + } + }, + { + .id = RRD_MEMORY_MODE_RAM, + .name = RRD_MEMORY_MODE_RAM_NAME, + .api = { + .collect_ops = im_collect_ops, 
+ .query_ops = im_query_ops + } + }, + { + .id = RRD_MEMORY_MODE_MAP, + .name = RRD_MEMORY_MODE_MAP_NAME, + .api = { + .collect_ops = im_collect_ops, + .query_ops = im_query_ops + } + }, + { + .id = RRD_MEMORY_MODE_SAVE, + .name = RRD_MEMORY_MODE_SAVE_NAME, + .api = { + .collect_ops = im_collect_ops, + .query_ops = im_query_ops + } + }, + { + .id = RRD_MEMORY_MODE_ALLOC, + .name = RRD_MEMORY_MODE_ALLOC_NAME, + .api = { + .collect_ops = im_collect_ops, + .query_ops = im_query_ops + } + }, +#ifdef ENABLE_DBENGINE + { + .id = RRD_MEMORY_MODE_DBENGINE, + .name = RRD_MEMORY_MODE_DBENGINE_NAME, + .api = { + .collect_ops = { + .init = rrdeng_store_metric_init, + .store_metric = rrdeng_store_metric_next, + .finalize = rrdeng_store_metric_finalize + }, + .query_ops = { + .init = rrdeng_load_metric_init, + .next_metric = rrdeng_load_metric_next, + .is_finished = rrdeng_load_metric_is_finished, + .finalize = rrdeng_load_metric_finalize, + .latest_time = rrdeng_metric_latest_time, + .oldest_time = rrdeng_metric_oldest_time + } + } + }, +#endif + { .id = RRD_MEMORY_MODE_NONE, .name = NULL } +}; + +STORAGE_ENGINE* storage_engine_find(const char* name) +{ + for (STORAGE_ENGINE* it = engines; it->name; it++) { + if (strcmp(it->name, name) == 0) + return it; + } + return NULL; +} + +STORAGE_ENGINE* storage_engine_get(RRD_MEMORY_MODE mmode) +{ + for (STORAGE_ENGINE* it = engines; it->name; it++) { + if (it->id == mmode) + return it; + } + return NULL; +} + +STORAGE_ENGINE* storage_engine_foreach_init() +{ + // Assuming at least one engine exists + return &engines[0]; +} + +STORAGE_ENGINE* storage_engine_foreach_next(STORAGE_ENGINE* it) +{ + if (!it || !it->name) + return NULL; + + it++; + return it->name ? 
it : NULL; +} diff --git a/database/storage_engine.h b/database/storage_engine.h new file mode 100644 index 000000000..0aa70d093 --- /dev/null +++ b/database/storage_engine.h @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_STORAGEENGINEAPI_H +#define NETDATA_STORAGEENGINEAPI_H + +#include "rrd.h" + +typedef struct storage_engine STORAGE_ENGINE; + +// ------------------------------------------------------------------------ +// function pointers for all APIs provided by a storage engine +typedef struct storage_engine_api { + struct rrddim_collect_ops collect_ops; + struct rrddim_query_ops query_ops; +} STORAGE_ENGINE_API; + +struct storage_engine { + RRD_MEMORY_MODE id; + const char* name; + STORAGE_ENGINE_API api; +}; + +extern STORAGE_ENGINE* storage_engine_get(RRD_MEMORY_MODE mmode); +extern STORAGE_ENGINE* storage_engine_find(const char* name); + +// Iterator over existing engines +extern STORAGE_ENGINE* storage_engine_foreach_init(); +extern STORAGE_ENGINE* storage_engine_foreach_next(STORAGE_ENGINE* it); + +#endif |