diff options
Diffstat (limited to 'regress.c')
-rw-r--r-- | regress.c | 704 |
1 files changed, 704 insertions, 0 deletions
diff --git a/regress.c b/regress.c new file mode 100644 index 0000000..e767e2f --- /dev/null +++ b/regress.c @@ -0,0 +1,704 @@ +/* + chronyd/chronyc - Programs for keeping computer clocks accurate. + + ********************************************************************** + * Copyright (C) Richard P. Curnow 1997-2003 + * Copyright (C) Miroslav Lichvar 2011, 2016-2017 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + ********************************************************************** + + ======================================================================= + + Regression algorithms. + + */ + +#include "config.h" + +#include "sysincl.h" + +#include "regress.h" +#include "logging.h" +#include "util.h" + +#define MAX_POINTS 64 + +void +RGR_WeightedRegression +(double *x, /* independent variable */ + double *y, /* measured data */ + double *w, /* weightings (large => data + less reliable) */ + + int n, /* number of data points */ + + /* And now the results */ + + double *b0, /* estimated y axis intercept */ + double *b1, /* estimated slope */ + double *s2, /* estimated variance of data points */ + + double *sb0, /* estimated standard deviation of + intercept */ + double *sb1 /* estimated standard deviation of + slope */ + + /* Could add correlation stuff later if required */ +) +{ + double P, Q, U, V, W; + double diff; + double u, ui, aa; + int i; + + assert(n >= 3); + + W = U = 0; + for (i=0; i<n; i++) { + U += x[i] / w[i]; + W += 1.0 / w[i]; + } + + u = U / W; + + /* Calculate statistics from data */ + P = Q = V = 0.0; + for (i=0; i<n; i++) { + ui = x[i] - u; + P += y[i] / w[i]; + Q += y[i] * ui / w[i]; + V += ui * ui / w[i]; + } + + *b1 = Q / V; + *b0 = (P / W) - (*b1) * u; + + *s2 = 0.0; + for (i=0; i<n; i++) { + diff = y[i] - *b0 - *b1*x[i]; + *s2 += diff*diff / w[i]; + } + + *s2 /= (double)(n-2); + + *sb1 = sqrt(*s2 / V); + aa = u * (*sb1); + *sb0 = sqrt(*s2 / W + aa * aa); + + *s2 *= (n / W); /* Giving weighted average of variances */ +} + +/* ================================================== */ +/* Get the coefficient to multiply the standard deviation by, to get a + particular size of confidence interval (assuming a t-distribution) */ + +double +RGR_GetTCoef(int dof) +{ + /* Assuming now the 99.95% quantile */ + static const float coefs[] = + { 636.6, 31.6, 12.92, 8.61, 6.869, + 5.959, 5.408, 5.041, 4.781, 4.587, + 4.437, 4.318, 4.221, 4.140, 4.073, + 4.015, 3.965, 3.922, 3.883, 3.850, + 3.819, 3.792, 3.768, 3.745, 3.725, + 3.707, 3.690, 3.674, 3.659, 3.646, + 3.633, 3.622, 3.611, 3.601, 3.591, + 3.582, 3.574, 3.566, 3.558, 3.551}; + + if (dof <= 40) { + return coefs[dof-1]; + } else { + return 3.5; /* Until I can be bothered to do something better */ + } +} + +/* ================================================== */ +/* Get 90% quantile of chi-square distribution */ + +double +RGR_GetChi2Coef(int dof) +{ + static const float coefs[] = { + 2.706, 4.605, 6.251, 7.779, 9.236, 10.645, 12.017, 13.362, + 14.684, 15.987, 17.275, 18.549, 19.812, 21.064, 22.307, 23.542, + 24.769, 25.989, 27.204, 28.412, 29.615, 30.813, 32.007, 33.196, + 34.382, 35.563, 36.741, 37.916, 39.087, 40.256, 41.422, 42.585, + 43.745, 44.903, 46.059, 47.212, 48.363, 49.513, 50.660, 51.805, + 52.949, 54.090, 55.230, 56.369, 57.505, 58.641, 59.774, 60.907, + 62.038, 63.167, 64.295, 65.422, 66.548, 67.673, 68.796, 69.919, + 71.040, 72.160, 73.279, 74.397, 75.514, 76.630, 77.745, 78.860 + }; + + if (dof <= 64) { + return coefs[dof-1]; + } else { + return 1.2 * dof; /* Until I can be bothered to do something better */ + } +} + +/* ================================================== */ +/* Critical value for number of runs of residuals with same sign. + 5% critical region for now. */ + +static char critical_runs[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, + 3, 3, 4, 4, 5, 5, 5, 6, 6, 7, + 7, 7, 8, 8, 9, 9, 9, 10, 10, 11, + 11, 11, 12, 12, 13, 13, 14, 14, 14, 15, + 15, 16, 16, 17, 17, 18, 18, 18, 19, 19, + 20, 20, 21, 21, 21, 22, 22, 23, 23, 24, + 24, 25, 25, 26, 26, 26, 27, 27, 28, 28, + 29, 29, 30, 30, 30, 31, 31, 32, 32, 33, + 33, 34, 34, 35, 35, 35, 36, 36, 37, 37, + 38, 38, 39, 39, 40, 40, 40, 41, 41, 42, + 42, 43, 43, 44, 44, 45, 45, 46, 46, 46, + 47, 47, 48, 48, 49, 49, 50, 50, 51, 51, + 52, 52, 52, 53, 53, 54, 54, 55, 55, 56 +}; + +/* ================================================== */ + +static int +n_runs_from_residuals(double *resid, int n) +{ + int nruns; + int i; + + nruns = 1; + for (i=1; i<n; i++) { + if (((resid[i-1] < 0.0) && (resid[i] < 0.0)) || + ((resid[i-1] > 0.0) && (resid[i] > 0.0))) { + /* Nothing to do */ + } else { + nruns++; + } + } + + return nruns; +} + +/* ================================================== */ +/* Return a boolean indicating whether we had enough points for + regression */ + +int +RGR_FindBestRegression +(double *x, /* independent variable */ + double *y, /* measured data */ + double *w, /* weightings (large => data + less reliable) */ + + int n, /* number of data points */ + int m, /* number of extra samples in x and y arrays + (negative index) which can be used to + extend runs test */ + int min_samples, /* minimum number of samples to be kept after + changing the starting index to pass the runs + test */ + + /* And now the results */ + + double *b0, /* estimated y axis intercept */ + double *b1, /* estimated slope */ + double *s2, /* estimated variance of data points */ + + double *sb0, /* estimated standard deviation of + intercept */ + double *sb1, /* estimated standard deviation of + slope */ + + int *new_start, /* the new starting index to make the + residuals pass the two tests */ + + int *n_runs, /* number of runs amongst the residuals */ + + int *dof /* degrees of freedom in statistics (needed + to get confidence intervals later) */ + +) +{ + double P, Q, U, V, W; /* total */ + double resid[MAX_POINTS * REGRESS_RUNS_RATIO]; + double ss; + double a, b, u, ui, aa; + + int start, resid_start, nruns, npoints; + int i; + + assert(n <= MAX_POINTS && m >= 0); + assert(n * REGRESS_RUNS_RATIO < sizeof (critical_runs) / sizeof (critical_runs[0])); + + if (n < MIN_SAMPLES_FOR_REGRESS) { + return 0; + } + + start = 0; + do { + + W = U = 0; + for (i=start; i<n; i++) { + U += x[i] / w[i]; + W += 1.0 / w[i]; + } + + u = U / W; + + P = Q = V = 0.0; + for (i=start; i<n; i++) { + ui = x[i] - u; + P += y[i] / w[i]; + Q += y[i] * ui / w[i]; + V += ui * ui / w[i]; + } + + b = Q / V; + a = (P / W) - (b * u); + + /* Get residuals also for the extra samples before start */ + resid_start = n - (n - start) * REGRESS_RUNS_RATIO; + if (resid_start < -m) + resid_start = -m; + + for (i=resid_start; i<n; i++) { + resid[i - resid_start] = y[i] - a - b*x[i]; + } + + /* Count number of runs */ + nruns = n_runs_from_residuals(resid, n - resid_start); + + if (nruns > critical_runs[n - resid_start] || + n - start <= MIN_SAMPLES_FOR_REGRESS || + n - start <= min_samples) { + if (start != resid_start) { + /* Ignore extra samples in returned nruns */ + nruns = n_runs_from_residuals(resid + (start - resid_start), n - start); + } + break; + } else { + /* Try dropping one sample at a time until the runs test passes. */ + ++start; + } + + } while (1); + + /* Work out statistics from full dataset */ + *b1 = b; + *b0 = a; + + ss = 0.0; + for (i=start; i<n; i++) { + ss += resid[i - resid_start]*resid[i - resid_start] / w[i]; + } + + npoints = n - start; + ss /= (double)(npoints - 2); + *sb1 = sqrt(ss / V); + aa = u * (*sb1); + *sb0 = sqrt((ss / W) + (aa * aa)); + *s2 = ss * (double) npoints / W; + + *new_start = start; + *dof = npoints - 2; + *n_runs = nruns; + + return 1; + +} + +/* ================================================== */ + +#define EXCH(a,b) temp=(a); (a)=(b); (b)=temp + +/* ================================================== */ +/* Find the index'th biggest element in the array x of n elements. + flags is an array where a 1 indicates that the corresponding entry + in x is known to be sorted into its correct position and a 0 + indicates that the corresponding entry is not sorted. However, if + flags[m] = flags[n] = 1 with m<n, then x[m] must be <= x[n] and for + all i with m<i<n, x[m] <= x[i] <= x[n]. In practice, this means + flags[] has to be the result of a previous call to this routine + with the same array x, and is used to remember which parts of the + x[] array we have already sorted. + + The approach used is a cut-down quicksort, where we only bother to + keep sorting the partition that contains the index we are after. + The approach comes from Numerical Recipes in C (ISBN + 0-521-43108-5). */ + +static double +find_ordered_entry_with_flags(double *x, int n, int index, char *flags) +{ + int u, v, l, r; + double temp; + double piv; + int pivind; + + assert(index >= 0); + + /* If this bit of the array is already sorted, simple! */ + if (flags[index]) { + return x[index]; + } + + /* Find subrange to look at */ + u = v = index; + while (u > 0 && !flags[u]) u--; + if (flags[u]) u++; + + while (v < (n-1) && !flags[v]) v++; + if (flags[v]) v--; + + do { + if (v - u < 2) { + if (x[v] < x[u]) { + EXCH(x[v], x[u]); + } + flags[v] = flags[u] = 1; + return x[index]; + } else { + pivind = (u + v) >> 1; + EXCH(x[u], x[pivind]); + piv = x[u]; /* New value */ + l = u + 1; + r = v; + do { + while (l < v && x[l] < piv) l++; + while (x[r] > piv) r--; + if (r <= l) break; + EXCH(x[l], x[r]); + l++; + r--; + } while (1); + EXCH(x[u], x[r]); + flags[r] = 1; /* Pivot now in correct place */ + if (index == r) { + return x[r]; + } else if (index < r) { + v = r - 1; + } else if (index > r) { + u = l; + } + } + } while (1); +} + +/* ================================================== */ + +#if 0 +/* Not used, but this is how it can be done */ +static double +find_ordered_entry(double *x, int n, int index) +{ + char flags[MAX_POINTS]; + + memset(flags, 0, n * sizeof (flags[0])); + return find_ordered_entry_with_flags(x, n, index, flags); +} +#endif + +/* ================================================== */ +/* Find the median entry of an array x[] with n elements. */ + +static double +find_median(double *x, int n) +{ + int k; + char flags[MAX_POINTS]; + + memset(flags, 0, n * sizeof (flags[0])); + k = n>>1; + if (n&1) { + return find_ordered_entry_with_flags(x, n, k, flags); + } else { + return 0.5 * (find_ordered_entry_with_flags(x, n, k, flags) + + find_ordered_entry_with_flags(x, n, k-1, flags)); + } +} + +/* ================================================== */ + +double +RGR_FindMedian(double *x, int n) +{ + double tmp[MAX_POINTS]; + + assert(n > 0 && n <= MAX_POINTS); + memcpy(tmp, x, n * sizeof (tmp[0])); + + return find_median(tmp, n); +} + +/* ================================================== */ +/* This function evaluates the equation + + \sum_{i=0}^{n-1} x_i sign(y_i - a - b x_i) + + and chooses the value of a that minimises the absolute value of the + result. (See pp703-704 of Numerical Recipes in C). */ + +static void +eval_robust_residual +(double *x, /* The independent points */ + double *y, /* The dependent points */ + int n, /* Number of points */ + double b, /* Slope */ + double *aa, /* Intercept giving smallest absolute + value for the above equation */ + double *rr /* Corresponding value of equation */ +) +{ + int i; + double a, res, del; + double d[MAX_POINTS]; + + for (i=0; i<n; i++) { + d[i] = y[i] - b * x[i]; + } + + a = find_median(d, n); + + res = 0.0; + for (i=0; i<n; i++) { + del = y[i] - a - b * x[i]; + if (del > 0.0) { + res += x[i]; + } else if (del < 0.0) { + res -= x[i]; + } + } + + *aa = a; + *rr = res; +} + +/* ================================================== */ +/* This routine performs a 'robust' regression, i.e. one which has low + susceptibility to outliers amongst the data. If one thinks of a + normal (least squares) linear regression in 2D being analogous to + the arithmetic mean in 1D, this algorithm in 2D is roughly + analogous to the median in 1D. This algorithm seems to work quite + well until the number of outliers is approximately half the number + of data points. + + The return value is a status indicating whether there were enough + data points to run the routine or not. */ + +int +RGR_FindBestRobustRegression +(double *x, /* The independent axis points */ + double *y, /* The dependent axis points (which + may contain outliers). */ + int n, /* The number of points */ + double tol, /* The tolerance required in + determining the value of b1 */ + double *b0, /* The estimated Y-axis intercept */ + double *b1, /* The estimated slope */ + int *n_runs, /* The number of runs of residuals */ + int *best_start /* The best starting index */ +) +{ + int i; + int start; + int n_points; + double a, b; + double P, U, V, W, X; + double resid, resids[MAX_POINTS]; + double blo, bhi, bmid, rlo, rhi, rmid; + double s2, sb, incr; + double mx, dx, my, dy; + int nruns = 0; + + assert(n <= MAX_POINTS); + + if (n < 2) { + return 0; + } else if (n == 2) { + /* Just a straight line fit (we need this for the manual mode) */ + *b1 = (y[1] - y[0]) / (x[1] - x[0]); + *b0 = y[0] - (*b1) * x[0]; + *n_runs = 0; + *best_start = 0; + return 1; + } + + /* else at least 3 points, apply normal algorithm */ + + start = 0; + + /* Loop to strip oldest points that cause the regression residuals + to fail the number of runs test */ + do { + + n_points = n - start; + + /* Use standard least squares regression to get starting estimate */ + + P = U = 0.0; + for (i=start; i<n; i++) { + P += y[i]; + U += x[i]; + } + + W = (double) n_points; + + my = P/W; + mx = U/W; + + X = V = 0.0; + for (i=start; i<n; i++) { + dy = y[i] - my; + dx = x[i] - mx; + X += dy * dx; + V += dx * dx; + } + + b = X / V; + a = my - b*mx; + + s2 = 0.0; + for (i=start; i<n; i++) { + resid = y[i] - a - b * x[i]; + s2 += resid * resid; + } + + /* Need to expand range of b to get a root in the interval. + Estimate standard deviation of b and expand range about b based + on that. */ + sb = sqrt(s2 * W/V); + incr = MAX(sb, tol); + + do { + incr *= 2.0; + + /* Give up if the interval is too large */ + if (incr > 100.0) + return 0; + + blo = b - incr; + bhi = b + incr; + + /* We don't want 'a' yet */ + eval_robust_residual(x + start, y + start, n_points, blo, &a, &rlo); + eval_robust_residual(x + start, y + start, n_points, bhi, &a, &rhi); + + } while (rlo * rhi >= 0.0); /* fn vals have same sign or one is zero, + i.e. root not in interval (rlo, rhi). */ + + /* OK, so the root for b lies in (blo, bhi). Start bisecting */ + do { + bmid = 0.5 * (blo + bhi); + if (!(blo < bmid && bmid < bhi)) + break; + eval_robust_residual(x + start, y + start, n_points, bmid, &a, &rmid); + if (rmid == 0.0) { + break; + } else if (rmid * rlo > 0.0) { + blo = bmid; + rlo = rmid; + } else if (rmid * rhi > 0.0) { + bhi = bmid; + rhi = rmid; + } else { + assert(0); + } + } while (bhi - blo > tol); + + *b0 = a; + *b1 = bmid; + + /* Number of runs test, but not if we're already down to the + minimum number of points */ + if (n_points == MIN_SAMPLES_FOR_REGRESS) { + break; + } + + for (i=start; i<n; i++) { + resids[i] = y[i] - a - bmid * x[i]; + } + + nruns = n_runs_from_residuals(resids + start, n_points); + + if (nruns > critical_runs[n_points]) { + break; + } else { + start++; + } + + } while (1); + + *n_runs = nruns; + *best_start = start; + + return 1; + +} + +/* ================================================== */ +/* This routine performs linear regression with two independent variables. + It returns non-zero status if there were enough data points and there + was a solution. */ + +int +RGR_MultipleRegress +(double *x1, /* first independent variable */ + double *x2, /* second independent variable */ + double *y, /* measured data */ + + int n, /* number of data points */ + + /* The results */ + double *b2 /* estimated second slope */ + /* other values are not needed yet */ +) +{ + double Sx1, Sx2, Sx1x1, Sx1x2, Sx2x2, Sx1y, Sx2y, Sy; + double U, V, V1, V2, V3; + int i; + + if (n < 4) + return 0; + + Sx1 = Sx2 = Sx1x1 = Sx1x2 = Sx2x2 = Sx1y = Sx2y = Sy = 0.0; + + for (i = 0; i < n; i++) { + Sx1 += x1[i]; + Sx2 += x2[i]; + Sx1x1 += x1[i] * x1[i]; + Sx1x2 += x1[i] * x2[i]; + Sx2x2 += x2[i] * x2[i]; + Sx1y += x1[i] * y[i]; + Sx2y += x2[i] * y[i]; + Sy += y[i]; + } + + U = n * (Sx1x2 * Sx1y - Sx1x1 * Sx2y) + + Sx1 * Sx1 * Sx2y - Sx1 * Sx2 * Sx1y + + Sy * (Sx2 * Sx1x1 - Sx1 * Sx1x2); + + V1 = n * (Sx1x2 * Sx1x2 - Sx1x1 * Sx2x2); + V2 = Sx1 * Sx1 * Sx2x2 + Sx2 * Sx2 * Sx1x1; + V3 = -2.0 * Sx1 * Sx2 * Sx1x2; + V = V1 + V2 + V3; + + /* Check if there is a (numerically stable) solution */ + if (fabs(V) * 1.0e10 <= -V1 + V2 + fabs(V3)) + return 0; + + *b2 = U / V; + + return 1; +} |