diff options
Diffstat (limited to '')
277 files changed, 11940 insertions, 1885 deletions
diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..45ec5156 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +*.c diff=cpp +*.h diff=cpp diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 08765d5c..7d5dfa8b 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -7,40 +7,40 @@ # Ownership by directory structure .travis/ @paulkatsoulakis @cakrit .github/ @paulkatsoulakis @cakrit -backends/ @ktsaou @vlvkobal -backends/graphite/ @ktsaou @vlvkobal -backends/json/ @ktsaou @vlvkobal -backends/opentsdb/ @ktsaou @vlvkobal -backends/prometheus/ @ktsaou @vlvkobal @paulkatsoulakis +backends/ @thiagoftsm @vlvkobal +backends/graphite/ @thiagoftsm @vlvkobal +backends/json/ @thiagoftsm @vlvkobal +backends/opentsdb/ @thiagoftsm @vlvkobal +backends/prometheus/ @vlvkobal @paulkatsoulakis @thiagoftsm build/ @paulkatsoulakis @cakrit -collectors/ @ktsaou @vlvkobal @cakrit -collectors/charts.d.plugin/ @ktsaou @paulkatsoulakis @cakrit +collectors/ @vlvkobal @cakrit +collectors/charts.d.plugin/ @paulkatsoulakis @cakrit collectors/freebsd.plugin/ @vlvkobal @cakrit collectors/macos.plugin/ @vlvkobal @cakrit -collectors/node.d.plugin/ @ktsaou @gmosx @cakrit -collectors/node.d.plugin/fronius/ @ktsaou @gmosx @ccremer @cakrit -collectors/node.d.plugin/snmp/ @ktsaou @gmosx @cakrit -collectors/node.d.plugin/stiebeleltron/ @ktsaou @gmosx @ccremer @cakrit +collectors/node.d.plugin/ @gmosx @cakrit +collectors/node.d.plugin/fronius/ @ccremer @cakrit +collectors/node.d.plugin/snmp/ @gmosx @cakrit +collectors/node.d.plugin/stiebeleltron/ @ccremer @cakrit collectors/python.d.plugin/ @ilyam8 -collectors/cups.plugin/ @simonnagl @ktsaou @vlvkobal @cakrit -daemon/ @ktsaou @mfundul @cakrit -database/ @ktsaou @mfundul +collectors/cups.plugin/ @simonnagl @vlvkobal @cakrit +daemon/ @thiagoftsm @mfundul @cakrit +database/ @cakrit @mfundul docs/ @cakrit -health/ @ktsaou @cakrit -health/health.d/ @ktsaou @cakrit -health/notifications/ @ktsaou @Ferroin @cakrit 
-libnetdata/ @ktsaou @cakrit +health/ @thiagoftsm @cakrit +health/health.d/ @thiagoftsm @cakrit +health/notifications/ @Ferroin @cakrit +libnetdata/ @thiagoftsm @cakrit packaging/ @paulkatsoulakis @cakrit -packaging/installer/ @ktsaou @paulkatsoulakis @cakrit -packaging/makeself/ @ktsaou @paulkatsoulakis @cakrit -registry/ @ktsaou @gmosx @cakrit -streaming/ @ktsaou @mfundul -web/ @ktsaou @cakrit -web/gui/ @ktsaou @gmosx @cakrit +packaging/installer/ @paulkatsoulakis @cakrit +packaging/makeself/ @paulkatsoulakis @cakrit +registry/ @gmosx @cakrit +streaming/ @cakrit @thiagoftsm +web/ @thiagoftsm @cakrit +web/gui/ @gmosx @cakrit # Ownership by filetype (overwrites ownership by directory) -*.md @ktsaou @cakrit -*.am @paulkatsoulakis @ktsaou +*.md @cakrit +*.am @paulkatsoulakis # Ownership of specific files .gitignore @paulkatsoulakis @cakrit @@ -52,10 +52,10 @@ web/gui/ @ktsaou @gmosx @cakrit .codeclimate.yml @paulkatsoulakis .codacy.yml @paulkatsoulakis netdata.spec.in @paulkatsoulakis -netdata-installer.sh @ktsaou @paulkatsoulakis @cakrit +netdata-installer.sh @paulkatsoulakis @cakrit netlify.toml @cakrit package.json @gmosx packaging/version @netdatabot -LICENSE.md @ktsaou +LICENSE.md @cakrit CHANGELOG.md @netdatabot diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 4fe94ad6..bd939bab 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -1,3 +1,8 @@ +--- +about: General issue template +labels: "needs triage", "no changelog" +--- + <!--- This is a generic issue template. 
We usually prefer contributors to use one of 3 other specific issue templates (bug report, feature request, question) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index fbd69a2f..d378d451 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -1,7 +1,7 @@ --- name: Bug report about: Create a bug report to help us improve - +labels: bug, needs triage --- <!--- diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index b27ba265..4d210259 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -1,7 +1,7 @@ --- name: Feature request about: Suggest an idea for our project - +labels: feature request, needs triage --- <!--- diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md index 9bdf6f14..c5cd71e6 100644 --- a/.github/ISSUE_TEMPLATE/question.md +++ b/.github/ISSUE_TEMPLATE/question.md @@ -1,6 +1,7 @@ --- name: Question about: You just want to ask a question? Go on. +labels: question, no changelog --- <!--- diff --git a/.github/stale.yml b/.github/stale.yml index dfa5ce2c..abf927a4 100644 --- a/.github/stale.yml +++ b/.github/stale.yml @@ -1,8 +1,8 @@ --- only: issues limitPerRun: 30 -daysUntilStale: 45 -daysUntilClose: 60 +daysUntilStale: 30 +daysUntilClose: 7 exemptLabels: - bug - help wanted @@ -11,9 +11,8 @@ exemptProjects: true exemptMilestones: true staleLabel: stale markComment: > - Currently netdata team doesn't have enough capacity to work on this issue. - We will be more than glad to accept a pull request with a solution to problem described here. - This issue will be closed after another 60 days of inactivity. + This issue has been inactive for 30 days. + It will be closed in one week, unless it is updated. closeComment: > This issue has been automatically closed due to extended period of inactivity. Please reopen if it is still valid. 
Thank you for your contributions. @@ -56,9 +56,16 @@ nfacct.plugin xenstat.plugin !xenstat.plugin/ +perf.plugin +!perf.plugin/ + cgroup-network !cgroup-network/ +# protoc generated files +*.pb.cc +*.pb.h + # installation artifacts packaging/installer/.environment.sh *.tar.* @@ -171,3 +178,6 @@ docs/generator/build docs/generator/mkdocs.yml .environment.sh + +#CLion files +netdata.cbp diff --git a/.remarkrc.js b/.remarkrc.js new file mode 100644 index 00000000..a5d9d128 --- /dev/null +++ b/.remarkrc.js @@ -0,0 +1,121 @@ +// Source: https://github.com/codacy/codacy-remark-lint/raw/master/.remarkrc.js + +const fs = require("fs"); +const path = require("path"); + +exports.settings = { + gfm: true, + commonmark: true, + looseTable: false, + spacedTable: false, + paddedTable: false, + fences: true, + rule: '-', + ruleRepetition: 3, + emphasis: "*", + strong: "*", + bullet: "-", + listItemIndent: 'tab', + incrementListMarker: true +}; + +const personalDictionaryPath = path.join(__dirname, ".dictionary"); +const personalDictionary = fs.existsSync(personalDictionaryPath) + ? 
{ + personal: fs.readFileSync(personalDictionaryPath, "utf8") + } + : {}; + +const remarkPresetLintMarkdownStyleGuide = { + plugins: require("remark-preset-lint-markdown-style-guide").plugins.filter( + function(elem) { + return elem != require("remark-lint-no-duplicate-headings"); + } + ) +}; + +exports.plugins = [ + require("remark-preset-lint-consistent"), + require("remark-preset-lint-recommended"), + remarkPresetLintMarkdownStyleGuide, + [require("remark-lint-no-dead-urls"), { skipOffline: true }], + require("remark-lint-heading-whitespace"), + [require("remark-lint-maximum-line-length"), 120], + [require("remark-lint-maximum-heading-length"), 120], + [require("remark-lint-list-item-indent"), "tab-size"], + [require("remark-lint-list-item-spacing"), false], + [require("remark-lint-strong-marker"), "*"], + [require("remark-lint-emphasis-marker"), "_"], + [require("remark-lint-unordered-list-marker-style"), "-"], + [require("remark-lint-ordered-list-marker-style"), "."], + [require("remark-lint-ordered-list-marker-value"), "ordered"], + /*[ + require("remark-lint-write-good"), + [ + "warn", + { + passive: false, + illusion: true, + so: true, + thereIs: true, + weasel: true, + adverb: true, + tooWordy: true, + cliches: true, + eprime: false + } + ] + ],*/ + require("remark-validate-links"), + require("remark-frontmatter"), + /*[ + require("remark-retext"), + require("unified")().use({ + plugins: [ + require("retext-english"), + require("retext-syntax-urls"), + [ + require("retext-spell"), + { + ignoreLiteral: true, + dictionary: require("dictionary-en-us"), + ...personalDictionary + } + ], + [ + require("retext-sentence-spacing"), + { + preferred: 1 + } + ], + require("retext-repeated-words"), + require("retext-usage"), + require("retext-indefinite-article"), + require("retext-redundant-acronyms"), + [ + require("retext-contractions"), + { + straight: true, + allowLiteral: true + } + ], + require("retext-diacritics"), + [ + require("retext-quotes"), + { + 
preferred: "straight" + } + ], + require("retext-equality"), + require("retext-passive"), + require("retext-profanities"), + [ + require("retext-readability"), + { + age: 20 + } + ] + ] + }) + ]*/ +]; diff --git a/.travis.yml b/.travis.yml index e1b89dfa..cb9d7290 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,13 +1,18 @@ +dist: xenial sudo: true language: c services: - docker + + # This is a hook to help us introduce "soft" errors on our process matrix: allow_failures: - env: ALLOW_SOFT_FAILURE_HERE=true + + # Install dependencies for all, once # install: @@ -16,12 +21,23 @@ install: - sudo pip install git-semver - docker info - source tests/installer/slack.sh +- export NOTIF_CHANNEL="automation-beta" +- if [ "${TRAVIS_REPO_SLUG}" = "netdata/netdata" ]; then export NOTIF_CHANNEL="automation"; fi; +- export BUILD_VERSION="$(cat packaging/version | cut -d'-' -f1)" +- if [[ "${TRAVIS_COMMIT_MESSAGE}" = *"[Build latest]"* ]]; then export BUILD_VERSION="$(cat packaging/version | cut -d'-' -f1,2 | sed -e 's/-/./g').latest"; fi; +- export DEPLOY_REPO="netdata" # Default production packaging repository +- if [[ "${TRAVIS_COMMIT_MESSAGE}" = *"[Build latest]"* ]]; then export DEPLOY_REPO="netdata-edge"; fi; +- export PACKAGING_USER="$(echo ${TRAVIS_REPO_SLUG} | cut -d'/' -f1)" + + # Setup notification system # notifications: webhooks: https://app.fossa.io/hooks/travisci + + # Define the stage sequence and conditionals # stages: @@ -29,6 +45,8 @@ stages: - name: Code quality, linting, syntax, code style - name: Build process - name: Artifacts validation +- name: Artifacts validation on bare OS, stable to current lifecycle checks + if: branch = master AND (type = pull_request OR type = cron) # Nightly operations - name: Nightly operations @@ -43,6 +61,148 @@ stages: - name: Publish for release if: branch = master AND type != pull_request AND type != cron AND commit_message =~ /(\[netdata release candidate\]|\[netdata major release\]|\[netdata minor release\]|\[netdata patch 
release\])/ + # Build DEB packages under special conditions + # Ubuntu +- name: "Package ubuntu/disco" + if: type != cron AND branch = master AND commit_message =~ /(\[Package arm64 DEB Ubuntu\]|\[Package arm64 DEB\]|\[Package i386 DEB Ubuntu\]|\[Package i386 DEB\]|\[Package AMD64 DEB Ubuntu\]|\[Package AMD64 DEB\])/ +- name: "Package ubuntu/cosmic" + if: type != cron AND branch = master AND commit_message =~ /(\[Package arm64 DEB Ubuntu\]|\[Package arm64 DEB\]|\[Package i386 DEB Ubuntu\]|\[Package i386 DEB\]|\[Package AMD64 DEB Ubuntu\]|\[Package AMD64 DEB\])/ +- name: "Package ubuntu/bionic" + if: type != cron AND branch = master AND commit_message =~ /(\[Package arm64 DEB Ubuntu\]|\[Package arm64 DEB\]|\[Package i386 DEB Ubuntu\]|\[Package i386 DEB\]|\[Package AMD64 DEB Ubuntu\]|\[Package AMD64 DEB\])/ +- name: "Package ubuntu/artful" + if: type != cron AND branch = master AND commit_message =~ /(\[Package arm64 DEB Ubuntu\]|\[Package arm64 DEB\]|\[Package i386 DEB Ubuntu\]|\[Package i386 DEB\]|\[Package AMD64 DEB Ubuntu\]|\[Package AMD64 DEB\])/ + + # Debian +- name: "Package debian/buster" + if: type != cron AND branch = master AND commit_message =~ /(\[Package arm64 DEB Debian\]|\[Package arm64 DEB\]|\[Package i386 DEB Debian\]|\[Package i386 DEB\]|\[Package AMD64 DEB Debian\]|\[Package AMD64 DEB\])/ +- name: "Package debian/stretch" + if: type != cron AND branch = master AND commit_message =~ /(\[Package arm64 DEB Debian\]|\[Package arm64 DEB\]|\[Package i386 DEB Debian\]|\[Package i386 DEB\]|\[Package AMD64 DEB Debian\]|\[Package AMD64 DEB\])/ +- name: "Package debian/jessie" + if: type != cron AND branch = master AND commit_message =~ /(\[Package arm64 DEB Debian\]|\[Package arm64 DEB\]|\[Package i386 DEB Debian\]|\[Package i386 DEB\]|\[Package AMD64 DEB Debian\]|\[Package AMD64 DEB\])/ +- name: "Package debian/wheezy" + if: type != cron AND branch = master AND commit_message =~ /(\[Package arm64 DEB Debian\]|\[Package arm64 DEB\]|\[Package i386 DEB 
Debian\]|\[Package i386 DEB\]|\[Package AMD64 DEB Debian\]|\[Package AMD64 DEB\])/ + + # Build RPM packages under special conditions + # Enterprise linux (Covers CentOS, Redhat, Amazon linux) +- name: "Package Enterprise Linux 7" + if: type != cron AND branch = master AND commit_message =~ /(\[Package arm64 RPM Enterprise Linux\]|\[Package arm64 RPM\]|\[Package i386 RPM Enterprise Linux\]|\[Package i386 RPM\]|\[Package AMD64 RPM Enterprise Linux\]|\[Package AMD64 RPM\])/ +- name: "Package Enterprise linux 6" + if: type != cron AND branch = master AND commit_message =~ /(\[Package i386 RPM Enterprise Linux\]|\[Package i386 RPM\]|\[Package AMD64 RPM Enterprise Linux\]|\[Package AMD64 RPM\])/ + + # Fedora +- name: "Package Fedora 30" + if: type != cron AND branch = master AND commit_message =~ /(\[Package arm64 RPM Fedora\]|\[Package arm64 RPM\]|\[Package AMD64 RPM Fedora\]|\[Package AMD64 RPM\])/ +- name: "Package Fedora 29" + if: type != cron AND branch = master AND commit_message =~ /(\[Package arm64 RPM Fedora\]|\[Package arm64 RPM\]|\[Package AMD64 RPM Fedora\]|\[Package AMD64 RPM\])/ +- name: "Package Fedora 28" + if: type != cron AND branch = master AND commit_message =~ /(\[Package arm64 RPM Fedora\]|\[Package arm64 RPM\]|\[Package AMD64 RPM Fedora\]|\[Package AMD64 RPM\])/ + + # OpenSuSE +- name: "Package OpenSuSE 15.1" + if: type != cron AND branch = master AND commit_message =~ /(\[Package arm64 RPM openSuSE\]|\[Package arm64 RPM\]|\[Package AMD64 RPM openSuSE\]|\[Package AMD64 RPM\])/ +- name: "Package OpenSuSE 15.0" + if: type != cron AND branch = master AND commit_message =~ /(\[Package arm64 RPM openSuSE\]|\[Package arm64 RPM\]|\[Package AMD64 RPM openSuSE\]|\[Package AMD64 RPM\])/ + + + +# DEB and RPM template flows +- stage: &_RPM_TEMPLATE + name: "Build & Publish RPM package" + before_install: + - sudo apt-get install -y wget lxc lxc-templates + - source tests/installer/slack.sh + before_script: + - post_message "TRAVIS_MESSAGE" "Starting package 
preparation and publishing for ${BUILD_STRING}.${BUILD_ARCH}" "${NOTIF_CHANNEL}" + - export PACKAGES_DIRECTORY="$(mktemp -d -t netdata-packaging-contents-dir-XXXXXX)" && echo "Created packaging directory ${PACKAGES_DIRECTORY}" + script: + - echo "Creating LXC environment for the build" && sudo -E .travis/package_management/create_lxc_for_build.sh + - echo "Building package in container" && sudo -E .travis/package_management/build_package_in_container.sh + - sudo chmod -R 755 "/var/lib/lxc" + - echo "Preparing RPM packaging contents for upload" && sudo -E .travis/package_management/prepare_packages.sh + after_failure: post_message "TRAVIS_MESSAGE" "Failed to build RPM for ${BUILD_STRING}.${BUILD_ARCH}" + + before_deploy: + - .travis/package_management/yank_stale_rpm.sh "${PACKAGES_DIRECTORY}" "${BUILD_STRING}" || echo "No stale RPM found" + deploy: + # Beta packages deployment + - provider: packagecloud + repository: "${DEPLOY_REPO}" + username: "${PACKAGING_USER}" + token: "${PKG_CLOUD_TOKEN}" + dist: "${BUILD_STRING}" + local_dir: "${PACKAGES_DIRECTORY}" + skip_cleanup: true + on: + # Only deploy on ${USER}/netdata, master branch, when packages directory is created + repo: ${TRAVIS_REPO_SLUG} + branch: master + condition: -d "${PACKAGES_DIRECTORY}" + # Production release packages deployment + - provider: packagecloud + repository: "netdata" + username: "netdata" + token: "${PKG_CLOUD_TOKEN}" + dist: "${BUILD_STRING}" + local_dir: "${PACKAGES_DIRECTORY}" + skip_cleanup: true + on: + # Only deploy on ${USER}/netdata, master branch, when packages directory is created + repo: "netdata/netdata" + branch: "master" + condition: -d "${PACKAGES_DIRECTORY}" + after_deploy: + - if [ -n "${BUILDER_NAME}" ]; then rm -rf /home/${BUILDER_NAME}/* && echo "Cleared /home/${BUILDER_NAME} directory" || echo "Failed to clean /home/${BUILDER_NAME} directory"; fi; + - if [ -d "${PACKAGES_DIRECTORY}" ]; then rm -rf "${PACKAGES_DIRECTORY}"; fi; + + + # TODO: This section is stale, will be 
aligned with the RPM implementation when we get to DEB packaging +- stage: &_DEB_TEMPLATE + name: "Build & Publish DEB package" + before_install: + - sudo apt-get install -y wget lxc lxc-templates + - source tests/installer/slack.sh + before_script: + - post_message "TRAVIS_MESSAGE" "Starting package preparation and publishing for ${BUILD_STRING}.${BUILD_ARCH}" "${NOTIF_CHANNEL}" + - export PACKAGES_DIRECTORY="$(mktemp -d -t netdata-packaging-contents-dir-XXXXXX)" && echo "Created packaging directory ${PACKAGES_DIRECTORY}" + script: + - echo "Creating LXC environment for the build" && sudo -E .travis/package_management/create_lxc_for_build.sh + - echo "Building package in container" && sudo -E .travis/package_management/build_package_in_container.sh + - sudo chmod -R 755 "/var/lib/lxc" + - echo "Preparing DEB packaging contents for upload" && sudo -E .travis/package_management/prepare_packages.sh + after_failure: post_message "TRAVIS_MESSAGE" "Failed to build DEB for ${BUILD_STRING}.${BUILD_ARCH}" + before_deploy: + - .travis/package_management/yank_stale_rpm.sh "${PACKAGES_DIRECTORY}" "${BUILD_STRING}" || echo "No stale DEB found" + deploy: + # Beta packages deployment + - provider: packagecloud + repository: "${DEPLOY_REPO}" + username: "${PACKAGING_USER}" + token: "${PKG_CLOUD_TOKEN}" + dist: "${BUILD_STRING}" + local_dir: "${PACKAGES_DIRECTORY}" + skip_cleanup: true + on: + # Only deploy on ${USER}/netdata, master branch, when build-area directory is created + repo: ${TRAVIS_REPO_SLUG} + branch: master + condition: -d "${PACKAGES_DIRECTORY}" + # Production release packages deployment + - provider: packagecloud + repository: "netdata" + username: "netdata" + token: "${PKG_CLOUD_TOKEN}" + dist: "${BUILD_STRING}" + local_dir: "${PACKAGES_DIRECTORY}" + skip_cleanup: true + on: + # Only deploy on ${USER}/netdata, master branch, when build-area directory is created + repo: "netdata/netdata" + branch: master + condition: -d "${PACKAGES_DIRECTORY}" + after_deploy: + - 
if [ -n "${BUILDER_NAME}" ]; then rm -rf /home/${BUILDER_NAME}/* && echo "Cleared /home/${BUILDER_NAME} directory" || echo "Failed to clean /home/${BUILDER_NAME} directory"; fi; + - if [ -d "${PACKAGES_DIRECTORY}" ]; then rm -rf "${PACKAGES_DIRECTORY}"; fi; + # Define stage implementation details @@ -88,7 +248,7 @@ jobs: - echo "packaging/version:" && cat packaging/version - docker run -it -v "${PWD}:/netdata:rw" -v "/tmp/netdata-makedist-test:/netdata_install:rw" -w /netdata "netdata/os-test:ubuntu1804" make clean || echo "Nothing to clean" - docker run -it -v "${PWD}:/netdata:rw" -v "/tmp/netdata-makedist-test:/netdata_install:rw" -w /netdata "netdata/os-test:ubuntu1804" make distclean || echo "Nothing to distclean" - - docker run -it -v "${PWD}:/netdata:rw" -v "/tmp/netdata-makedist-test:/netdata_install:rw" -w /netdata "netdata/os-test:ubuntu1804" autoreconf -ivf && ./configure --prefix=/netdata_install/usr --sysconfdir=/netdata_install/etc --localstatedir=/netdata_install/var --with-zlib --with-math --with-user=netdata CFLAGS=-O2 + - docker run -it -v "${PWD}:/netdata:rw" -v "/tmp/netdata-makedist-test:/netdata_install:rw" -w /netdata "netdata/os-test:ubuntu1804" /bin/bash -c "autoreconf -ivf && ./configure --prefix=/netdata_install/usr --sysconfdir=/netdata_install/etc --localstatedir=/netdata_install/var --with-zlib --with-math --with-user=netdata CFLAGS=-O2" - docker run -it -v "${PWD}:/netdata:rw" -v "/tmp/netdata-makedist-test:/netdata_install:rw" -w /netdata "netdata/os-test:ubuntu1804" make dist - docker run -it -v "${PWD}:/netdata:rw" -v "/tmp/netdata-makedist-test:/netdata_install:rw" -w /netdata "netdata/os-test:ubuntu1804" ls -ltr ./netdata-$(git describe).tar.gz || ls -ltr ./netdata-$(cat packaging/version | tr -d '\n').tar.gz - .travis/run_install_with_dist_file.sh @@ -135,10 +295,69 @@ jobs: + - stage: "Artifacts validation on bare OS, stable to current lifecycle checks" + + # Ubuntu runs + name: Run netdata lifecycle on Ubuntu 16.04 (xenial) + 
script: sudo -E tests/updater_checks.sh + after_failure: post_message "TRAVIS_MESSAGE" "Netdata updater process failed on bare Ubuntu 16.04" + + - name: Run netdata lifecycle, on Ubuntu 19.04 (Containerized) + script: docker run -it -v "${PWD}:/netdata:rw" -w /netdata "ubuntu:19.04" tests/updater_checks.sh + after_failure: post_message "TRAVIS_MESSAGE" "Netdata updater process failed on bare Ubuntu 19.04" + + # Centos runs + - name: Run netdata lifecycle on CentOS 7 (Containerized) + script: docker run -it -v "${PWD}:/netdata:rw" -w /netdata "centos:7" tests/updater_checks.sh + after_failure: post_message "TRAVIS_MESSAGE" "Netdata updater process failed on bare CentOS 7" + + # Debian runs + - name: Run netdata lifecycle, on Debian 9 (Containerized) + script: docker run -it -v "${PWD}:/netdata:rw" -w /netdata "debian:stretch" tests/updater_checks.sh + after_failure: post_message "TRAVIS_MESSAGE" "Netdata updater process failed on bare Debian 9 (stretch)" + + # openSuSE runs + - name: Run netdata lifecycle, on openSuSE 15.0 + script: docker run -it -v "${PWD}:/netdata:rw" -w /netdata "opensuse/leap:15.0" tests/updater_checks.sh + after_failure: post_message "TRAVIS_MESSAGE" "Netdata updater process failed on bare opensuse/leap:15.0" + + - name: Run netdata lifecycle, on openSuSE 15.1 + script: docker run -it -v "${PWD}:/netdata:rw" -w /netdata "opensuse/leap:15.1" tests/updater_checks.sh + after_failure: post_message "TRAVIS_MESSAGE" "Netdata updater process failed on bare opensuse/leap:15.1" + + - name: Run netdata lifecycle, on openSuSE Tumbleweed + script: docker run -it -v "${PWD}:/netdata:rw" -w /netdata "opensuse/tumbleweed:latest" tests/updater_checks.sh + after_failure: post_message "TRAVIS_MESSAGE" "Netdata updater process failed on bare opensuse/tumbleweed:latest" + + # Alpine runs + - name: Run netdata lifecycle, on Alpine linux + script: docker run -it -v "${PWD}:/netdata:rw" -w /netdata "alpine" tests/updater_checks.sh + after_failure: post_message 
"TRAVIS_MESSAGE" "Netdata updater process failed on bare Alpine" + + # Arch linux runs + - name: Run netdata lifecycle, on ArchLinux + script: docker run -it -v "${PWD}:/netdata:rw" -w /netdata "archlinux/base:latest" tests/updater_checks.sh + after_failure: post_message "TRAVIS_MESSAGE" "Netdata updater process failed on bare archlinux/base:latest" + + # Fedora runs + - name: Run netdata lifecycle, on Fedora 28 + script: docker run -it -v "${PWD}:/netdata:rw" -w /netdata "fedora:28" tests/updater_checks.sh + after_failure: post_message "TRAVIS_MESSAGE" "Netdata updater process failed on bare Fedora 28" + + - name: Run netdata lifecycle, on Fedora 29 + script: docker run -it -v "${PWD}:/netdata:rw" -w /netdata "fedora:29" tests/updater_checks.sh + after_failure: post_message "TRAVIS_MESSAGE" "Netdata updater process failed on bare Fedora 29" + + - name: Run netdata lifecycle, on Fedora 30 (Containerized) + script: docker run -it -v "${PWD}:/netdata:rw" -w /netdata "fedora:30" tests/updater_checks.sh + after_failure: post_message "TRAVIS_MESSAGE" "Netdata updater process failed on bare Fedora 30" + + + - stage: Packaging for release name: Generate changelog and TAG the release (only on special commit msg) - before_script: post_message "TRAVIS_MESSAGE" "Packaging step for release initiated" + before_script: post_message "TRAVIS_MESSAGE" "Packaging step for release initiated" "${NOTIF_CHANNEL}" script: - echo "GIT Branch:" && git branch - echo "Last commit:" && git log -1 @@ -154,11 +373,177 @@ jobs: - # We only publish if a TAG has been set during packaging + # ###### Packaging workflow section ###### + # References: + # https://us.images.linuxcontainers.org + # https://packagecloud.io/docs#install_repo + + # Ubuntu distros build + # + - stage: + <<: *_DEB_TEMPLATE + stage: "Package ubuntu/disco" + env: + - BUILDER_NAME="builder" BUILD_DISTRO="ubuntu" BUILD_RELEASE="disco" BUILD_STRING="ubuntu/disco" + - PACKAGE_TYPE="deb" REPO_TOOL="apt-get" + - 
ALLOW_SOFT_FAILURE_HERE=true + + + + - stage: + <<: *_DEB_TEMPLATE + stage: "Package ubuntu/cosmic" + env: + - BUILDER_NAME="builder" BUILD_DISTRO="ubuntu" BUILD_RELEASE="cosmic" BUILD_STRING="ubuntu/cosmic" + - PACKAGE_TYPE="deb" REPO_TOOL="apt-get" + - ALLOW_SOFT_FAILURE_HERE=true + + + + - stage: + <<: *_DEB_TEMPLATE + stage: "Package ubuntu/bionic" + env: + - BUILDER_NAME="builder" BUILD_DISTRO="ubuntu" BUILD_RELEASE="bionic" BUILD_STRING="ubuntu/bionic" + - PACKAGE_TYPE="deb" REPO_TOOL="apt-get" + - ALLOW_SOFT_FAILURE_HERE=true + + + + - stage: + <<: *_DEB_TEMPLATE + stage: "Package ubuntu/artful" + env: + - BUILDER_NAME="builder" BUILD_DISTRO="ubuntu" BUILD_RELEASE="artful" BUILD_STRING="ubuntu/artful" + - PACKAGE_TYPE="deb" REPO_TOOL="apt-get" + - ALLOW_SOFT_FAILURE_HERE=true + + + + # Debian distros build + - stage: + <<: *_DEB_TEMPLATE + stage: "Package debian/buster" + env: + - BUILDER_NAME="builder" BUILD_DISTRO="debian" BUILD_RELEASE="buster" BUILD_STRING="debian/buster" + - PACKAGE_TYPE="deb" REPO_TOOL="apt-get" + - ALLOW_SOFT_FAILURE_HERE=true + + + + - stage: + <<: *_DEB_TEMPLATE + stage: "Package debian/stretch" + env: + - BUILDER_NAME="builder" BUILD_DISTRO="debian" BUILD_RELEASE="stretch" BUILD_STRING="debian/stretch" + - PACKAGE_TYPE="deb" REPO_TOOL="apt-get" + - ALLOW_SOFT_FAILURE_HERE=true + + + + - stage: + <<: *_DEB_TEMPLATE + stage: "Package debian/jessie" + env: + - BUILDER_NAME="builder" BUILD_DISTRO="debian" BUILD_RELEASE="jessie" BUILD_STRING="debian/jessie" + - PACKAGE_TYPE="deb" REPO_TOOL="apt-get" + - ALLOW_SOFT_FAILURE_HERE=true + + + + - stage: + <<: *_DEB_TEMPLATE + stage: "Package debian/wheezy" + env: + - BUILDER_NAME="builder" BUILD_DISTRO="debian" BUILD_RELEASE="wheezy" BUILD_STRING="debian/wheezy" + - PACKAGE_TYPE="deb" REPO_TOOL="apt-get" + - ALLOW_SOFT_FAILURE_HERE=true + + + + # Enterprise linux builds (Centos, Redhat, Amazon linux (el/6)) + # + - stage: + <<: *_RPM_TEMPLATE + stage: "Package Enterprise Linux 7" + env: + - 
BUILDER_NAME="builder" BUILD_DISTRO="centos" BUILD_RELEASE="7" BUILD_STRING="el/7" + - PACKAGE_TYPE="rpm" REPO_TOOL="yum" + - ALLOW_SOFT_FAILURE_HERE=true + + + + - stage: + <<: *_RPM_TEMPLATE + stage: "Package Enterprise linux 6" + env: + - BUILDER_NAME="builder" BUILD_DISTRO="centos" BUILD_RELEASE="6" BUILD_STRING="el/6" + - PACKAGE_TYPE="rpm" REPO_TOOL="yum" + - ALLOW_SOFT_FAILURE_HERE=true + + + + # Fedora distros build + # + - stage: + <<: *_RPM_TEMPLATE + stage: "Package Fedora 30" + env: + - BUILDER_NAME="builder" BUILD_DISTRO="fedora" BUILD_RELEASE="30" BUILD_STRING="fedora/30" + - PACKAGE_TYPE="rpm" REPO_TOOL="dnf" + - ALLOW_SOFT_FAILURE_HERE=true + + + + - stage: + <<: *_RPM_TEMPLATE + stage: "Package Fedora 29" + env: + - BUILDER_NAME="builder" BUILD_DISTRO="fedora" BUILD_RELEASE="29" BUILD_STRING="fedora/29" + - PACKAGE_TYPE="rpm" REPO_TOOL="dnf" + - ALLOW_SOFT_FAILURE_HERE=true + + + + - stage: + <<: *_RPM_TEMPLATE + stage: "Package Fedora 28" + env: + - BUILDER_NAME="builder" BUILD_DISTRO="fedora" BUILD_RELEASE="28" BUILD_STRING="fedora/28" + - PACKAGE_TYPE="rpm" REPO_TOOL="dnf" + - ALLOW_SOFT_FAILURE_HERE=true + + + + # Opensuse distros build + # + - stage: + <<: *_RPM_TEMPLATE + stage: "Package OpenSuSE 15.1" + env: + - BUILDER_NAME="builder" BUILD_DISTRO="opensuse" BUILD_RELEASE="15.1" BUILD_STRING="opensuse/15.1" + - PACKAGE_TYPE="rpm" REPO_TOOL="zypper" + - ALLOW_SOFT_FAILURE_HERE=true + + + + - stage: + <<: *_RPM_TEMPLATE + stage: "Package OpenSuSE 15.0" + env: + - BUILDER_NAME="builder" BUILD_DISTRO="opensuse" BUILD_RELEASE="15.0" BUILD_STRING="opensuse/15.0" + - PACKAGE_TYPE="rpm" REPO_TOOL="zypper" + - ALLOW_SOFT_FAILURE_HERE=true + # ###### End of packaging workflow section ###### # + # ############################################### # + + + + # We only publish if a TAG has been set during packaging - stage: Publish for release name: Build & Publish docker images - before_script: post_message "TRAVIS_MESSAGE" "Publishing docker images" + 
before_script: post_message "TRAVIS_MESSAGE" "Publishing docker images" "${NOTIF_CHANNEL}" script: - echo "GIT Branch:" && git branch - echo "Last commit:" && git log -1 @@ -175,7 +560,7 @@ jobs: if: tag !~ /(-rc)/ - name: Create release draft - before_script: post_message "TRAVIS_MESSAGE" "Drafting release on github" + before_script: post_message "TRAVIS_MESSAGE" "Drafting release on github" "${NOTIF_CHANNEL}" script: - echo "GIT Branch:" && git branch - echo "Last commit:" && git log -1 @@ -191,12 +576,12 @@ jobs: - # This is the nightly pre-execution step (Jobs, preparatory steps for nightly, etc) + # This is the nightly pre-execution step (Jobs, preparatory steps for nightly, etc) - stage: Nightly operations name: Run coverity scan # Just notify people that Nightly ops triggered, use the first step as a hook to do that - before_script: post_message "TRAVIS_MESSAGE" "Starting nightly operations" + before_script: post_message "TRAVIS_MESSAGE" "Starting nightly operations" "${NOTIF_CHANNEL}" script: ./coverity-install.sh && ./coverity-scan.sh || echo "Coverity failed :(" - name: Kickstart files integrity testing (extended) @@ -207,7 +592,7 @@ jobs: # This is generating the changelog for nightly release and publish it - name: Generate nightly changelog - before_script: post_message "TRAVIS_MESSAGE" "Starting changelog generation for nightlies" + before_script: post_message "TRAVIS_MESSAGE" "Starting changelog generation for nightlies" "${NOTIF_CHANNEL}" script: ".travis/nightlies.sh" after_failure: post_message "TRAVIS_MESSAGE" "<!here> Nightly changelog generation failed" git: @@ -215,12 +600,12 @@ jobs: - # This is the nightly execution step - # + # This is the nightly execution step + # - stage: Nightly release name: Build & Publish docker images - before_script: post_message "TRAVIS_MESSAGE" "Publishing docker images for nightlies" + before_script: post_message "TRAVIS_MESSAGE" "Publishing docker images for nightlies" "${NOTIF_CHANNEL}" script: - echo "GIT 
Branch:" && git branch - echo "Last commit:" && git log -1 @@ -236,7 +621,7 @@ jobs: env: ALLOW_SOFT_FAILURE_HERE=true - name: Create nightly release artifacts, publish to GCS - before_script: post_message "TRAVIS_MESSAGE" "Starting artifacts generation for nightlies" + before_script: post_message "TRAVIS_MESSAGE" "Starting artifacts generation for nightlies" "${NOTIF_CHANNEL}" script: - echo "GIT Branch:" && git branch - echo "Last commit:" && git log -1 diff --git a/.travis/README.md b/.travis/README.md index 03ac2edd..b7b61ecb 100644 --- a/.travis/README.md +++ b/.travis/README.md @@ -95,3 +95,49 @@ During packaging we are preparing the release changelog information and run the ## Publish for release The publishing stage is the most complex part in publishing. This is the stage were we generate and publish docker images, prepare the release artifacts and get ready with the release draft. + +### Package Management workflows +As part of our goal to provide the best support to our customers, we have created a set of CI workflows to automatically produce +DEB and RPM for multiple distributions. These workflows are implemented under the templated stages '_DEB_TEMPLATE' and '_RPM_TEMPLATE'. +We currently plan to actively support the following Operating Systems, with a plan to further expand this list following our users needs. 
+ +### Operating systems supported +The following distributions are supported +- Debian versions + - Buster (TBD - not released yet, check [debian releases](https://www.debian.org/releases/) for details) + - Stretch + - Jessie + - Wheezy + +- Ubuntu versions + - Disco + - Cosmic + - Bionic + - Artful + +- Enterprise Linux versions (Covers Redhat, CentOS, and Amazon Linux with version 6) + - Version 8 (TBD) + - Version 7 + - Version 6 + +- Fedora versions + - Version 31 (TBD) + - Version 30 + - Version 29 + - Version 28 + +- OpenSuSE versions + - 15.1 + - 15.0 + +- Gentoo distributions + - TBD + +### Architectures supported +We plan to support amd64, x86 and arm64 architectures. As of June 2019 only amd64 and x86 will become available, as we are still working on solving issues with the arm64 architecture. + +The Package deployment can be triggered manually by executing an empty commit with the following message pattern: `[Package PACKAGE_ARCH PACKAGE_TYPE] DESCRIBE_THE_REASONING_HERE`. +Travis Yaml configuration allows the user to combine package type and architecture as necessary to regenerate the current stable release (For example tag v1.15.0 as of 4th of May 2019) +Sample patterns to trigger building of packages for all AMD64 supported architecture: +- '[Package AMD64 RPM]': Build & publish all amd64 available RPM packages +- '[Package AMD64 DEB]': Build & publish all amd64 available DEB packages diff --git a/.travis/labeler.sh b/.travis/labeler.sh index e8d7d228..7863084d 100755 --- a/.travis/labeler.sh +++ b/.travis/labeler.sh @@ -36,21 +36,6 @@ echo "===== Looking up available labels =====" LABELS_FILE=/tmp/labels hub issue labels >$LABELS_FILE -echo "===== Categorizing issues =====" -# This won't touch issues which already have at least one label assigned -for STATE in "open" "closed"; do - for ISSUE in $(hub issue -f "%I %l%n" -s "$STATE" -d "$(date +%F -d '1 day ago')" | grep -v -f $LABELS_FILE); do - echo "-------- Processing $STATE issue no. 
$ISSUE --------" - BODY="$(curl -H "Authorization: token $GITHUB_TOKEN" "https://api.github.com/repos/netdata/netdata/issues/$ISSUE" 2>/dev/null | jq .body)" - case "${BODY}" in - *"# Question summary"*) new_labels "$ISSUE" "question" "no changelog" ;; - *"# Bug report summary"*) new_labels "$ISSUE" "needs triage" "bug" ;; - *"# Feature idea summary"*) new_labels "$ISSUE" "needs triage" "feature request" ;; - *) new_labels "$ISSUE" "needs triage" "no changelog" ;; - esac - done -done - # Change all 'area' labels assigned to PR saving non-area labels. echo "===== Categorizing PRs =====" NEW_LABELS=/tmp/new_labels diff --git a/.travis/package_management/build_package_in_container.sh b/.travis/package_management/build_package_in_container.sh new file mode 100755 index 00000000..95a68e7a --- /dev/null +++ b/.travis/package_management/build_package_in_container.sh @@ -0,0 +1,82 @@ +#!/usr/bin/env bash +# +# Entry point for package build process +# +# Copyright: SPDX-License-Identifier: GPL-3.0-or-later +# +# Author : Pavlos Emm. Katsoulakis (paul@netdata.cloud) +#shellcheck disable=SC1091 +set -e + +# If we are not in netdata git repo, at the top level directory, fail +TOP_LEVEL=$(basename "$(git rev-parse --show-toplevel)") +CWD=$(git rev-parse --show-cdup) +if [ -n "$CWD" ] || [ ! "${TOP_LEVEL}" == "netdata" ]; then + echo "Run as .travis/package_management/$(basename "$0") from top level directory of netdata git repository" + echo "Docker build process aborted" + exit 1 +fi + +source .travis/package_management/functions.sh || (echo "Failed to load packaging library" && exit 1) + +# Check for presence of mandatory environment variables +if [ -z "${BUILD_STRING}" ]; then + echo "No Distribution was defined. Make sure BUILD_STRING is set on the environment before running this script" + exit 1 +fi + +if [ -z "${BUILDER_NAME}" ]; then + echo "No builder account and container name defined. 
Make sure BUILDER_NAME is set on the environment before running this script" + exit 1 +fi + +if [ -z "${BUILD_DISTRO}" ]; then + echo "No build distro information defined. Make sure BUILD_DISTRO is set on the environment before running this script" + exit 1 +fi + +if [ -z "${BUILD_RELEASE}" ]; then + echo "No build release information defined. Make sure BUILD_RELEASE is set on the environment before running this script" + exit 1 +fi + +if [ -z "${PACKAGE_TYPE}" ]; then + echo "No build release information defined. Make sure PACKAGE_TYPE is set on the environment before running this script" + exit 1 +fi + +# Detect architecture and load extra variables needed +detect_arch_from_commit + +case "${BUILD_ARCH}" in +"all") + echo "* * * Building all architectures, amd64 and i386 * * *" + echo "Building for amd64.." + export CONTAINER_NAME="${BUILDER_NAME}-${BUILD_DISTRO}${BUILD_RELEASE}-amd64" + export LXC_CONTAINER_ROOT="/var/lib/lxc/${CONTAINER_NAME}/rootfs" + .travis/package_management/trigger_${PACKAGE_TYPE}_lxc_build.py "${CONTAINER_NAME}" + + echo "Building for arm64.." + export CONTAINER_NAME="${BUILDER_NAME}-${BUILD_DISTRO}${BUILD_RELEASE}-arm64" + export LXC_CONTAINER_ROOT="/var/lib/lxc/${CONTAINER_NAME}/rootfs" + .travis/package_management/trigger_${PACKAGE_TYPE}_lxc_build.py "${CONTAINER_NAME}" + + echo "Building for i386.." + export CONTAINER_NAME="${BUILDER_NAME}-${BUILD_DISTRO}${BUILD_RELEASE}-i386" + export LXC_CONTAINER_ROOT="/var/lib/lxc/${CONTAINER_NAME}/rootfs" + .travis/package_management/trigger_${PACKAGE_TYPE}_lxc_build.py "${CONTAINER_NAME}" + + ;; +"amd64"|"arm64"|"i386") + echo "Building for ${BUILD_ARCH}.." 
+ export CONTAINER_NAME="${BUILDER_NAME}-${BUILD_DISTRO}${BUILD_RELEASE}-${BUILD_ARCH}" + export LXC_CONTAINER_ROOT="/var/lib/lxc/${CONTAINER_NAME}/rootfs" + .travis/package_management/trigger_${PACKAGE_TYPE}_lxc_build.py "${CONTAINER_NAME}" + ;; +*) + echo "Unknown build architecture '${BUILD_ARCH}', nothing to do for build" + exit 1 + ;; +esac + +echo "Build process completed!" diff --git a/.travis/package_management/common.py b/.travis/package_management/common.py new file mode 100755 index 00000000..6cf59293 --- /dev/null +++ b/.travis/package_management/common.py @@ -0,0 +1,46 @@ +# +# +# Python library with commonly used functions within the package management scope +# +# Author : Pavlos Emm. Katsoulakis <paul@netdata.cloud> + +import lxc +import subprocess + +def replace_tag(tag_name, spec, new_tag_content): + print("Fixing tag %s in %s" % (tag_name, spec)) + + ifp = open(spec, "r") + config = ifp.readlines() + ifp.close() + + source_line = -1 + for line in config: + if str(line).count(tag_name + ":") > 0: + source_line = config.index(line) + print("Found line: %s in item %d" % (line, source_line)) + break + + if source_line >= 0: + print("Replacing line %s with %s in spec file" %(config[source_line], new_tag_content)) + config[source_line] = "%s: %s\n" % (tag_name, new_tag_content) + config_str = ''.join(config) + ofp = open(spec, 'w') + ofp.write(config_str) + ofp.close() + +def run_command(container, command): + print("Running command: %s" % command) + command_result = container.attach_wait(lxc.attach_run_command, command) + + if command_result != 0: + raise Exception("Command failed with exit code %d" % command_result) + +def run_command_in_host(cmd): + print("Issue command in host: %s" % str(cmd)) + + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + o, e = proc.communicate() + print('Output: ' + o.decode('ascii')) + print('Error: ' + e.decode('ascii')) + print('code: ' + str(proc.returncode)) diff --git 
a/.travis/package_management/configure_deb_lxc_environment.py b/.travis/package_management/configure_deb_lxc_environment.py new file mode 100755 index 00000000..58999ad3 --- /dev/null +++ b/.travis/package_management/configure_deb_lxc_environment.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 +# +# Prepare the build environment within the container +# The script attaches to the running container and does the following: +# 1) Create the container +# 2) Start the container up +# 3) Create the builder user +# 4) Prepare the environment for DEB build +# +# Copyright: SPDX-License-Identifier: GPL-3.0-or-later +# +# Author : Pavlos Emm. Katsoulakis <paul@netdata.cloud> + +import common +import os +import sys +import lxc + +if len(sys.argv) != 2: + print('You need to provide a container name to get things started') + sys.exit(1) +container_name=sys.argv[1] + +# Setup the container object +print("Defining container %s" % container_name) +container = lxc.Container(container_name) +if not container.defined: + raise Exception("Container %s not defined!" % container_name) + +# Start the container +if not container.start(): + raise Exception("Failed to start the container") + +if not container.running or not container.state == "RUNNING": + raise Exception('Container %s is not running, configuration process aborted ' % container_name) + +# Wait for connectivity +print("Waiting for container connectivity to start configuration sequence") +if not container.get_ips(timeout=30): + raise Exception("Timeout while waiting for container") + +# Run the required activities now +# 1. Create the builder user +print("1. Adding user %s" % os.environ['BUILDER_NAME']) +common.run_command(container, ["useradd", "-m", os.environ['BUILDER_NAME']]) + +# Fetch package dependencies for the build +print("2. 
Installing package dependencies within LXC container") +common.run_command(container, ["apt-get", "update", "-y"]) +common.run_command(container, ["apt-get", "install", "-y", "sudo"]) +common.run_command(container, ["apt-get", "install", "-y", "wget"]) +common.run_command(container, ["apt-get", "install", "-y", "bash"]) +common.run_command(container, ["wget", "-T", "15", "-O", "~/.install-required-packages.sh", "https://raw.githubusercontent.com/netdata/netdata-demo-site/master/install-required-packages.sh"]) +common.run_command(container, ["bash", "~/.install-required-packages.sh", "netdata", "--dont-wait", "--non-interactive"]) + +# Download the source +dest_archive="/home/%s/netdata-%s.tar.gz" % (os.environ['BUILDER_NAME'],os.environ['BUILD_VERSION']) +release_url="https://github.com/netdata/netdata/releases/download/%s/netdata-%s.tar.gz" % (os.environ['BUILD_VERSION'], os.environ['BUILD_VERSION']) +print("3. Fetch netdata source (%s -> %s)" % (release_url, dest_archive)) +common.run_command(container, ["sudo", "-u", os.environ['BUILDER_NAME'], "wget", "-T", "15", "--output-document=" + dest_archive, release_url]) + +print("4. 
Extracting directory contents to /home " + os.environ['BUILDER_NAME']) +common.run_command(container, ["sudo", "-u", os.environ['BUILDER_NAME'], "tar", "xf", dest_archive, "-C", "/home/" + os.environ['BUILDER_NAME']]) + +print("Done!") diff --git a/.travis/package_management/configure_rpm_lxc_environment.py b/.travis/package_management/configure_rpm_lxc_environment.py new file mode 100755 index 00000000..644e027b --- /dev/null +++ b/.travis/package_management/configure_rpm_lxc_environment.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 +# +# +# Prepare the build environment within the container +# The script attaches to the running container and does the following: +# 1) Create the container +# 2) Start the container up +# 3) Create the builder user +# 4) Prepare the environment for RPM build +# +# Copyright: SPDX-License-Identifier: GPL-3.0-or-later +# +# Author : Pavlos Emm. Katsoulakis <paul@netdata.cloud> + +import common +import os +import sys +import lxc + +if len(sys.argv) != 2: + print('You need to provide a container name to get things started') + sys.exit(1) +container_name=sys.argv[1] + +# Setup the container object +print("Defining container %s" % container_name) +container = lxc.Container(container_name) +if not container.defined: + raise Exception("Container %s not defined!" % container_name) + +# Start the container +if not container.start(): + raise Exception("Failed to start the container") + +if not container.running or not container.state == "RUNNING": + raise Exception('Container %s is not running, configuration process aborted ' % container_name) + +# Wait for connectivity +print("Waiting for container connectivity to start configuration sequence") +if not container.get_ips(timeout=30): + raise Exception("Timeout while waiting for container") + +# Run the required activities now +# Create the builder user +print("1. 
Adding user %s" % os.environ['BUILDER_NAME']) +common.run_command(container, ["useradd", "-m", os.environ['BUILDER_NAME']]) + +# Fetch package dependencies for the build +print("2. Installing package dependencies within LXC container") +if str(os.environ["REPO_TOOL"]).count("zypper") == 1: + common.run_command(container, [os.environ["REPO_TOOL"], "clean", "-a"]) + common.run_command(container, [os.environ["REPO_TOOL"], "--no-gpg-checks", "update", "-y"]) + common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "json-glib-devel"]) + +elif str(os.environ["REPO_TOOL"]).count("yum") == 1: + common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "json-c-devel"]) + common.run_command(container, [os.environ["REPO_TOOL"], "clean", "all"]) + common.run_command(container, [os.environ["REPO_TOOL"], "update", "-y"]) + common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "epel-release"]) +else: + common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "json-c-devel"]) + common.run_command(container, [os.environ["REPO_TOOL"], "update", "-y"]) + +common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "sudo"]) +common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "wget"]) +common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "bash"]) +common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "freeipmi-devel"]) +common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "cups-devel"]) + +# Exceptional cases, not available everywhere +# + +# Not on Centos-7 +if os.environ["BUILD_STRING"].count("el/7") <= 0: + common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "libnetfilter_acct-devel"]) + +# Not on Centos-6 +if os.environ["BUILD_STRING"].count("el/6") <= 0: + common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "autoconf-archive"]) + +common.run_command(container, ["wget", "-T", "15", 
"-O", "/home/%s/.install-required-packages.sh" % (os.environ['BUILDER_NAME']), "https://raw.githubusercontent.com/netdata/netdata-demo-site/master/install-required-packages.sh"]) +common.run_command(container, ["bash", "/home/%s/.install-required-packages.sh" % (os.environ['BUILDER_NAME']), "netdata", "--dont-wait", "--non-interactive"]) + +print("3. Setting up macros") +common.run_command(container, ["sudo", "-u", os.environ['BUILDER_NAME'], "/bin/echo", "'%_topdir %(echo /home/" + os.environ['BUILDER_NAME'] + ")/rpmbuild' > /home/" + os.environ['BUILDER_NAME'] + "/.rpmmacros"]) + +print("4. Create rpmbuild directory") +common.run_command(container, ["sudo", "-u", os.environ['BUILDER_NAME'], "mkdir", "-p", "/home/" + os.environ['BUILDER_NAME'] + "/rpmbuild/BUILD"]) +common.run_command(container, ["sudo", "-u", os.environ['BUILDER_NAME'], "mkdir", "-p", "/home/" + os.environ['BUILDER_NAME'] + "/rpmbuild/RPMS"]) +common.run_command(container, ["sudo", "-u", os.environ['BUILDER_NAME'], "mkdir", "-p", "/home/" + os.environ['BUILDER_NAME'] + "/rpmbuild/SOURCES"]) +common.run_command(container, ["sudo", "-u", os.environ['BUILDER_NAME'], "mkdir", "-p", "/home/" + os.environ['BUILDER_NAME'] + "/rpmbuild/SPECS"]) +common.run_command(container, ["sudo", "-u", os.environ['BUILDER_NAME'], "mkdir", "-p", "/home/" + os.environ['BUILDER_NAME'] + "/rpmbuild/SRPMS"]) +common.run_command(container, ["sudo", "-u", os.environ['BUILDER_NAME'], "ls", "-ltrR", "/home/" + os.environ['BUILDER_NAME'] + "/rpmbuild"]) + +# Download the source +rpm_friendly_version="" +dest_archive="" +download_url="" +spec_file="/home/%s/rpmbuild/SPECS/netdata.spec" % os.environ['BUILDER_NAME'] + +# TODO: Checksum validations +if str(os.environ['BUILD_VERSION']).count(".latest") == 1: + version_list=str(os.environ['BUILD_VERSION']).replace('v', '').split('.') + rpm_friendly_version='.'.join(version_list[0:3]) + "." 
+ version_list[3] + + print("Building latest nightly version of netdata..(%s)" % os.environ['BUILD_VERSION']) + dest_archive="/home/%s/rpmbuild/SOURCES/netdata-%s.tar.gz" % (os.environ['BUILDER_NAME'], rpm_friendly_version) + + print("5. Preparing local latest implementation tarball for version %s" % rpm_friendly_version) + tar_file = os.environ['LXC_CONTAINER_ROOT'] + dest_archive + + print("5.1 Tagging the code with latest version: %s" % rpm_friendly_version) + common.run_command_in_host(['git', 'tag', '-a', rpm_friendly_version, '-m', 'Tagging while packaging on %s' % os.environ["CONTAINER_NAME"]]) + + print("5.2 Run autoreconf -ivf") + common.run_command_in_host(['autoreconf', '-ivf']) + + print("5.3 Run configure") + common.run_command_in_host(['./configure', '--with-math', '--with-zlib', '--with-user=netdata']) + + print("5.4 Run make dist") + common.run_command_in_host(['make', 'dist']) + + print("5.5 Copy generated tarbal to desired path") + if os.path.exists('netdata-%s.tar.gz' % rpm_friendly_version): + common.run_command_in_host(['sudo', 'cp', 'netdata-%s.tar.gz' % rpm_friendly_version, tar_file]) + + print("5.6 Fixing permissions on tarball") + common.run_command_in_host(['sudo', 'chmod', '777', tar_file]) + else: + print("I could not find (%s) on the disk, stopping the build. Kindly check the logs and try again" % 'netdata-%s.tar.gz' % rpm_friendly_version) + sys.exit(1) + + # Extract the spec file in place + print("6. Extract spec file from the source") + common.run_command_in_host(['sudo', 'cp', 'netdata.spec', os.environ['LXC_CONTAINER_ROOT'] + spec_file]) + common.run_command_in_host(['sudo', 'chmod', '777', os.environ['LXC_CONTAINER_ROOT'] + spec_file]) + + print("7. Temporary hack: Change Source0 to %s on spec file %s" % (dest_archive, spec_file)) + common.replace_tag("Source0", os.environ['LXC_CONTAINER_ROOT'] + spec_file, tar_file) +else: + rpm_friendly_version = os.environ['BUILD_VERSION'] + + print("Building latest stable version of netdata.. 
(%s)" % os.environ['BUILD_VERSION']) + dest_archive="/home/%s/rpmbuild/SOURCES/netdata-%s.tar.gz" % (os.environ['BUILDER_NAME'],os.environ['BUILD_VERSION']) + download_url="https://github.com/netdata/netdata/releases/download/%s/netdata-%s.tar.gz" % (os.environ['BUILD_VERSION'], os.environ['BUILD_VERSION']) + + print("5. Fetch netdata source into the repo structure(%s -> %s)" % (download_url, dest_archive)) + tar_file="%s/netdata-%s.tar.gz" % (os.path.dirname(dest_archive), rpm_friendly_version) + common.run_command(container, ["sudo", "-u", os.environ['BUILDER_NAME'], "wget", "-T", "15", "--output-document=" + dest_archive, download_url]) + + print("6.Extract spec file from the source") + common.run_command(container, ["sudo", "-u", os.environ['BUILDER_NAME'], "tar", "--to-command=cat > %s" % spec_file, "-xvf", dest_archive, "netdata-%s/netdata.spec" % os.environ['BUILD_VERSION']]) + + print("7. Temporary hack: Adjust version string on the spec file (%s) to %s and Source0 to %s" % (os.environ['LXC_CONTAINER_ROOT'] + spec_file, rpm_friendly_version, download_url)) + common.replace_tag("Version", os.environ['LXC_CONTAINER_ROOT'] + spec_file, rpm_friendly_version) + common.replace_tag("Source0", os.environ['LXC_CONTAINER_ROOT'] + spec_file, tar_file) + +print('Done!') diff --git a/.travis/package_management/create_lxc_for_build.sh b/.travis/package_management/create_lxc_for_build.sh new file mode 100755 index 00000000..ae855a74 --- /dev/null +++ b/.travis/package_management/create_lxc_for_build.sh @@ -0,0 +1,101 @@ +#!/usr/bin/env bash +# +# This script generates an LXC container and starts it up +# Once the script completes successfully, a container has become available for usage +# The container image to be used and the container name to be set, are part of variables +# that must be present for the script to work +# +# Copyright: SPDX-License-Identifier: GPL-3.0-or-later +# +# Author : Pavlos Emm. 
Katsoulakis (paul@netdata.cloud) +# shellcheck disable=SC1091 +set -e + +source .travis/package_management/functions.sh || (echo "Failed to load packaging library" && exit 1) + +# If we are not in netdata git repo, at the top level directory, fail +TOP_LEVEL=$(basename "$(git rev-parse --show-toplevel)") +CWD=$(git rev-parse --show-cdup) +if [ -n "$CWD" ] || [ ! "${TOP_LEVEL}" == "netdata" ]; then + echo "Run as .travis/package_management/$(basename "$0") from top level directory of netdata git repository" + echo "LXC Container creation aborted" + exit 1 +fi + +# Check for presence of mandatory environment variables +if [ -z "${BUILD_STRING}" ]; then + echo "No Distribution was defined. Make sure BUILD_STRING is set on the environment before running this script" + exit 1 +fi + +if [ -z "${BUILDER_NAME}" ]; then + echo "No builder account and container name defined. Make sure BUILDER_NAME is set on the environment before running this script" + exit 1 +fi + +if [ -z "${BUILD_DISTRO}" ]; then + echo "No build distro information defined. Make sure BUILD_DISTRO is set on the environment before running this script" + exit 1 +fi + +if [ -z "${BUILD_RELEASE}" ]; then + echo "No build release information defined. Make sure BUILD_RELEASE is set on the environment before running this script" + exit 1 +fi + +if [ -z "${PACKAGE_TYPE}" ]; then + echo "No build release information defined. Make sure PACKAGE_TYPE is set on the environment before running this script" + exit 1 +fi + +# Detect architecture and load extra variables needed +detect_arch_from_commit + +echo "Creating LXC container ${BUILDER_NAME}/${BUILD_STRING}/${BUILD_ARCH}...." + +case "${BUILD_ARCH}" in +"all") + # i386 + echo "Creating LXC Container for i386.." 
+ export CONTAINER_NAME="${BUILDER_NAME}-${BUILD_DISTRO}${BUILD_RELEASE}-i386" + export LXC_CONTAINER_ROOT="/var/lib/lxc/${CONTAINER_NAME}/rootfs" + lxc-create -n "${CONTAINER_NAME}" -t "download" -- --dist "${BUILD_DISTRO}" --release "${BUILD_RELEASE}" --arch "i386" --no-validate + + echo "Container(s) ready. Configuring container(s).." + .travis/package_management/configure_${PACKAGE_TYPE}_lxc_environment.py "${CONTAINER_NAME}" + + # amd64 + echo "Creating LXC Container for amd64.." + export CONTAINER_NAME="${BUILDER_NAME}-${BUILD_DISTRO}${BUILD_RELEASE}-amd64" + export LXC_CONTAINER_ROOT="/var/lib/lxc/${CONTAINER_NAME}/rootfs" + lxc-create -n "${CONTAINER_NAME}" -t "download" -- --dist "${BUILD_DISTRO}" --release "${BUILD_RELEASE}" --arch "amd64" --no-validate + + echo "Container(s) ready. Configuring container(s).." + .travis/package_management/configure_${PACKAGE_TYPE}_lxc_environment.py "${CONTAINER_NAME}" + + # arm64 + echo "Creating LXC Container for arm64.." + export CONTAINER_NAME="${BUILDER_NAME}-${BUILD_DISTRO}${BUILD_RELEASE}-arm64" + export LXC_CONTAINER_ROOT="/var/lib/lxc/${CONTAINER_NAME}/rootfs" + lxc-create -n "${CONTAINER_NAME}" -t "download" -- --dist "${BUILD_DISTRO}" --release "${BUILD_RELEASE}" --arch "arm64" --no-validate + + echo "Container(s) ready. Configuring container(s).." + .travis/package_management/configure_${PACKAGE_TYPE}_lxc_environment.py "${CONTAINER_NAME}" + ;; +"i386"|"amd64"|"arm64") + # AMD64 or i386 + echo "Creating LXC Container for ${BUILD_ARCH}.." + export CONTAINER_NAME="${BUILDER_NAME}-${BUILD_DISTRO}${BUILD_RELEASE}-${BUILD_ARCH}" + export LXC_CONTAINER_ROOT="/var/lib/lxc/${CONTAINER_NAME}/rootfs" + lxc-create -n "${CONTAINER_NAME}" -t "download" -- --dist "${BUILD_DISTRO}" --release "${BUILD_RELEASE}" --arch "${BUILD_ARCH}" --no-validate + + echo "Container(s) ready. Configuring container(s).." 
+ .travis/package_management/configure_${PACKAGE_TYPE}_lxc_environment.py "${CONTAINER_NAME}" + ;; +*) + echo "Unknown BUILD_ARCH value '${BUILD_ARCH}' given, process failed" + exit 1 + ;; +esac + +echo "..LXC creation complete!" diff --git a/.travis/package_management/functions.sh b/.travis/package_management/functions.sh new file mode 100644 index 00000000..9a467ffe --- /dev/null +++ b/.travis/package_management/functions.sh @@ -0,0 +1,33 @@ +# no-shebang-needed-its-a-library +# +# Utility functions for packaging in travis CI +# +# Copyright: SPDX-License-Identifier: GPL-3.0-or-later +# +# Author : Pavlos Emm. Katsoulakis (paul@netdata.cloud) +#shellcheck disable=SC2148 +set -e + +function detect_arch_from_commit { + case "${TRAVIS_COMMIT_MESSAGE}" in + "[Package AMD64"*) + export BUILD_ARCH="amd64" + ;; + "[Package i386"*) + export BUILD_ARCH="i386" + ;; + "[Package ALL"*) + export BUILD_ARCH="all" + ;; + "[Package arm64"*) + export BUILD_ARCH="arm64" + ;; + + *) + echo "Unknown build architecture '${BUILD_ARCH}' provided" + exit 1 + ;; + esac + + echo "Detected build architecture ${BUILD_ARCH}" +} diff --git a/.travis/package_management/package_cloud_wrapper.sh b/.travis/package_management/package_cloud_wrapper.sh new file mode 100755 index 00000000..48a372d3 --- /dev/null +++ b/.travis/package_management/package_cloud_wrapper.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash +# +# This is a tool to help removal of packages from packagecloud.io +# It utilizes the package_cloud utility provided from packagecloud.io +# +# Depends on: +# 1) package cloud gem (detects absence and installs it) +# +# Requires: +# 1) PKG_CLOUD_TOKEN variable exported +# 2) To properly install package_cloud when not found, it requires: ruby gcc gcc-c++ ruby-devel +# +# Copyright: SPDX-License-Identifier: GPL-3.0-or-later +# +# Author : Pavlos Emm. 
Katsoulakis (paul@netdata.cloud) +#shellcheck disable=SC2068,SC2145 +set -e +PKG_CLOUD_CONFIG="$HOME/.package_cloud_configuration.cfg" + +# If we are not in netdata git repo, at the top level directory, fail +TOP_LEVEL=$(basename "$(git rev-parse --show-toplevel)") +CWD=$(git rev-parse --show-cdup) +if [ -n "$CWD" ] || [ ! "${TOP_LEVEL}" == "netdata" ]; then + echo "Run as .travis/package_management/$(basename "$0") from top level directory of netdata git repository" + echo "Docker build process aborted" + exit 1 +fi + +# Install dependency if not there +if ! command -v package_cloud > /dev/null 2>&1; then + echo "No package cloud gem found, installing" + gem install -V package_cloud || (echo "Package cloud installation failed. you might want to check if required dependencies are there (ruby gcc gcc-c++ ruby-devel)" && exit 1) +else + echo "Found package_cloud gem, continuing" +fi + +# Check for required token and prepare config +if [ -z "${PKG_CLOUD_TOKEN}" ]; then + echo "Please set PKG_CLOUD_TOKEN to be able to use ${0}" + exit 1 +fi +echo "{\"url\":\"https://packagecloud.io\",\"token\":\"${PKG_CLOUD_TOKEN}\"}" > "${PKG_CLOUD_CONFIG}" + +echo "Executing package_cloud with config ${PKG_CLOUD_CONFIG} and parameters $@" +package_cloud $@ --config="${PKG_CLOUD_CONFIG}" + +rm -rf "${PKG_CLOUD_CONFIG}" +echo "Done!" diff --git a/.travis/package_management/prepare_packages.sh b/.travis/package_management/prepare_packages.sh new file mode 100755 index 00000000..1fb26a95 --- /dev/null +++ b/.travis/package_management/prepare_packages.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +# +# Utility that gathers generated packages, +# puts them together in a local folder for deploy facility to pick up +# +# Copyright: SPDX-License-Identifier: GPL-3.0-or-later +# +# Author : Pavlos Emm. 
Katsoulakis (paul@netdata.cloud) +#shellcheck disable=SC2068 +set -e + +# If we are not in netdata git repo, at the top level directory, fail +TOP_LEVEL=$(basename "$(git rev-parse --show-toplevel)") +CWD=$(git rev-parse --show-cdup) +if [ -n "$CWD" ] || [ ! "${TOP_LEVEL}" == "netdata" ]; then + echo "Run as .travis/package_management/$(basename "$0") from top level directory of netdata git repository" + echo "Package preparation aborted" + exit 1 +fi + +export LXC_ROOT="/var/lib/lxc" + +# Go through the containers created for packaging and pick up all generated packages +CREATED_CONTAINERS=$(ls -A "${LXC_ROOT}") +for d in ${CREATED_CONTAINERS[@]}; do + echo "Picking up packaging contents from ${d}" + + # Pick up any RPMS from builder + RPM_BUILD_PATH="${LXC_ROOT}/${d}/rootfs/home/${BUILDER_NAME}/rpmbuild" + echo "Checking folder ${RPM_BUILD_PATH} for RPMS and SRPMS" + + if [ -d "${RPM_BUILD_PATH}/RPMS" ]; then + echo "Copying any RPMS in '${RPM_BUILD_PATH}', copying over the following:" + ls -ltrR "${RPM_BUILD_PATH}/RPMS" + [[ -d "${RPM_BUILD_PATH}/RPMS/x86_64" ]] && cp -r "${RPM_BUILD_PATH}"/RPMS/x86_64/* "${PACKAGES_DIRECTORY}" + [[ -d "${RPM_BUILD_PATH}/RPMS/i386" ]] && cp -r "${RPM_BUILD_PATH}"/RPMS/i386/* "${PACKAGES_DIRECTORY}" + [[ -d "${RPM_BUILD_PATH}/RPMS/i686" ]] && cp -r "${RPM_BUILD_PATH}"/RPMS/i686/* "${PACKAGES_DIRECTORY}" + fi + + if [ -d "${RPM_BUILD_PATH}/SRPMS" ]; then + echo "Copying any SRPMS in '${RPM_BUILD_PATH}', copying over the following:" + ls -ltrR "${RPM_BUILD_PATH}/SRPMS" + [[ -d "${RPM_BUILD_PATH}/SRPMS/x86_64" ]] && cp -r "${RPM_BUILD_PATH}"/SRPMS/x86_64/* "${PACKAGES_DIRECTORY}" + [[ -d "${RPM_BUILD_PATH}/SRPMS/i386" ]] && cp -r "${RPM_BUILD_PATH}"/SRPMS/i386/* "${PACKAGES_DIRECTORY}" + [[ -d "${RPM_BUILD_PATH}/SRPMS/i686" ]] && cp -r "${RPM_BUILD_PATH}"/SRPMS/i686/* "${PACKAGES_DIRECTORY}" + fi + + # Pick up any DEBs from builder + DEB_BUILD_PATH="${d}/home/${BUILDER_NAME}/build-area" + echo "Checking folder ${DEB_BUILD_PATH} for 
DEB packages" + #TODO: During debian clean up we'll fill this up + +done + +chmod -R 777 "${PACKAGES_DIRECTORY}" +echo "Packaging contents ready to ship!" diff --git a/.travis/package_management/trigger_deb_lxc_build.py b/.travis/package_management/trigger_deb_lxc_build.py new file mode 100755 index 00000000..3040bdd6 --- /dev/null +++ b/.travis/package_management/trigger_deb_lxc_build.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +# +# This script is responsible for running the DEB build on the running container +# +# Copyright: SPDX-License-Identifier: GPL-3.0-or-later +# +# Author : Pavlos Emm. Katsoulakis <paul@netdata.cloud> + +import common +import os +import sys +import lxc + +if len(sys.argv) != 2: + print('You need to provide a container name to get things started') + sys.exit(1) +container_name=sys.argv[1] + +# Load the container, break if it's not there +print("Starting up container %s" % container_name) +container = lxc.Container(container_name) +if not container.defined: + raise Exception("Container %s does not exist!" 
% container_name) + +# Check if the container is running, attempt to start it up in case its not running +if not container.running or not container.state == "RUNNING": + print('Container %s is not running, attempt to start it up' % container_name) + + # Start the container + if not container.start(): + raise Exception("Failed to start the container") + + if not container.running or not container.state == "RUNNING": + raise Exception('Container %s is not running, configuration process aborted ' % container_name) + +# Wait for connectivity +if not container.get_ips(timeout=30): + raise Exception("Timeout while waiting for container") + +print("Setting up EMAIL and DEBFULLNAME variables required by the build tools") +os.environ["EMAIL"] = "bot@netdata.cloud" +os.environ["DEBFULLNAME"] = "Netdata builder" + +# Run the build process on the container +print("Starting DEB build process, running dh-make") +new_version = os.environ["BUILD_VERSION"].replace('v', '') + +print("Building the package") +common.run_command(container, ["sudo", "-u", os.environ['BUILDER_NAME'], "dpkg-buildpackage", "--host-arch", "amd64", "--target-arch", "amd64", "--post-clean", "--pre-clean", "--build=binary", "--release-by=\"Netdata Builder\"", "--build-by=\"Netdata Builder\""]) + +print('Done!') diff --git a/.travis/package_management/trigger_rpm_lxc_build.py b/.travis/package_management/trigger_rpm_lxc_build.py new file mode 100755 index 00000000..f9e109c7 --- /dev/null +++ b/.travis/package_management/trigger_rpm_lxc_build.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 +# +# This script is responsible for running the RPM build on the running container +# +# Copyright: SPDX-License-Identifier: GPL-3.0-or-later +# +# Author : Pavlos Emm. 
Katsoulakis <paul@netdata.cloud> + +import common +import os +import sys +import lxc + +if len(sys.argv) != 2: + print('You need to provide a container name to get things started') + sys.exit(1) +container_name=sys.argv[1] + +# Load the container, break if its not there +print("Starting up container %s" % container_name) +container = lxc.Container(container_name) +if not container.defined: + raise Exception("Container %s does not exist!" % container_name) + +# Check if the container is running, attempt to start it up in case its not running +if not container.running or not container.state == "RUNNING": + print('Container %s is not running, attempt to start it up' % container_name) + + # Start the container + if not container.start(): + raise Exception("Failed to start the container") + + if not container.running or not container.state == "RUNNING": + raise Exception('Container %s is not running, configuration process aborted ' % container_name) + +# Wait for connectivity +if not container.get_ips(timeout=30): + raise Exception("Timeout while waiting for container") + +print("Adding builder specific dependencies to the LXC container") +common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "rpm-build"]) +common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "rpm-devel"]) +common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "rpmlint"]) +common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "make"]) +common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "python"]) +common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "bash"]) +common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "diffutils"]) +common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "patch"]) +common.run_command(container, [os.environ["REPO_TOOL"], "install", "-y", "rpmdevtools"]) + +# Run the build process on the container +print("Starting RPM build 
process") +common.run_command(container, ["sudo", "-u", os.environ['BUILDER_NAME'], "rpmbuild", "-ba", "--rebuild", "/home/%s/rpmbuild/SPECS/netdata.spec" % os.environ['BUILDER_NAME']]) + +print('Done!') diff --git a/.travis/package_management/yank_stale_rpm.sh b/.travis/package_management/yank_stale_rpm.sh new file mode 100755 index 00000000..5cf93866 --- /dev/null +++ b/.travis/package_management/yank_stale_rpm.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# +# This script is responsible for the removal of stale RPM/DEB files. +# It runs on the pre-deploy step and takes care of the removal of the files +# prior to the upload of the freshly built ones +# +# Copyright: SPDX-License-Identifier: GPL-3.0-or-later +# +# Author : Pavlos Emm. Katsoulakis (paul@netdata.cloud) +#shellcheck disable=SC2010,SC2068 +set -e + +# If we are not in netdata git repo, at the top level directory, fail +TOP_LEVEL=$(basename "$(git rev-parse --show-toplevel)") +CWD=$(git rev-parse --show-cdup) +if [ -n "$CWD" ] || [ ! "${TOP_LEVEL}" == "netdata" ]; then + echo "Run as .travis/package_management/$(basename "$0") from top level directory of netdata git repository" + echo "Package yanking cancelled" + exit 1 +fi + +PACKAGES_DIR="$1" +DISTRO="$2" +PACKAGES_LIST="$(ls -AR "${PACKAGES_DIR}" | grep '\.rpm')" + +if [ ! -d "${PACKAGES_DIR}" ] || [ -z "${PACKAGES_LIST}" ]; then + echo "Folder ${PACKAGES_DIR} does not seem to be a valid directory or is empty. No packages to check for yanking" + exit 1 +fi + +for pkg in ${PACKAGES_LIST[@]}; do + echo "Attempting yank on ${pkg}.." 
+ .travis/package_management/package_cloud_wrapper.sh yank "${PACKAGING_USER}/${DEPLOY_REPO}/${DISTRO}" "${pkg}" || echo "Nothing to yank or error on ${pkg}" +done + diff --git a/CHANGELOG.md b/CHANGELOG.md index f217e8f8..ed154251 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,248 @@ # Changelog -## [v1.15.0](https://github.com/netdata/netdata/tree/v1.15.0) (2019-05-21) +## [v1.16.0](https://github.com/netdata/netdata/tree/v1.16.0) (2019-07-08) **Fixed bugs:** +- Double free or corruption \(fasttop\) [\#6370](https://github.com/netdata/netdata/issues/6370) +- Netdata not ignoring /dev and /run by default [\#6361](https://github.com/netdata/netdata/issues/6361) +- \[critical\] netdata segfault when restart service [\#6356](https://github.com/netdata/netdata/issues/6356) +- Backend as-collected values from statsd are 1000 time to high [\#6347](https://github.com/netdata/netdata/issues/6347) +- Tomcat collector break on invalid XML caused by single quotes in Memory Pool names \(code solution inside\) [\#6343](https://github.com/netdata/netdata/issues/6343) +- adaptec\_raid: failed to parse `arcconf GETCONFIG 1 LD` output [\#6337](https://github.com/netdata/netdata/issues/6337) +- Cannot reinstall netdata [\#6329](https://github.com/netdata/netdata/issues/6329) +- collectors/perf.plugin/perf\_plugin.c:171: error: 'PERF\_COUNT\_HW\_REF\_CPU\_CYCLES' undeclared here \(not in a function\) [\#6321](https://github.com/netdata/netdata/issues/6321) +- Never able to sign in [\#6306](https://github.com/netdata/netdata/issues/6306) +- /dev/fd/63: line 113: : command not found when trying to update [\#6289](https://github.com/netdata/netdata/issues/6289) +- Redirect cannot overwrite netdata. 
[\#6288](https://github.com/netdata/netdata/issues/6288) +- Netdata lateral menu hidden [\#6287](https://github.com/netdata/netdata/issues/6287) +- How to remove/unregister a streaming node to prevent the health alarms from triggering [\#6266](https://github.com/netdata/netdata/issues/6266) +- netdata/daemon: Service down with multiple "too many open files" occurring on DB engine [\#6265](https://github.com/netdata/netdata/issues/6265) +- RFE: Minor change to the spec.in file in order to help with building rpms on RHEL [\#6256](https://github.com/netdata/netdata/issues/6256) +- addgroup: gid '999' in use [\#6253](https://github.com/netdata/netdata/issues/6253) +- Starting netdata official container fails with `addgroup: gid '999' in use` [\#6251](https://github.com/netdata/netdata/issues/6251) +- Elasticsearch plugin error - 'module' object has no attribute 'Retry' [\#6248](https://github.com/netdata/netdata/issues/6248) +- "Missing charts" on v1.15.0-70-nightly \(\*\*solved\*\*\) [\#6244](https://github.com/netdata/netdata/issues/6244) +- netdata daemon collectors stuck [\#6239](https://github.com/netdata/netdata/issues/6239) +- URL Parser changes, correct health cmd api conditions to create silencer [\#6238](https://github.com/netdata/netdata/issues/6238) +- Error during installation in devuan [\#6230](https://github.com/netdata/netdata/issues/6230) +- kickstart.sh: Getting Nightly SHA256 sums from storage.googleapis.com times out [\#6227](https://github.com/netdata/netdata/issues/6227) +- Use major/minor from sys/types.h [\#6218](https://github.com/netdata/netdata/issues/6218) +- Update fails because of new system dependencies \(\*\*Resolved\*\*\) [\#6200](https://github.com/netdata/netdata/issues/6200) +- netdata/database: db engine crashing under certain conditions [\#6199](https://github.com/netdata/netdata/issues/6199) +- Update frequency for x509check alarm [\#6193](https://github.com/netdata/netdata/issues/6193) +- kickstart.sh: Getting Nightly SHA256 sums 
from storage.googleapis.com times out [\#6191](https://github.com/netdata/netdata/issues/6191) +- tv.html errors on https [\#6188](https://github.com/netdata/netdata/issues/6188) +- Error messages on old database files [\#6186](https://github.com/netdata/netdata/issues/6186) +- netdata/packaging: Sporadic job failures in Travis CI [\#6185](https://github.com/netdata/netdata/issues/6185) +- Fix date in pushbullet alarm notifications [\#6178](https://github.com/netdata/netdata/issues/6178) +- Chart's "name" not streamed [\#6177](https://github.com/netdata/netdata/issues/6177) +- Why is auto-update not working? V1.14.0 [\#6170](https://github.com/netdata/netdata/issues/6170) +- \[solved\] ZFS database \(was: netdata FATAL : MAIN :uv\_fs\_fsopen: invalid argument \# : Invalid argument\) [\#6161](https://github.com/netdata/netdata/issues/6161) +- httpcheck does not export some metrics in version 1.15 [\#6157](https://github.com/netdata/netdata/issues/6157) +- opensuse - installation by hand issues due to hardcoded libexec in netdata-installer.sh [\#6153](https://github.com/netdata/netdata/issues/6153) +- httpcheck causes lots of SYN\_SENT dangling sockets [\#6152](https://github.com/netdata/netdata/issues/6152) +- varnish plugin not showing \(varnish version 4\) [\#6149](https://github.com/netdata/netdata/issues/6149) +- Reduce number of codacy issues [\#6131](https://github.com/netdata/netdata/issues/6131) +- Optimize calls that gather system-info [\#6130](https://github.com/netdata/netdata/issues/6130) +- Fix telemetry config in installer [\#6129](https://github.com/netdata/netdata/issues/6129) +- web\_log reports unmatched lines [\#6125](https://github.com/netdata/netdata/issues/6125) +- Netdata 1.15 crashes on Ubuntu 16.04 [\#6117](https://github.com/netdata/netdata/issues/6117) +- netdata doesn't start with the new dbengine [\#6110](https://github.com/netdata/netdata/issues/6110) +- "mdstat.mdX\_disks" chart's total is misleading 
[\#6108](https://github.com/netdata/netdata/issues/6108) +- Telemetry rarely received from docker containers [\#6095](https://github.com/netdata/netdata/issues/6095) +- netdata/packaging/installer: Fine tuning based on user feedback and findings from testing [\#6094](https://github.com/netdata/netdata/issues/6094) +- opensuse - installation by hand issues due to hardcoded libexec in netdata-installer.sh [\#6092](https://github.com/netdata/netdata/issues/6092) +- Problem updating [\#6088](https://github.com/netdata/netdata/issues/6088) +- invalid help links on platform \(web log nginx/apache\) [\#6086](https://github.com/netdata/netdata/issues/6086) +- install on debian jessie [\#6083](https://github.com/netdata/netdata/issues/6083) +- error "cgroup-network-helper.sh: DEBUG: virsh command is not available" with the lastest docker image [\#6073](https://github.com/netdata/netdata/issues/6073) +- Kickstart script verification md5sum is out of date, verification fails [\#6049](https://github.com/netdata/netdata/issues/6049) +- AWS Kinesis dependency [\#6002](https://github.com/netdata/netdata/issues/6002) +- HTTP requests are classified as AF\_UNIX [\#5987](https://github.com/netdata/netdata/issues/5987) +- swapio chart is missing on CentOS 7 [\#5845](https://github.com/netdata/netdata/issues/5845) +- Netdata behind caddy reverse proxy wont login. 
[\#5794](https://github.com/netdata/netdata/issues/5794) +- netdata/packaging/installer: error when running the kickstart and also when uninstalling it with uninstaller [\#5745](https://github.com/netdata/netdata/issues/5745) +- cgroups name resolution doesn't work \(on Raspbian\) [\#5314](https://github.com/netdata/netdata/issues/5314) +- Old Monit metrics still remains in Netdata even after Netdata restarted [\#5074](https://github.com/netdata/netdata/issues/5074) +- netdata/packaging/docker: Fix docker documentation and a fix to avoid failures [\#6344](https://github.com/netdata/netdata/pull/6344) ([paulkatsoulakis](https://github.com/paulkatsoulakis)) +- Fix devuan support for initd [\#6275](https://github.com/netdata/netdata/pull/6275) ([paulkatsoulakis](https://github.com/paulkatsoulakis)) +- netdata/packaging/docker: Fix docker socket utilization, first pass [\#6233](https://github.com/netdata/netdata/pull/6233) ([paulkatsoulakis](https://github.com/paulkatsoulakis)) +- netdata/daemon: Eliminate a couple of warnings, plus tabs removal [\#6222](https://github.com/netdata/netdata/pull/6222) ([paulkatsoulakis](https://github.com/paulkatsoulakis)) +- netdata/packaging/ci: Add lifecycle checks to bare operating system installations [\#6209](https://github.com/netdata/netdata/pull/6209) ([paulkatsoulakis](https://github.com/paulkatsoulakis)) +- netdata/daemon: SSL fix - broken compilation case when ssl library not present! 
[\#6201](https://github.com/netdata/netdata/pull/6201) ([thiagoftsm](https://github.com/thiagoftsm)) +- netdata/packaging/installer: Fix updater issue on empty config value [\#6182](https://github.com/netdata/netdata/pull/6182) ([paulkatsoulakis](https://github.com/paulkatsoulakis)) +- netdata/packaging/installer: nits and fixes [\#6121](https://github.com/netdata/netdata/pull/6121) ([paulkatsoulakis](https://github.com/paulkatsoulakis)) +- netdata: Fix labels usage, quotes are not needed [\#6091](https://github.com/netdata/netdata/pull/6091) ([paulkatsoulakis](https://github.com/paulkatsoulakis)) + +**Closed issues:** + +- \[QUESTION\] Show Docker Container without ID [\#6358](https://github.com/netdata/netdata/issues/6358) +- Allow user to configure the maximum number of File Descriptors for the netdata service [\#6313](https://github.com/netdata/netdata/issues/6313) +- XMPP/Jabber notification support [\#6292](https://github.com/netdata/netdata/issues/6292) +- collector/freeipmi - option to remove the ID added to the label [\#6284](https://github.com/netdata/netdata/issues/6284) +- FreeIPMI - option to remove the ID added to the label [\#6283](https://github.com/netdata/netdata/issues/6283) +- Scope of 1.16-rc2 [\#6243](https://github.com/netdata/netdata/issues/6243) +- Documentation improvements [\#6214](https://github.com/netdata/netdata/issues/6214) +- monitor dnsmasq dhcp server leases [\#6206](https://github.com/netdata/netdata/issues/6206) +- Pihole stats modules [\#6204](https://github.com/netdata/netdata/issues/6204) +- Scope of 1.16-rc1 [\#6159](https://github.com/netdata/netdata/issues/6159) +- Netdata variable granularity Research [\#6148](https://github.com/netdata/netdata/issues/6148) +- SN\_EMPTY\_SLOT RRDDIMM value detection inside DB engine [\#6105](https://github.com/netdata/netdata/issues/6105) +- force page alignment per dimension of chart [\#6104](https://github.com/netdata/netdata/issues/6104) +- When the collector restarts after having 
stopped a long time ago fill the chart gaps efficiently [\#6103](https://github.com/netdata/netdata/issues/6103) +- Introduce cross-distro release testing on our CI [\#6102](https://github.com/netdata/netdata/issues/6102) +- Optimize the memory footprint of the Database Engine [\#6010](https://github.com/netdata/netdata/issues/6010) +- phpdaemon monitoring [\#6006](https://github.com/netdata/netdata/issues/6006) +- Secure streaming via SSL [\#6004](https://github.com/netdata/netdata/issues/6004) +- \[Binary releases\] Optimise netdata.spec file [\#5969](https://github.com/netdata/netdata/issues/5969) +- \[Binary releases\] Create a script to containerise the RPM build process [\#5967](https://github.com/netdata/netdata/issues/5967) +- Clearer communiation of telemetry [\#5863](https://github.com/netdata/netdata/issues/5863) +- alarm-notify.sh should respect the cloud base url setting [\#5791](https://github.com/netdata/netdata/issues/5791) +- Design k8s collector endpoint autodiscovery [\#5729](https://github.com/netdata/netdata/issues/5729) +- notify package maintainers of the new netdata releases [\#5682](https://github.com/netdata/netdata/issues/5682) +- \[preparation\] VMware Vsphere monitoring [\#5635](https://github.com/netdata/netdata/issues/5635) +- Feature: add VictoriaMetrics backend for long-term archiving [\#5619](https://github.com/netdata/netdata/issues/5619) +- Update docs for obsolete Python modules [\#5339](https://github.com/netdata/netdata/issues/5339) +- Hide: timestamps, memory values left, background and use graph as a gif [\#5186](https://github.com/netdata/netdata/issues/5186) +- Cookie consent for the Netdata sites [\#4798](https://github.com/netdata/netdata/issues/4798) +- netdata startup order on boot \(systemd\) [\#4266](https://github.com/netdata/netdata/issues/4266) +- RFC: registry v2 [\#3990](https://github.com/netdata/netdata/issues/3990) +- mail notifications wiki points to a non-existing file 
[\#3433](https://github.com/netdata/netdata/issues/3433) +- Simple way to disable alerts [\#3414](https://github.com/netdata/netdata/issues/3414) +- CPU performance monitoring [\#3232](https://github.com/netdata/netdata/issues/3232) +- \[RFE\] send notifications repeatedly until the alert is resolved [\#2956](https://github.com/netdata/netdata/issues/2956) +- allow netdata to know the plugin that collects each chart [\#2692](https://github.com/netdata/netdata/issues/2692) +- riak-rv support [\#2413](https://github.com/netdata/netdata/issues/2413) +- alarms to monitor the number of processes in a system [\#2239](https://github.com/netdata/netdata/issues/2239) +- Support OpenTSDB back-end via HTTP\(S\) API [\#1642](https://github.com/netdata/netdata/issues/1642) + +**Merged pull requests:** + +- Ignore /dev and /run space/inode usage [\#6399](https://github.com/netdata/netdata/pull/6399) ([vlvkobal](https://github.com/vlvkobal)) +- Update favicon with new logo [\#6398](https://github.com/netdata/netdata/pull/6398) ([cakrit](https://github.com/cakrit)) +- Update apps\_groups.conf for time group [\#6397](https://github.com/netdata/netdata/pull/6397) ([mbarper](https://github.com/mbarper)) +- Update to icons [\#6396](https://github.com/netdata/netdata/pull/6396) ([ivorjvr](https://github.com/ivorjvr)) +- Changed links from my-netdata.io to netdata.cloud [\#6389](https://github.com/netdata/netdata/pull/6389) ([joelhans](https://github.com/joelhans)) +- alarm-notify.sh should respect the cloud base url setting [\#6383](https://github.com/netdata/netdata/pull/6383) ([ladakis](https://github.com/ladakis)) +- Add a check for a macro declaration for the perf plugin [\#6382](https://github.com/netdata/netdata/pull/6382) ([vlvkobal](https://github.com/vlvkobal)) +- Add a .gitattributes file [\#6381](https://github.com/netdata/netdata/pull/6381) ([ac000](https://github.com/ac000)) +- Health fix double Free Corruption [\#6379](https://github.com/netdata/netdata/pull/6379) 
([thiagoftsm](https://github.com/thiagoftsm)) +- Health giving wrong message [\#6377](https://github.com/netdata/netdata/pull/6377) ([thiagoftsm](https://github.com/thiagoftsm)) +- Health could not read properly the health silencers file [\#6374](https://github.com/netdata/netdata/pull/6374) ([thiagoftsm](https://github.com/thiagoftsm)) +- Add more debug messages for pluginsd pipe errors [\#6373](https://github.com/netdata/netdata/pull/6373) ([vlvkobal](https://github.com/vlvkobal)) +- Improve documentation about file descriptors and systemd configuration. [\#6372](https://github.com/netdata/netdata/pull/6372) ([mfundul](https://github.com/mfundul)) +- netdata/packaging: Netdata binary packages generation - spec file refinement, support for nightlies \(RPM\) [\#6369](https://github.com/netdata/netdata/pull/6369) ([paulkatsoulakis](https://github.com/paulkatsoulakis)) +- apps.plugin: detect openldap server processes by default on Debian [\#6364](https://github.com/netdata/netdata/pull/6364) ([nodiscc](https://github.com/nodiscc)) +- Easily disable alarms, by persisting the silencers configuration [\#6360](https://github.com/netdata/netdata/pull/6360) ([thiagoftsm](https://github.com/thiagoftsm)) +- Redirect old site to new site at www.netdata.cloud [\#6359](https://github.com/netdata/netdata/pull/6359) ([cakrit](https://github.com/cakrit)) +- Better checks for nfacct headers [\#6351](https://github.com/netdata/netdata/pull/6351) ([vlvkobal](https://github.com/vlvkobal)) +- Tomcat status invalid XML fix [\#6345](https://github.com/netdata/netdata/pull/6345) ([Danamir](https://github.com/Danamir)) +- pihole: alarms delay fix [\#6342](https://github.com/netdata/netdata/pull/6342) ([ilyam8](https://github.com/ilyam8)) +- monit: obsolete service in runtime [\#6340](https://github.com/netdata/netdata/pull/6340) ([ilyam8](https://github.com/ilyam8)) +- adaptec\_raid: logical device regex fix [\#6338](https://github.com/netdata/netdata/pull/6338) 
([ilyam8](https://github.com/ilyam8)) +- Better context name to client context [\#6336](https://github.com/netdata/netdata/pull/6336) ([thiagoftsm](https://github.com/thiagoftsm)) +- installer: include go.d.plugin version v0.7.0 [\#6328](https://github.com/netdata/netdata/pull/6328) ([ilyam8](https://github.com/ilyam8)) +- pihole: add to the dashboard\_info [\#6325](https://github.com/netdata/netdata/pull/6325) ([ilyam8](https://github.com/ilyam8)) +- pihole collector: add alarms [\#6320](https://github.com/netdata/netdata/pull/6320) ([ilyam8](https://github.com/ilyam8)) +- dnsmasq\_dhcp: dhcp-range utilization alarm [\#6319](https://github.com/netdata/netdata/pull/6319) ([ilyam8](https://github.com/ilyam8)) +- Update the documentation on charts with zero metrics [\#6314](https://github.com/netdata/netdata/pull/6314) ([vlvkobal](https://github.com/vlvkobal)) +- fix elasticsearch plugin [\#6311](https://github.com/netdata/netdata/pull/6311) ([Wing924](https://github.com/Wing924)) +- Repeating alarm notifications [\#6309](https://github.com/netdata/netdata/pull/6309) ([thiagoftsm](https://github.com/thiagoftsm)) +- Chart name streaming [\#6304](https://github.com/netdata/netdata/pull/6304) ([vlvkobal](https://github.com/vlvkobal)) +- Handle file descriptors running out [\#6303](https://github.com/netdata/netdata/pull/6303) ([mfundul](https://github.com/mfundul)) +- Add note regarding libexecdir [\#6301](https://github.com/netdata/netdata/pull/6301) ([cakrit](https://github.com/cakrit)) +- stale bot limits update [\#6297](https://github.com/netdata/netdata/pull/6297) ([ilyam8](https://github.com/ilyam8)) +- \[freeipmi\] Remove id in sensor name when already unique [\#6296](https://github.com/netdata/netdata/pull/6296) ([Saruspete](https://github.com/Saruspete)) +- Web [\#6294](https://github.com/netdata/netdata/pull/6294) ([thiagoftsm](https://github.com/thiagoftsm)) +- doc: remove single/multi-threaded web server configuration 
[\#6291](https://github.com/netdata/netdata/pull/6291) ([nodiscc](https://github.com/nodiscc)) +- Add a riak plugin [\#6286](https://github.com/netdata/netdata/pull/6286) ([jchristgit](https://github.com/jchristgit)) +- netdata/packaging: Separate beta messages from production messages [\#6282](https://github.com/netdata/netdata/pull/6282) ([paulkatsoulakis](https://github.com/paulkatsoulakis)) +- Add more info on the stream.conf health enabled by default = auto option [\#6281](https://github.com/netdata/netdata/pull/6281) ([cakrit](https://github.com/cakrit)) +- Add comments about AWS SDK for C++ installation [\#6277](https://github.com/netdata/netdata/pull/6277) ([vlvkobal](https://github.com/vlvkobal)) +- Easily disable alarms, by persisting the silencers configuration [\#6274](https://github.com/netdata/netdata/pull/6274) ([thiagoftsm](https://github.com/thiagoftsm)) +- netdata/packaging: During install, many file not found were raised [\#6272](https://github.com/netdata/netdata/pull/6272) ([paulkatsoulakis](https://github.com/paulkatsoulakis)) +- netdata/packaging/documentation: Its Redhat, then came the others [\#6271](https://github.com/netdata/netdata/pull/6271) ([paulkatsoulakis](https://github.com/paulkatsoulakis)) +- netdata/packaging: Adjust CI notification logic [\#6268](https://github.com/netdata/netdata/pull/6268) ([paulkatsoulakis](https://github.com/paulkatsoulakis)) +- Update README.md [\#6264](https://github.com/netdata/netdata/pull/6264) ([mfundul](https://github.com/mfundul)) +- UrlService: add min required version check [\#6263](https://github.com/netdata/netdata/pull/6263) ([ilyam8](https://github.com/ilyam8)) +- Fix variable namespace in memory health check [\#6261](https://github.com/netdata/netdata/pull/6261) ([octomike](https://github.com/octomike)) +- Fix typo in nfacct.plugin [\#6260](https://github.com/netdata/netdata/pull/6260) ([vlvkobal](https://github.com/vlvkobal)) +- netdata/packaging: Fix netdata/netdata docker image failure, 
when users passing PGID that already exists on the system [\#6259](https://github.com/netdata/netdata/pull/6259) ([paulkatsoulakis](https://github.com/paulkatsoulakis)) +- PR to fix issue \#6238 [\#6242](https://github.com/netdata/netdata/pull/6242) ([thiagoftsm](https://github.com/thiagoftsm)) +- Update CODEOWNERS [\#6241](https://github.com/netdata/netdata/pull/6241) ([cakrit](https://github.com/cakrit)) +- Force page alignment per dimension of chart. [\#6240](https://github.com/netdata/netdata/pull/6240) ([mfundul](https://github.com/mfundul)) +- dns\_query\_time py module: saving dns request in 'r', checking response for answer, recording '-… [\#6237](https://github.com/netdata/netdata/pull/6237) ([n0coast](https://github.com/n0coast)) +- netdata/packaging: Add more distribution validations [\#6235](https://github.com/netdata/netdata/pull/6235) ([paulkatsoulakis](https://github.com/paulkatsoulakis)) +- Remove CNCF logo and TOC presentation reference [\#6234](https://github.com/netdata/netdata/pull/6234) ([dankohn](https://github.com/dankohn)) +- URL\_parser fixing allmetrics! [\#6231](https://github.com/netdata/netdata/pull/6231) ([thiagoftsm](https://github.com/thiagoftsm)) +- Perf plugin [\#6225](https://github.com/netdata/netdata/pull/6225) ([vlvkobal](https://github.com/vlvkobal)) +- netdata/packaging: Introducing automatic binary packages generation and delivery for RPM types \(Phase 1\) [\#6223](https://github.com/netdata/netdata/pull/6223) ([paulkatsoulakis](https://github.com/paulkatsoulakis)) +- Backend and SSL! [\#6220](https://github.com/netdata/netdata/pull/6220) ([thiagoftsm](https://github.com/thiagoftsm)) +- SSL\_fix\_format Fix wrong format used with SSL! [\#6219](https://github.com/netdata/netdata/pull/6219) ([thiagoftsm](https://github.com/thiagoftsm)) +- installer: include go.d.plugin version v0.6.1 [\#6217](https://github.com/netdata/netdata/pull/6217) ([ilyam8](https://github.com/ilyam8)) +- Fill chart gaps efficiently. 
[\#6216](https://github.com/netdata/netdata/pull/6216) ([mfundul](https://github.com/mfundul)) +- Add code style guidance to CONTRIBUTING [\#6212](https://github.com/netdata/netdata/pull/6212) ([cakrit](https://github.com/cakrit)) +- Simplify health cmdapi tester - no setup/cleanup needed [\#6210](https://github.com/netdata/netdata/pull/6210) ([cakrit](https://github.com/cakrit)) +- Visibility fix in anonymous-statistics.md [\#6208](https://github.com/netdata/netdata/pull/6208) ([cakrit](https://github.com/cakrit)) +- smartd documentation improvements [\#6207](https://github.com/netdata/netdata/pull/6207) ([cakrit](https://github.com/cakrit)) +- Add note to make smartd directory [\#6203](https://github.com/netdata/netdata/pull/6203) ([Steve8291](https://github.com/Steve8291)) +- Fix page cache descriptor race condition [\#6202](https://github.com/netdata/netdata/pull/6202) ([mfundul](https://github.com/mfundul)) +- Turn tv.html links to https [\#6198](https://github.com/netdata/netdata/pull/6198) ([cakrit](https://github.com/cakrit)) +- wmi collector: `fa-server` icon [\#6197](https://github.com/netdata/netdata/pull/6197) ([ilyam8](https://github.com/ilyam8)) +- Change print level from error to info [\#6195](https://github.com/netdata/netdata/pull/6195) ([mfundul](https://github.com/mfundul)) +- health: change x509check\_last\_collected\_secs alarm every to 60s [\#6194](https://github.com/netdata/netdata/pull/6194) ([ilyam8](https://github.com/ilyam8)) +- Documentation: Correct example list for python.d SimpleService [\#6189](https://github.com/netdata/netdata/pull/6189) ([kvisle](https://github.com/kvisle)) +- Terminate email header lines with \r\n [\#6187](https://github.com/netdata/netdata/pull/6187) ([toofar](https://github.com/toofar)) +- Make custom notification's instructions clearer [\#6181](https://github.com/netdata/netdata/pull/6181) ([cakrit](https://github.com/cakrit)) +- web log pattern fix [\#6180](https://github.com/netdata/netdata/pull/6180) 
([ilyam8](https://github.com/ilyam8)) +- Correct date used in pushbullet notifications [\#6179](https://github.com/netdata/netdata/pull/6179) ([cakrit](https://github.com/cakrit)) +- Support falling back to buffered I/O when direct I/O is unavailable [\#6174](https://github.com/netdata/netdata/pull/6174) ([mfundul](https://github.com/mfundul)) +- Add empty page detection in DB engine [\#6173](https://github.com/netdata/netdata/pull/6173) ([mfundul](https://github.com/mfundul)) +- Increase the cpu\_limit chart precision in cgroup plugin [\#6172](https://github.com/netdata/netdata/pull/6172) ([vlvkobal](https://github.com/vlvkobal)) +- varnish v4 compatibility fix [\#6168](https://github.com/netdata/netdata/pull/6168) ([ilyam8](https://github.com/ilyam8)) +- Update security policy [\#6166](https://github.com/netdata/netdata/pull/6166) ([cakrit](https://github.com/cakrit)) +- Fix mdstat disks chart [\#6164](https://github.com/netdata/netdata/pull/6164) ([vlvkobal](https://github.com/vlvkobal)) +- Properly add security policy [\#6163](https://github.com/netdata/netdata/pull/6163) ([cakrit](https://github.com/cakrit)) +- Fix typo in README [\#6146](https://github.com/netdata/netdata/pull/6146) ([cakrit](https://github.com/cakrit)) +- Documentation fixes [\#6144](https://github.com/netdata/netdata/pull/6144) ([cakrit](https://github.com/cakrit)) +- x509check: add last\_collected alarm [\#6139](https://github.com/netdata/netdata/pull/6139) ([ilyam8](https://github.com/ilyam8)) +- web\_log: remove trailing space from nginx\_ext2 pattern [\#6138](https://github.com/netdata/netdata/pull/6138) ([ilyam8](https://github.com/ilyam8)) +- Change 'netdata' to 'Netdata' in /docs/ and /README.md [\#6137](https://github.com/netdata/netdata/pull/6137) ([apardyl](https://github.com/apardyl)) +- DB engine optimize RAM usage [\#6134](https://github.com/netdata/netdata/pull/6134) ([mfundul](https://github.com/mfundul)) +- Optimize calls that gather system info 
[\#6128](https://github.com/netdata/netdata/pull/6128) ([cakrit](https://github.com/cakrit)) +- Fix telemetry config in netdata-installer [\#6127](https://github.com/netdata/netdata/pull/6127) ([cakrit](https://github.com/cakrit)) +- Pass correct info to run funct. [\#6126](https://github.com/netdata/netdata/pull/6126) ([Steve8291](https://github.com/Steve8291)) +- Add modules to charts.d.conf [\#6120](https://github.com/netdata/netdata/pull/6120) ([Steve8291](https://github.com/Steve8291)) +- add userstats charts for mysql [\#6118](https://github.com/netdata/netdata/pull/6118) ([kam1kaze](https://github.com/kam1kaze)) +- Active processes number alert [\#6116](https://github.com/netdata/netdata/pull/6116) ([apardyl](https://github.com/apardyl)) +- add mysql deadlocks chart [\#6115](https://github.com/netdata/netdata/pull/6115) ([kam1kaze](https://github.com/kam1kaze)) +- Remove system\_info copying [\#6113](https://github.com/netdata/netdata/pull/6113) ([vlvkobal](https://github.com/vlvkobal)) +- Fix incorrect module name: energi [\#6112](https://github.com/netdata/netdata/pull/6112) ([Steve8291](https://github.com/Steve8291)) +- Update README.md [\#6111](https://github.com/netdata/netdata/pull/6111) ([mfundul](https://github.com/mfundul)) +- installer: include go.d.plugin version v0.6.0 [\#6097](https://github.com/netdata/netdata/pull/6097) ([ilyam8](https://github.com/ilyam8)) +- Move call to send\_statistics later, to get more events from docker [\#6096](https://github.com/netdata/netdata/pull/6096) ([cakrit](https://github.com/cakrit)) +- Fix path \#6085 [\#6093](https://github.com/netdata/netdata/pull/6093) ([gmosx](https://github.com/gmosx)) +- Fix minor typos [\#6090](https://github.com/netdata/netdata/pull/6090) ([Steve8291](https://github.com/Steve8291)) +- Create missing /etc/netdata/custom-plugins.d [\#6089](https://github.com/netdata/netdata/pull/6089) ([Steve8291](https://github.com/Steve8291)) +- Corrected links to web\_log.conf 
[\#6087](https://github.com/netdata/netdata/pull/6087) ([cakrit](https://github.com/cakrit)) +- Mention anonymous statistics in additional places in the docs [\#6084](https://github.com/netdata/netdata/pull/6084) ([cakrit](https://github.com/cakrit)) +- Add "custom-plugins.d" to fix error in log file [\#6080](https://github.com/netdata/netdata/pull/6080) ([Steve8291](https://github.com/Steve8291)) +- New URL parser [\#6070](https://github.com/netdata/netdata/pull/6070) ([thiagoftsm](https://github.com/thiagoftsm)) +- wmi alarms [\#6068](https://github.com/netdata/netdata/pull/6068) ([ilyam8](https://github.com/ilyam8)) +- Add perforce server process monitoring [\#6064](https://github.com/netdata/netdata/pull/6064) ([akwan](https://github.com/akwan)) +- Prometheus remote write backend [\#6062](https://github.com/netdata/netdata/pull/6062) ([vlvkobal](https://github.com/vlvkobal)) +- SSL implementation for Netdata [\#5956](https://github.com/netdata/netdata/pull/5956) ([thiagoftsm](https://github.com/thiagoftsm)) +- NEW: local remark-lint checks and autofix support [\#5898](https://github.com/netdata/netdata/pull/5898) ([andvgal](https://github.com/andvgal)) +- use github templating mechanisms to classify issues when they are created [\#5776](https://github.com/netdata/netdata/pull/5776) ([paulfantom](https://github.com/paulfantom)) + +## [v1.15.0](https://github.com/netdata/netdata/tree/v1.15.0) (2019-05-22) + +**Fixed bugs:** + +- Fix rrdengineapi compiler warning [\#6075](https://github.com/netdata/netdata/issues/6075) +- New dbengine stil creates directories for individual charts in the cache directory. 
[\#6067](https://github.com/netdata/netdata/issues/6067) +- v1.15.0 fails to build due to missing CFLAGS [\#6066](https://github.com/netdata/netdata/issues/6066) - netdata/dbengine: constant restarts on octopuscs [\#6053](https://github.com/netdata/netdata/issues/6053) - Nodes are inconsistently unreachables [\#6051](https://github.com/netdata/netdata/issues/6051) - BUG when compiling code in mac OS [\#6043](https://github.com/netdata/netdata/issues/6043) @@ -14,6 +253,7 @@ - using with docker. got error after update. [\#6018](https://github.com/netdata/netdata/issues/6018) - Segfault on NetData v1.14.0-51-g18336910 [\#6013](https://github.com/netdata/netdata/issues/6013) - Slack does not send to channel [\#6003](https://github.com/netdata/netdata/issues/6003) +- api/v1/chart and api/v1/data calls don't return chart variables names, values [\#5990](https://github.com/netdata/netdata/issues/5990) - elasticsearch throws exception and kills whole python.d [\#5978](https://github.com/netdata/netdata/issues/5978) - System info doesn't show OS name and version in Mac OS X [\#5950](https://github.com/netdata/netdata/issues/5950) - nvidia\_smi wrong power draw numbers [\#5939](https://github.com/netdata/netdata/issues/5939) @@ -34,6 +274,7 @@ - \[Question\] Are python-pymongo and python-yaml needed for a barebones install of netdata? 
[\#5632](https://github.com/netdata/netdata/issues/5632) - python SocketService: lack of connect timeout, python.d.plugin hangs [\#5541](https://github.com/netdata/netdata/issues/5541) - installer wrong message on centos [\#5474](https://github.com/netdata/netdata/issues/5474) +- Misleading information on memory consumption [\#5203](https://github.com/netdata/netdata/issues/5203) - File not found by glob when building an rpm \(latest code - 20181218\) [\#5033](https://github.com/netdata/netdata/issues/5033) - Issue creating deb package from v1.11.1 [\#4979](https://github.com/netdata/netdata/issues/4979) - netdata service fails to start - pfsense [\#3469](https://github.com/netdata/netdata/issues/3469) @@ -78,7 +319,14 @@ **Merged pull requests:** +- Shorten netdata version and correctly send OS\_VERSION\_ID [\#6082](https://github.com/netdata/netdata/pull/6082) ([cakrit](https://github.com/cakrit)) +- Fix build errors [\#6081](https://github.com/netdata/netdata/pull/6081) ([mfundul](https://github.com/mfundul)) +- Fix race condition in DB engine API [\#6079](https://github.com/netdata/netdata/pull/6079) ([mfundul](https://github.com/mfundul)) +- Fix creating chart directories when not necessary with memory mode dbengine [\#6078](https://github.com/netdata/netdata/pull/6078) ([mfundul](https://github.com/mfundul)) +- Add more metrics to the prometheus bats tests [\#6074](https://github.com/netdata/netdata/pull/6074) ([cakrit](https://github.com/cakrit)) +- Add 1.15 to news section of main README [\#6065](https://github.com/netdata/netdata/pull/6065) ([cakrit](https://github.com/cakrit)) - Fix delete datafile error [\#6057](https://github.com/netdata/netdata/pull/6057) ([mfundul](https://github.com/mfundul)) +- Update swagger with definition of api/v1/alarm\_variables [\#6055](https://github.com/netdata/netdata/pull/6055) ([cakrit](https://github.com/cakrit)) - Reiterate the proper way to persist configurations in the Configuration guid 
[\#6052](https://github.com/netdata/netdata/pull/6052) ([cakrit](https://github.com/cakrit)) - netdata/daemon/backends: Fix AWS Kinesis link error [\#6047](https://github.com/netdata/netdata/pull/6047) ([paulkatsoulakis](https://github.com/paulkatsoulakis)) - netdata/packaging: Fix makeself packaging [\#6041](https://github.com/netdata/netdata/pull/6041) ([paulkatsoulakis](https://github.com/paulkatsoulakis)) @@ -226,7 +474,6 @@ - Errant netdata-updater logs on root "/" directory [\#5679](https://github.com/netdata/netdata/issues/5679) - remove python obsolete modules [\#5647](https://github.com/netdata/netdata/issues/5647) - Reinstalling with kickstart.sh fails [\#5584](https://github.com/netdata/netdata/issues/5584) -- .opt-out-from-anonymous-statistics still leaks referrer [\#5577](https://github.com/netdata/netdata/issues/5577) - Uninstaller fixes and instructions [\#5290](https://github.com/netdata/netdata/issues/5290) - Installer problem with config files under 'orig' symlink [\#5039](https://github.com/netdata/netdata/issues/5039) @@ -834,7 +1081,6 @@ - Phusion Passenger monitoring [\#4833](https://github.com/netdata/netdata/issues/4833) - Iis monitoring [\#4832](https://github.com/netdata/netdata/issues/4832) - Scaleio monitoring [\#4828](https://github.com/netdata/netdata/issues/4828) -- Gluster monitoring [\#4827](https://github.com/netdata/netdata/issues/4827) - Leofs monitoring [\#4826](https://github.com/netdata/netdata/issues/4826) - Jumpy data when running on kubernetes [\#4778](https://github.com/netdata/netdata/issues/4778) - Create documentation on how to opt-out of anonymous data collection [\#4746](https://github.com/netdata/netdata/issues/4746) @@ -1082,7 +1328,6 @@ - httpcheck do not accept URLs that do not end with com [\#3656](https://github.com/netdata/netdata/issues/3656) - httpcheck python.d plugin fails [\#3641](https://github.com/netdata/netdata/issues/3641) - Issue with statsd sample rate 
[\#3630](https://github.com/netdata/netdata/issues/3630) -- NetData and Kubernetes - Docker Name [\#3369](https://github.com/netdata/netdata/issues/3369) - netdata-uninstaller.sh not working \(with macOS 10.13\) [\#2941](https://github.com/netdata/netdata/issues/2941) - Problem with plugins in debug mode \(wrong path to cfgs\) [\#2593](https://github.com/netdata/netdata/issues/2593) - dashboard with thousands of charts [\#2275](https://github.com/netdata/netdata/issues/2275) diff --git a/CMakeLists.txt b/CMakeLists.txt index 870f7cf5..6bfab928 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -131,6 +131,15 @@ set(NETDATA_COMMON_LIBRARIES ${NETDATA_COMMON_LIBRARIES} ${OPENSSL_LIBRARIES}) set(NETDATA_COMMON_INCLUDE_DIRS ${NETDATA_COMMON_INCLUDE_DIRS} ${OPENSSL_INCLUDE_DIRS}) # ----------------------------------------------------------------------------- +# JSON-C used to health + +pkg_check_modules(JSON REQUIRED json-c) +set(NETDATA_COMMON_CFLAGS ${NETDATA_COMMON_CFLAGS} ${JSONC_CFLAGS_OTHER}) +set(NETDATA_COMMON_LIBRARIES ${NETDATA_COMMON_LIBRARIES} ${JSON_LIBRARIES}) +set(NETDATA_COMMON_INCLUDE_DIRS ${NETDATA_COMMON_INCLUDE_DIRS} ${JSON_INCLUDE_DIRS}) + + +# ----------------------------------------------------------------------------- # Detect libcap IF(LINUX) @@ -241,6 +250,27 @@ find_library(HAVE_KINESIS aws-cpp-sdk-kinesis) # later we use: # ${HAVE_KINESIS} + +# ----------------------------------------------------------------------------- +# Detect libprotobuf + +pkg_check_modules(PROTOBUF protobuf) +# later we use: +# ${PROTOBUF_LIBRARIES} +# ${PROTOBUF_CFLAGS_OTHER} +# ${PROTOBUF_INCLUDE_DIRS} + + +# ----------------------------------------------------------------------------- +# Detect libsnappy + +pkg_check_modules(SNAPPY snappy) +# later we use: +# ${SNAPPY_LIBRARIES} +# ${SNAPPY_CFLAGS_OTHER} +# ${SNAPPY_INCLUDE_DIRS} + + # ----------------------------------------------------------------------------- # netdata files @@ -284,7 +314,14 @@ 
set(LIBNETDATA_FILES libnetdata/threads/threads.h libnetdata/url/url.c libnetdata/url/url.h - ) + libnetdata/json/json.c + libnetdata/json/json.h + libnetdata/json/jsmn.c + libnetdata/json/jsmn.h + libnetdata/health/health.c + libnetdata/health/health.h + libnetdata/socket/security.c + libnetdata/socket/security.h) add_library(libnetdata OBJECT ${LIBNETDATA_FILES}) @@ -348,6 +385,10 @@ set(XENSTAT_PLUGIN_FILES collectors/xenstat.plugin/xenstat_plugin.c ) +set(PERF_PLUGIN_FILES + collectors/perf.plugin/perf_plugin.c + ) + set(PROC_PLUGIN_FILES collectors/proc.plugin/ipc.c collectors/proc.plugin/plugin_proc.c @@ -456,6 +497,8 @@ set(RRD_PLUGIN_FILES database/engine/rrdengineapi.h database/engine/pagecache.c database/engine/pagecache.h + database/engine/rrdenglocking.c + database/engine/rrdenglocking.h ) set(WEB_PLUGIN_FILES @@ -544,6 +587,11 @@ set(KINESIS_BACKEND_FILES backends/aws_kinesis/aws_kinesis_put_record.h ) +set(PROMETHEUS_REMOTE_WRITE_BACKEND_FILES + backends/prometheus/remote_write/remote_write.cc + backends/prometheus/remote_write/remote_write.h + ) + set(DAEMON_FILES daemon/common.c daemon/common.h @@ -609,6 +657,29 @@ ELSE() ENDIF() # ----------------------------------------------------------------------------- +# prometheus remote write backend + +IF(PROTOBUF_LIBRARIES AND SNAPPY_LIBRARIES) + SET(ENABLE_BACKEND_PROMETHEUS_REMOTE_WRITE True) +ELSE() + SET(ENABLE_BACKEND_PROMETHEUS_REMOTE_WRITE False) +ENDIF() + +IF(ENABLE_BACKEND_PROMETHEUS_REMOTE_WRITE) + message(STATUS "prometheus remote write backend: enabled") + + find_package(Protobuf REQUIRED) + protobuf_generate_cpp(PROTO_SRCS PROTO_HDRS backends/prometheus/remote_write/remote_write.proto) + + list(APPEND NETDATA_FILES ${PROMETHEUS_REMOTE_WRITE_BACKEND_FILES} ${PROTO_SRCS} ${PROTO_HDRS}) + list(APPEND NETDATA_COMMON_LIBRARIES ${PROTOBUF_LIBRARIES} ${SNAPPY_LIBRARIES}) + list(APPEND NETDATA_COMMON_INCLUDE_DIRS ${PROTOBUF_INCLUDE_DIRS} ${SNAPPY_INCLUDE_DIRS} ${CMAKE_CURRENT_BINARY_DIR}) + 
list(APPEND NETDATA_COMMON_CFLAGS ${PROTOBUF_CFLAGS_OTHER} ${SNAPPY_CFLAGS_OTHER}) +ELSE() + message(STATUS "prometheus remote write backend: disabled (requires protobuf and snappy libraries)") +ENDIF() + +# ----------------------------------------------------------------------------- # netdata set(NETDATA_COMMON_LIBRARIES ${NETDATA_COMMON_LIBRARIES} m ${CMAKE_THREAD_LIBS_INIT}) @@ -626,6 +697,7 @@ IF(LINUX) SET(ENABLE_PLUGIN_CGROUP_NETWORK True) SET(ENABLE_PLUGIN_APPS True) + SET(ENABLE_PLUGIN_PERF True) ELSEIF(FREEBSD) add_executable(netdata config.h ${NETDATA_FILES} ${FREEBSD_PLUGIN_FILES}) @@ -634,6 +706,7 @@ ELSEIF(FREEBSD) target_compile_options(netdata PUBLIC ${NETDATA_COMMON_CFLAGS}) SET(ENABLE_PLUGIN_CGROUP_NETWORK False) SET(ENABLE_PLUGIN_APPS True) + SET(ENABLE_PLUGIN_PERF False) ELSEIF(MACOS) add_executable(netdata config.h ${NETDATA_FILES} ${MACOS_PLUGIN_FILES}) @@ -642,10 +715,11 @@ ELSEIF(MACOS) target_compile_options(netdata PUBLIC ${NETDATA_COMMON_CFLAGS}) SET(ENABLE_PLUGIN_CGROUP_NETWORK False) SET(ENABLE_PLUGIN_APPS False) + SET(ENABLE_PLUGIN_PERF False) ENDIF() -IF(ENABLE_BACKEND_KINESIS) +IF(ENABLE_BACKEND_KINESIS OR ENABLE_BACKEND_PROMETHEUS_REMOTE_WRITE) set_property(TARGET netdata PROPERTY CXX_STANDARD 11) set_property(TARGET netdata PROPERTY CMAKE_CXX_STANDARD_REQUIRED ON) ENDIF() @@ -726,6 +800,20 @@ ENDIF() # ----------------------------------------------------------------------------- +# perf.plugin + +IF(ENABLE_PLUGIN_PERF) + message(STATUS "perf.plugin: enabled") + add_executable(perf.plugin config.h ${PERF_PLUGIN_FILES}) + target_link_libraries (perf.plugin libnetdata ${NETDATA_COMMON_LIBRARIES}) + target_include_directories(perf.plugin PUBLIC ${NETDATA_COMMON_INCLUDE_DIRS}) + target_compile_options(perf.plugin PUBLIC ${NETDATA_COMMON_CFLAGS}) +ELSE() + message(STATUS "perf.plugin: disabled") +ENDIF() + + +# ----------------------------------------------------------------------------- # cgroup-network IF(ENABLE_PLUGIN_CGROUP_NETWORK) 
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6183f05e..8847f0c2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -120,6 +120,18 @@ Your contributions should be bundled with related documentation to help users un When you contribute code to Netdata, you are automatically accepting that you will be responsible for maintaining that code in the future. So, if users need help, or report bugs, we will invite you to the related github issues to help them or fix the issues or bugs of your contributions. +#### Code Style + +The single most important rule when writing code is this: *check the surrounding code and try to imitate it*. [Reference](https://developer.gnome.org/programming-guidelines/stable/c-coding-style.html.en) + +We use several different languages and have had contributions from several people with different styles. When in doubt, you can check similar existing code. + +For C contributions in particular, we try to respect the [Linux kernel style](https://www.kernel.org/doc/html/v4.10/process/coding-style.html), with the following exceptions: + - Use 4 space indentation instead of 8 + - We occasionally have multiple statements on a single line (e.g. `if (a) b;`) + - Allow max line length of 120 chars + - Allow opening brace at the end of a function declaration: `function() {`. 
+ ### Your first pull request There are several guides for pull requests, such as the following: diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 08bbdcc8..0565644e 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -124,5 +124,6 @@ username|name|email (optional) @gmosx|George Moschovitis @adherzog|Adam Herzog|adam@adamherzog.com @skrzyp1|Jerzy S.| +@akwan|Alan Kwan| [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2FCONTRIBUTORS&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() diff --git a/Makefile.am b/Makefile.am index 41d0fe27..bc928bba 100644 --- a/Makefile.am +++ b/Makefile.am @@ -20,13 +20,9 @@ CLEANFILES= \ EXTRA_DIST = \ .gitignore \ - .codacy.yml \ - .codeclimate.yml \ .csslintrc \ .eslintignore \ .eslintrc \ - .lgtm.yml \ - .travis \ .github/CODEOWNERS \ build/m4/jemalloc.m4 \ build/m4/ax_c___atomic.m4 \ @@ -55,6 +51,7 @@ SUBDIRS = \ $(NULL) dist_noinst_DATA= \ + CHANGELOG.md \ cppcheck.sh \ configs.signatures \ contrib \ @@ -106,7 +103,7 @@ SUBDIRS += \ AM_CFLAGS = \ $(OPTIONAL_MATH_CFLAGS) \ - $(OPTIONAL_NFACCT_CLFAGS) \ + $(OPTIONAL_NFACCT_CFLAGS) \ $(OPTIONAL_ZLIB_CFLAGS) \ $(OPTIONAL_UUID_CFLAGS) \ $(OPTIONAL_LIBCAP_LIBS) \ @@ -154,6 +151,8 @@ LIBNETDATA_FILES = \ libnetdata/simple_pattern/simple_pattern.h \ libnetdata/socket/socket.c \ libnetdata/socket/socket.h \ + libnetdata/socket/security.c \ + libnetdata/socket/security.h \ libnetdata/statistical/statistical.c \ libnetdata/statistical/statistical.h \ libnetdata/storage_number/storage_number.c \ @@ -162,6 +161,12 @@ LIBNETDATA_FILES = \ libnetdata/threads/threads.h \ libnetdata/url/url.c \ libnetdata/url/url.h \ + libnetdata/json/json.c \ + libnetdata/json/json.h \ + libnetdata/json/jsmn.c \ + libnetdata/json/jsmn.h \ + libnetdata/health/health.c \ + libnetdata/health/health.h \ $(NULL) APPS_PLUGIN_FILES = \ @@ -235,6 +240,11 @@ 
XENSTAT_PLUGIN_FILES = \ $(LIBNETDATA_FILES) \ $(NULL) +PERF_PLUGIN_FILES = \ + collectors/perf.plugin/perf_plugin.c \ + $(LIBNETDATA_FILES) \ + $(NULL) + PROC_PLUGIN_FILES = \ collectors/proc.plugin/ipc.c \ collectors/proc.plugin/plugin_proc.c \ @@ -326,6 +336,8 @@ if ENABLE_DBENGINE database/engine/rrdengineapi.h \ database/engine/pagecache.c \ database/engine/pagecache.h \ + database/engine/rrdenglocking.c \ + database/engine/rrdenglocking.h \ $(NULL) endif @@ -437,6 +449,12 @@ KINESIS_BACKEND_FILES = \ backends/aws_kinesis/aws_kinesis_put_record.h \ $(NULL) +PROMETHEUS_REMOTE_WRITE_BACKEND_FILES = \ + backends/prometheus/remote_write/remote_write.cc \ + backends/prometheus/remote_write/remote_write.h \ + backends/prometheus/remote_write/remote_write.proto \ + $(NULL) + DAEMON_FILES = \ daemon/common.c \ daemon/common.h \ @@ -494,20 +512,21 @@ endif NETDATA_COMMON_LIBS = \ $(OPTIONAL_MATH_LIBS) \ $(OPTIONAL_ZLIB_LIBS) \ + $(OPTIONAL_SSL_LIBS) \ $(OPTIONAL_UUID_LIBS) \ $(OPTIONAL_UV_LIBS) \ $(OPTIONAL_LZ4_LIBS) \ $(OPTIONAL_JUDY_LIBS) \ $(OPTIONAL_SSL_LIBS) \ + $(OPTIONAL_JSONC_LIBS) \ $(NULL) -# TODO: Find more graceful way to add libs for AWS Kinesis sbin_PROGRAMS += netdata netdata_SOURCES = $(NETDATA_FILES) netdata_LDADD = \ $(NETDATA_COMMON_LIBS) \ $(NULL) -if ENABLE_BACKEND_KINESIS +if ENABLE_CXX_LINKER netdata_LINK = $(CXXLD) $(CXXFLAGS) $(LDFLAGS) -o $@ else netdata_LINK = $(CCLD) $(CFLAGS) $(LDFLAGS) -o $@ @@ -566,7 +585,30 @@ if ENABLE_PLUGIN_XENSTAT $(NULL) endif +if ENABLE_PLUGIN_PERF + plugins_PROGRAMS += perf.plugin + perf_plugin_SOURCES = $(PERF_PLUGIN_FILES) + perf_plugin_LDADD = \ + $(NETDATA_COMMON_LIBS) \ + $(NULL) +endif + if ENABLE_BACKEND_KINESIS netdata_SOURCES += $(KINESIS_BACKEND_FILES) netdata_LDADD += $(OPTIONAL_KINESIS_LIBS) endif + +if ENABLE_BACKEND_PROMETHEUS_REMOTE_WRITE + netdata_SOURCES += $(PROMETHEUS_REMOTE_WRITE_BACKEND_FILES) + netdata_LDADD += $(OPTIONAL_PROMETHEUS_REMOTE_WRITE_LIBS) + BUILT_SOURCES = \ + 
backends/prometheus/remote_write/remote_write.pb.cc \ + backends/prometheus/remote_write/remote_write.pb.h \ + $(NULL) + nodist_netdata_SOURCES = $(BUILT_SOURCES) + +backends/prometheus/remote_write/remote_write.pb.cc \ +backends/prometheus/remote_write/remote_write.pb.h: backends/prometheus/remote_write/remote_write.proto + $(PROTOC) --proto_path=$(srcdir) --cpp_out=$(builddir) $^ + +endif @@ -1,4 +1,4 @@ -# netdata [![Build Status](https://travis-ci.com/netdata/netdata.svg?branch=master)](https://travis-ci.com/netdata/netdata) [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/2231/badge)](https://bestpractices.coreinfrastructure.org/projects/2231) [![License: GPL v3+](https://img.shields.io/badge/License-GPL%20v3%2B-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Freadme&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() +# Netdata [![Build Status](https://travis-ci.com/netdata/netdata.svg?branch=master)](https://travis-ci.com/netdata/netdata) [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/2231/badge)](https://bestpractices.coreinfrastructure.org/projects/2231) [![License: GPL v3+](https://img.shields.io/badge/License-GPL%20v3%2B-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Freadme&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() [![Code Climate](https://codeclimate.com/github/netdata/netdata/badges/gpa.svg)](https://codeclimate.com/github/netdata/netdata) [![Codacy 
Badge](https://api.codacy.com/project/badge/Grade/a994873f30d045b9b4b83606c3eb3498)](https://www.codacy.com/app/netdata/netdata?utm_source=github.com&utm_medium=referral&utm_content=netdata/netdata&utm_campaign=Badge_Grade) [![LGTM C](https://img.shields.io/lgtm/grade/cpp/g/netdata/netdata.svg?logo=lgtm)](https://lgtm.com/projects/g/netdata/netdata/context:cpp) [![LGTM JS](https://img.shields.io/lgtm/grade/javascript/g/netdata/netdata.svg?logo=lgtm)](https://lgtm.com/projects/g/netdata/netdata/context:javascript) [![LGTM PYTHON](https://img.shields.io/lgtm/grade/python/g/netdata/netdata.svg?logo=lgtm)](https://lgtm.com/projects/g/netdata/netdata/context:python) @@ -12,48 +12,45 @@ _Netdata is **fast** and **efficient**, designed to permanently run on all syste Netdata is **free, open-source software** and it currently runs on **Linux**, **FreeBSD**, and **MacOS**. -![cncf](https://www.cncf.io/wp-content/uploads/2016/09/logo_cncf.png) - -Netdata is in the [Cloud Native Computing Foundation (CNCF) landscape](https://landscape.cncf.io/format=card-mode&grouping=no&sort=stars) and it is the 3rd most starred open-source project. -Check the [CNCF TOC Netdata presentation](https://docs.google.com/presentation/d/18C8bCTbtgKDWqPa57GXIjB2PbjjpjsUNkLtZEz6YK8s/edit?usp=sharing). +Netdata is not hosted by the CNCF but is the 3rd most starred open-source project in the [Cloud Native Computing Foundation (CNCF) landscape](https://landscape.cncf.io/format=card-mode&grouping=no&sort=stars). --- -People get **addicted to netdata**.<br/> +People get **addicted to Netdata**.<br/> Once you use it on your systems, **there is no going back**! 
*You have been warned...* ![image](https://user-images.githubusercontent.com/2662304/48305662-9de82980-e537-11e8-9f5b-aa1a60fbb82f.png) -[![Tweet about netdata!](https://img.shields.io/twitter/url/http/shields.io.svg?style=social&label=Tweet%20about%20netdata)](https://twitter.com/intent/tweet?text=Netdata,%20real-time%20performance%20and%20health%20monitoring,%20done%20right!&url=https://my-netdata.io/&via=linuxnetdata&hashtags=netdata,monitoring) +[![Tweet about Netdata!](https://img.shields.io/twitter/url/http/shields.io.svg?style=social&label=Tweet%20about%20netdata)](https://twitter.com/intent/tweet?text=Netdata,%20real-time%20performance%20and%20health%20monitoring,%20done%20right!&url=https://my-netdata.io/&via=linuxnetdata&hashtags=netdata,monitoring) ## Contents 1. [How it looks](#how-it-looks) - have a quick look at it -2. [User base](#user-base) - who uses netdata? +2. [User base](#user-base) - who uses Netdata? 3. [Quick Start](#quick-start) - try it now on your systems -4. [Why Netdata](#why-netdata) - why people love netdata, how it compares with other solutions -5. [News](#news) - latest news about netdata -6. [How it works](#how-it-works) - high level diagram of how netdata works -7. [infographic](#infographic) - everything about netdata, in a page +4. [Why Netdata](#why-netdata) - why people love Netdata, how it compares with other solutions +5. [News](#news) - latest news about Netdata +6. [How it works](#how-it-works) - high level diagram of how Netdata works +7. [infographic](#infographic) - everything about Netdata, in a page 8. [Features](#features) - what features does it have 9. [Visualization](#visualization) - unique visualization features 10. [What does it monitor](#what-does-it-monitor) - which metrics it collects 11. [Documentation](#documentation) - read the docs 12. [Community](#community) - discuss with others and get support -13. [License](#license) - check the license of netdata +13. 
[License](#license) - check the license of Netdata 14. [Is it any good?](#is-it-any-good) - Yes 15. [Is it awesome?](#is-it-awesome) - Yes ## How it looks -The following animated image, shows the top part of a typical netdata dashboard. +The following animated image, shows the top part of a typical Netdata dashboard. ![peek 2018-11-11 02-40](https://user-images.githubusercontent.com/2662304/48307727-9175c800-e55b-11e8-92d8-a581d60a4889.gif) -*A typical netdata dashboard, in 1:1 timing. Charts can be panned by dragging them, zoomed in/out with `SHIFT` + `mouse wheel`, an area can be selected for zoom-in with `SHIFT` + `mouse selection`. Netdata is highly interactive and **real-time**, optimized to get the work done!* +*A typical Netdata dashboard, in 1:1 timing. Charts can be panned by dragging them, zoomed in/out with `SHIFT` + `mouse wheel`, an area can be selected for zoom-in with `SHIFT` + `mouse selection`. Netdata is highly interactive and **real-time**, optimized to get the work done!* -> *We have a few online demos to experience it live: [https://my-netdata.io](https://my-netdata.io)* +> *We have a few online demos to experience it live: [https://www.netdata.cloud](https://www.netdata.cloud/#live-demo)* ## User base @@ -70,7 +67,7 @@ We provide docker images for the most common architectures. 
These are statistics [![netdata/netdata (official)](https://img.shields.io/docker/pulls/netdata/netdata.svg?label=netdata/netdata+%28official%29)](https://hub.docker.com/r/netdata/netdata/) [![firehol/netdata (deprecated)](https://img.shields.io/docker/pulls/firehol/netdata.svg?label=firehol/netdata+%28deprecated%29)](https://hub.docker.com/r/firehol/netdata/) [![titpetric/netdata (donated)](https://img.shields.io/docker/pulls/titpetric/netdata.svg?label=titpetric/netdata+%28third+party%29)](https://hub.docker.com/r/titpetric/netdata/) ### Registry -When you install multiple netdata, they are integrated into **one distributed application**, via a [netdata registry](registry/#registry). This is a web browser feature and it allows us to count the number of unique users and unique netdata servers installed. The following information comes from the global public netdata registry we run: +When you install multiple Netdata, they are integrated into **one distributed application**, via a [Netdata registry](registry/#registry). This is a web browser feature and it allows us to count the number of unique users and unique Netdata servers installed. 
The following information comes from the global public Netdata registry we run: [![User Base](https://registry.my-netdata.io/api/v1/badge.svg?chart=netdata.registry_entries&dimensions=persons&label=user%20base&units=M&value_color=blue&precision=2&divide=1000000&v43)](https://registry.my-netdata.io/#menu_netdata_submenu_registry) [![Monitored Servers](https://registry.my-netdata.io/api/v1/badge.svg?chart=netdata.registry_entries&dimensions=machines&label=servers%20monitored&units=k&divide=1000&value_color=orange&precision=2&v43)](https://registry.my-netdata.io/#menu_netdata_submenu_registry) [![Sessions Served](https://registry.my-netdata.io/api/v1/badge.svg?chart=netdata.registry_sessions&label=sessions%20served&units=M&value_color=yellowgreen&precision=2&divide=1000000&v43)](https://registry.my-netdata.io/#menu_netdata_submenu_registry) @@ -78,13 +75,13 @@ When you install multiple netdata, they are integrated into **one distributed ap ## Quick Start -You can quickly install netdata on a Linux box (physical, virtual, container, IoT) with the following command: +You can quickly install Netdata on a Linux box (physical, virtual, container, IoT) with the following command: ```sh # make sure you run `bash` for your shell bash -# install netdata, directly from github sources +# install Netdata, directly from github sources bash <(curl -Ss https://my-netdata.io/kickstart.sh) ``` ![](https://registry.my-netdata.io/api/v1/badge.svg?chart=web_log_nginx.requests_per_url&options=unaligned&dimensions=kickstart&group=sum&after=-3600&label=last+hour&units=installations&value_color=orange&precision=0) ![](https://registry.my-netdata.io/api/v1/badge.svg?chart=web_log_nginx.requests_per_url&options=unaligned&dimensions=kickstart&group=sum&after=-86400&label=today&units=installations&precision=0) @@ -96,7 +93,7 @@ The above command will: More installation methods and additional options can be found at the [installation page](packaging/installer/#installation). 
-To try netdata in a docker container, run this: +To try Netdata in a docker container, run this: ``` docker run -d --name=netdata \ @@ -109,10 +106,12 @@ docker run -d --name=netdata \ netdata/netdata ``` -For more information about running netdata in docker, check the [docker installation page](packaging/docker/). +For more information about running Netdata in docker, check the [docker installation page](packaging/docker/). ![image](https://user-images.githubusercontent.com/2662304/48304090-fd384080-e51b-11e8-80ae-eecb03118dda.png) +From Netdata v1.12 and above, anonymous usage information is collected by default and sent to Google Analytics. To read more about the information collected and how to opt-out, check the [anonymous statistics page](docs/anonymous-statistics.md). + ## Why Netdata Netdata has a quite different approach to monitoring. @@ -126,9 +125,9 @@ Netdata is a monitoring agent you install on all your systems. It is: All the above, are packaged together in a very flexible, extremely modular, distributed application. -This is how netdata compares to other monitoring solutions: +This is how Netdata compares to other monitoring solutions: -netdata|others (open-source and commercial) +Netdata|others (open-source and commercial) :---:|:---: **High resolution metrics** (1s granularity)|Low resolution metrics (10s granularity at best) Monitors everything, **thousands of metrics per node**|Monitor just a few metrics @@ -147,27 +146,57 @@ not just visualize metrics. ## News -`Apr 26th, 2019` - **[netdata v1.14.0 released!](https://github.com/netdata/netdata/releases)** +`May 21st, 2019` - **[Netdata v1.15.0 released!](https://github.com/netdata/netdata/releases)** + +Release v1.15.0 contains 11 bug fixes and 30 improvements. + +We are very happy and proud to be able to include two major improvements in this release: The aggregated node view and the [new database engine](https://docs.netdata.cloud/database/engine/). + +*Aggregated node view* + +The No. 
1 request from our community has been a better way to view and manage their Netdata installations, via an aggregated view. The node menu with the simple list of hosts on the agent UI just didn't do it for people with hundreds, or thousands of instances. This release introduces the node view, which uses the power of [Netdata Cloud](https://blog.netdata.cloud/posts/netdata-cloud-announcement/) to deliver powerful views of a Netdata-based monitoring infrastructure. You can read more about Netdata Cloud and the future of Netdata [here](https://blog.netdata.cloud/posts/netdata-cloud-announcement/). + +*New database engine* + +Historically, Netdata has required a lot of memory for long-term metrics storage. To mitigate this we've been building a new DB engine for several months and will continue improving until it can become the default `memory mode` for new Netdata installations. The version included in release v1.15.0 already permits longer-term storage of compressed data and we'll continue reducing the required memory in following releases. + +*Other major additions* + +We have added support for the [AWS Kinesis backend](https://docs.netdata.cloud/backends/aws_kinesis/) and new collectors for [OpenVPN](https://docs.netdata.cloud/collectors/go.d.plugin/modules/openvpn/), the [Tengine web server](https://docs.netdata.cloud/collectors/go.d.plugin/modules/tengine/), [ScaleIO (VxFlex OS)](https://docs.netdata.cloud/collectors/go.d.plugin/modules/scaleio/), [ioping-like latency metrics](https://docs.netdata.cloud/collectors/ioping.plugin/) and [Energi Core node instances](https://docs.netdata.cloud/collectors/python.d.plugin/energid/). + +We now have a new, ["text-only" chart type](https://github.com/netdata/netdata/issues/5578), [cpu limits for v2 cgroups](https://github.com/netdata/netdata/issues/5850), [docker swarm metrics](https://docs.netdata.cloud/collectors/go.d.plugin/modules/docker_engine/) and improved [documentation](https://docs.netdata.cloud/). 
+ +We continued improving the [Kubernetes helmchart](https://github.com/netdata/helmchart) with liveness probes for slaves, persistence options, a fix for a `Cannot allocate memory` issue and easy configuration for the kubelet, kube-proxy and coredns collectors. + +Finally, we built a process to quickly replace any problematic nightly builds and added more automated CI tests to prevent such builds from being published in the first place. + +--- + +`Apr 26th, 2019` - **[Netdata v1.14.0 released!](https://github.com/netdata/netdata/releases)** Release 1.14 contains 14 bug fixes and 24 improvements. -The release introduces major additions to Kubernetes monitoring, with tens of new charts for [Kubelet](https://docs.netdata.cloud/collectors/go.d.plugin/modules/k8s_kubelet/), [kube-proxy](https://docs.netdata.cloud/collectors/go.d.plugin/modules/k8s_kubeproxy/) and [coredns](https://github.com/netdata/go.d.plugin/tree/master/modules/coredns) metrics, as well as significant improvements to the netdata [helm chart](https://github.com/netdata/helmchart/). +The release introduces major additions to Kubernetes monitoring, with tens of new charts for [Kubelet](https://docs.netdata.cloud/collectors/go.d.plugin/modules/k8s_kubelet/), [kube-proxy](https://docs.netdata.cloud/collectors/go.d.plugin/modules/k8s_kubeproxy/) and [coredns](https://github.com/netdata/go.d.plugin/tree/master/modules/coredns) metrics, as well as significant improvements to the Netdata [helm chart](https://github.com/netdata/helmchart/). Two new collectors were added, to monitor [Docker hub](https://docs.netdata.cloud/collectors/go.d.plugin/modules/dockerhub/) and [Docker engine](https://docs.netdata.cloud/collectors/go.d.plugin/modules/docker_engine/) metrics. 
Finally, v1.14 adds support for [version 2 cgroups](https://github.com/netdata/netdata/pull/5407), [OpenLDAP over TLS](https://github.com/netdata/netdata/pull/5859), [NVIDIA SMI free and per process memory](https://github.com/netdata/netdata/pull/5796/files) and [configurable syslog facilities](https://github.com/netdata/netdata/pull/5792). -`Mar 14th, 2019` - **[netdata v1.13.0 released!](https://github.com/netdata/netdata/releases)** +--- + +`Mar 14th, 2019` - **[Netdata v1.13.0 released!](https://github.com/netdata/netdata/releases)** Release 1.13.0 contains 14 bug fixes and 8 improvements. -netdata has taken the first step into the world of Kubernetes, with a beta version of a [Helm chart](https://github.com/netdata/helmchart) for deployment to a k8s cluster and [proper naming](https://github.com/netdata/netdata/pull/5576) of the cgroup containers. We have [big plans](https://github.com/netdata/netdata/issues/5392) for Kubernetes, so stay tuned! +Netdata has taken the first step into the world of Kubernetes, with a beta version of a [Helm chart](https://github.com/netdata/helmchart) for deployment to a k8s cluster and [proper naming](https://github.com/netdata/netdata/pull/5576) of the cgroup containers. We have [big plans](https://github.com/netdata/netdata/issues/5392) for Kubernetes, so stay tuned! -A [major refactoring of the python.d plugin](https://github.com/netdata/netdata/pull/5552) has resulted in a dramatic decrease of the required memory, making netdata even more resource efficient. +A [major refactoring of the python.d plugin](https://github.com/netdata/netdata/pull/5552) has resulted in a dramatic decrease of the required memory, making Netdata even more resource efficient. We also added charts for IPC shared memory segments and total memory used. 
-`Feb 28th, 2019` - **[netdata v1.12.2 released!](https://github.com/netdata/netdata/releases)** +--- + +`Feb 28th, 2019` - **[Netdata v1.12.2 released!](https://github.com/netdata/netdata/releases)** Patch release 1.12.2 contains 7 bug fixes and 4 improvements. @@ -176,34 +205,40 @@ A "stable" installation and update channel was always on our roadmap, but it bec We are also introducing a new **Oracle DB collector** module, implemented in Python. -`Feb 21st, 2019` - **[netdata v1.12.1 released!](https://github.com/netdata/netdata/releases)** +--- + +`Feb 21st, 2019` - **[Netdata v1.12.1 released!](https://github.com/netdata/netdata/releases)** Patch release 1.12.1 contains 22 bug fixes and 8 improvements. -`Feb 14th, 2019` - **[netdata v1.12.0 released!](https://github.com/netdata/netdata/releases)** +--- + +`Feb 14th, 2019` - **[Netdata v1.12.0 released!](https://github.com/netdata/netdata/releases)** Release 1.12 is made out of 211 pull requests and 22 bug fixes. The key improvements are: -- Introducing `netdata.cloud`, the free netdata service for all netdata users +- Introducing `netdata.cloud`, the free Netdata service for all Netdata users - High performance plugins with go.d.plugin (data collection orchestrator written in Go) - 7 new data collectors and 11 rewrites of existing data collectors for improved performance -- A new management API for all netdata servers -- Bind different functions of the netdata APIs to different ports +- A new management API for all Netdata servers +- Bind different functions of the Netdata APIs to different ports - Improved installation and updates -`Nov 22nd, 2018` - **[netdata v1.11.1 released!](https://github.com/netdata/netdata/releases)** +--- + +`Nov 22nd, 2018` - **[Netdata v1.11.1 released!](https://github.com/netdata/netdata/releases)** - Improved internal database to support values above 64bit. 
- New data collection plugins: [`openldap`](collectors/python.d.plugin/openldap/), [`tor`](collectors/python.d.plugin/tor/), [`nvidia_smi`](collectors/python.d.plugin/nvidia_smi/). -- Improved data collection plugins: netdata now supports monitoring network interface aliases, [`smartd_log`](collectors/python.d.plugin/smartd_log/), [`cpufreq`](collectors/proc.plugin/README.md#cpu-frequency), [`sensors`](collectors/python.d.plugin/sensors/). +- Improved data collection plugins: Netdata now supports monitoring network interface aliases, [`smartd_log`](collectors/python.d.plugin/smartd_log/), [`cpufreq`](collectors/proc.plugin/README.md#cpu-frequency), [`sensors`](collectors/python.d.plugin/sensors/). - Health monitoring improvements: network interface congestion alarm restored, [`alerta.io`](health/notifications/alerta/), `conntrack_max`. - `my-netdata`menu has been refactored. - Packaging: `openrc` service definition got a few improvements. --- -`Sep 18, 2018` - **netdata has its own organization** +`Sep 18, 2018` - **Netdata has its own organization** Netdata used to be a [firehol.org](https://firehol.org) project, accessible as `firehol/netdata`. @@ -222,7 +257,7 @@ Function|Description|Documentation **Collect**|Multiple independent data collection workers are collecting metrics from their sources using the optimal protocol for each application and push the metrics to the database. 
Each data collection worker has lockless write access to the metrics it collects.|[`collectors`](collectors/#data-collection-plugins) **Store**|Metrics are stored in RAM in a round robin database (ring buffer), using a custom made floating point number for minimal footprint.|[`database`](database/#database) **Check**|A lockless independent watchdog is evaluating **health checks** on the collected metrics, triggers alarms, maintains a health transaction log and dispatches alarm notifications.|[`health`](health/#health-monitoring) -**Stream**|An lockless independent worker is streaming metrics, in full detail and in real-time, to remote netdata servers, as soon as they are collected.|[`streaming`](streaming/#streaming-and-replication) +**Stream**|A lockless independent worker is streaming metrics, in full detail and in real-time, to remote Netdata servers, as soon as they are collected.|[`streaming`](streaming/#streaming-and-replication) **Archive**|A lockless independent worker is down-sampling the metrics and pushes them to **backend** time-series databases.|[`backends`](backends/) **Query**|Multiple independent workers are attached to the [internal web server](web/server/#web-server), servicing API requests, including [data queries](web/api/queries/#database-queries).|[`web/api`](web/api/#api) @@ -230,7 +265,7 @@ The result is a highly efficient, low latency system, supporting multiple reader ## Infographic -This is a high level overview of netdata feature set and architecture. +This is a high-level overview of the Netdata feature set and architecture. Click it to to interact with it (it has direct links to documentation). 
[![image](https://user-images.githubusercontent.com/2662304/47672043-a47eb480-dbb9-11e8-92a4-fa422d053309.png)](https://my-netdata.io/infographic.html) @@ -271,7 +306,7 @@ This is what you should expect from Netdata: ### Positive and negative values -To improve clarity on charts, netdata dashboards present **positive** values for metrics representing `read`, `input`, `inbound`, `received` and **negative** values for metrics representing `write`, `output`, `outbound`, `sent`. +To improve clarity on charts, Netdata dashboards present **positive** values for metrics representing `read`, `input`, `inbound`, `received` and **negative** values for metrics representing `write`, `output`, `outbound`, `sent`. ![positive-and-negative-values](https://user-images.githubusercontent.com/2662304/48309090-7c5c6180-e57a-11e8-8e03-3a7538c14223.gif) @@ -287,14 +322,13 @@ Netdata charts automatically zoom vertically, to visualize the variation of each ### Charts are synchronized -Charts on netdata dashboards are synchronized to each other. There is no master chart. Any chart can be panned or zoomed at any time, and all other charts will follow. +Charts on Netdata dashboards are synchronized to each other. There is no master chart. Any chart can be panned or zoomed at any time, and all other charts will follow. ![charts-are-synchronized](https://user-images.githubusercontent.com/2662304/48309003-b4fb3b80-e578-11e8-86f6-f505c7059c15.gif) *Charts are panned by dragging them with the mouse. Charts can be zoomed in/out with`SHIFT` + `mouse wheel` while the mouse pointer is over a chart.* -> The visible time-frame (pan and zoom) is propagated from netdata server to netdata server, when navigating via the [`my-netdata` menu](registry#registry). - +> The visible time-frame (pan and zoom) is propagated from Netdata server to Netdata server, when navigating via the [node menu](registry#registry). 
### Highlighted time-frame @@ -304,19 +338,19 @@ To improve visual anomaly detection across charts, the user can highlight a time *A highlighted time-frame can be given by pressing `ALT` + `mouse selection` on any chart. Netdata will highlight the same range on all charts.* -> Highlighted ranges are propagated from netdata server to netdata server, when navigating via the [`my-netdata` menu](registry#registry). +> Highlighted ranges are propagated from Netdata server to Netdata server, when navigating via the [node menu](registry#registry). ## What does it monitor Netdata data collection is **extensible** - you can monitor anything you can get a metric for. -Its [Plugin API](collectors/plugins.d/) supports all programing languages (anything can be a netdata plugin, BASH, python, perl, node.js, java, Go, ruby, etc). +Its [Plugin API](collectors/plugins.d/) supports all programming languages (anything can be a Netdata plugin, BASH, python, perl, node.js, java, Go, ruby, etc). - For better performance, most system related plugins (cpu, memory, disks, filesystems, networking, etc) have been written in `C`. - For faster development and easier contributions, most application related plugins (databases, web servers, etc) have been written in `python`. #### APM (Application Performance Monitoring) -- **[statsd](collectors/statsd.plugin/)** - netdata is a fully featured statsd server. +- **[statsd](collectors/statsd.plugin/)** - Netdata is a fully featured statsd server. - **[Go expvar](collectors/python.d.plugin/go_expvar/)** - collects metrics exposed by applications written in the Go programming language using the expvar package. - **[Spring Boot](collectors/python.d.plugin/springboot/)** - monitors running Java Spring Boot applications that expose their metrics with the use of the Spring Boot Actuator included in Spring Boot library. - **[uWSGI](collectors/python.d.plugin/uwsgi/)** - collects performance metrics from uWSGI applications. 
@@ -499,23 +533,23 @@ You can easily extend Netdata, by writing plugins that collect data from any sou ## Documentation -The netdata documentation is at [https://docs.netdata.cloud](https://docs.netdata.cloud). But you can also find it inside the repo, so by just navigating the repo on github you can find all the documentation. +The Netdata documentation is at [https://docs.netdata.cloud](https://docs.netdata.cloud). But you can also find it inside the repo, so by just navigating the repo on github you can find all the documentation. Here is a quick list: Directory|Description :---|:--- -[`installer`](packaging/installer/)|Instructions to install netdata on your systems. -[`docker`](packaging/docker/)|Instructions to install netdata using docker. -[`daemon`](daemon/)|Information about the netdata daemon and its configuration. +[`installer`](packaging/installer/)|Instructions to install Netdata on your systems. +[`docker`](packaging/docker/)|Instructions to install Netdata using docker. +[`daemon`](daemon/)|Information about the Netdata daemon and its configuration. [`collectors`](collectors/)|Information about data collection plugins. -[`health`](health/)|How netdata's health monitoring works, how to create your own alarms and how to configure alarm notification methods. -[`streaming`](streaming/)|How to build hierarchies of netdata servers, by streaming metrics between them. +[`health`](health/)|How Netdata's health monitoring works, how to create your own alarms and how to configure alarm notification methods. +[`streaming`](streaming/)|How to build hierarchies of Netdata servers, by streaming metrics between them. [`backends`](backends/)|Long term archiving of metrics to industry standard time-series databases, like `prometheus`, `graphite`, `opentsdb`. -[`web/api`](web/api/)|Learn how to query the netdata API and the queries it supports. +[`web/api`](web/api/)|Learn how to query the Netdata API and the queries it supports. 
[`web/api/badges`](web/api/badges/)|Learn how to generate badges (SVG images) from live data. -[`web/gui/custom`](web/gui/custom/)|Learn how to create custom netdata dashboards. -[`web/gui/confluence`](web/gui/confluence/)|Learn how to create netdata dashboards on Atlassian's Confluence. +[`web/gui/custom`](web/gui/custom/)|Learn how to create custom Netdata dashboards. +[`web/gui/confluence`](web/gui/confluence/)|Learn how to create Netdata dashboards on Atlassian's Confluence. You can also check all the other directories. Most of them have plenty of documentation. @@ -525,7 +559,7 @@ We welcome [contributions](CONTRIBUTING.md). So, feel free to join the team. To report bugs, or get help, use [GitHub Issues](https://github.com/netdata/netdata/issues). -You can also find netdata on: +You can also find Netdata on: - [Facebook](https://www.facebook.com/linuxnetdata/) - [Twitter](https://twitter.com/linuxnetdata) @@ -535,7 +569,7 @@ You can also find netdata on: ## License -netdata is [GPLv3+](LICENSE). +Netdata is [GPLv3+](LICENSE). Netdata re-distributes other open-source tools and libraries. Please check the [third party licenses](REDISTRIBUTED.md). diff --git a/docs/Netdata-Security-and-Disclosure-Information.md b/SECURITY.md index 8e8a66af..f0296893 100644 --- a/docs/Netdata-Security-and-Disclosure-Information.md +++ b/SECURITY.md @@ -1,18 +1,18 @@ -# Netdata Security and Disclosure Information +# Security Policy -This page describes netdata security and disclosure information. +## Supported Versions -## Security Announcements +| Version | Supported | +| ------- | ------------------ | +| Latest | Yes | -Every time a security issue is fixed in netdata, we immediately release a new version of it. So, to get notified of all security incidents, please subscribe to our releases on github. - -## Report a Vulnerability +## Reporting a Vulnerability We’re extremely grateful for security researchers and users that report vulnerabilities to Netdata Open Source Community. 
All reports are thoroughly investigated by a set of community volunteers. -To make a report, please email the private [security@netdata.cloud](mailto:security@netdata.cloud) list with the security details and the details expected for [all netdata bug reports](../.github/ISSUE_TEMPLATE/bug_report.md). +To make a report, please create a post [here](https://groups.google.com/a/netdata.cloud/forum/#!newtopic/security) with the vulnerability details and the details expected for [all Netdata bug reports](.github/ISSUE_TEMPLATE/bug_report.md). -## When Should I Report a Vulnerability? +### When Should I Report a Vulnerability? - You think you discovered a potential security vulnerability in Netdata - You are unsure how a vulnerability affects Netdata @@ -24,7 +24,7 @@ To make a report, please email the private [security@netdata.cloud](mailto:secur - You need help applying security related updates - Your issue is not security related -## Security Vulnerability Response +### Security Vulnerability Response Each report is acknowledged and analyzed by Netdata Team members within 3 working days. This will set off a Security Release Process. @@ -32,8 +32,12 @@ Any vulnerability information shared with Netdata Team stays within Netdata proj As the security issue moves from triage, to identified fix, to release planning we will keep the reporter updated. -## Public Disclosure Timing +### Public Disclosure Timing A public disclosure date is negotiated by the Netdata team and the bug submitter. We prefer to fully disclose the bug as soon as possible once a user mitigation is available. It is reasonable to delay disclosure when the bug or the fix is not yet fully understood, the solution is not well-tested, or for vendor coordination. The timeframe for disclosure is from immediate (especially if it's already publicly known) to a few weeks. As a basic default, we expect report date to disclosure date to be on the order of 7 days. 
The Netdata team holds the final say when setting a disclosure date. -[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fdocs%2FNetdata-Security-and-Disclosure-Information&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() +### Security Announcements + +Every time a security issue is fixed in Netdata, we immediately release a new version of it. So, to get notified of all security incidents, please subscribe to our releases on github. + +[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fdocs%2FSECURITY&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() diff --git a/backends/README.md b/backends/README.md index efaba0ca..bdc40901 100644 --- a/backends/README.md +++ b/backends/README.md @@ -22,7 +22,7 @@ X seconds (though, it can send them per second if you need it to). metrics are sent to the backend server as `prefix.hostname.chart.dimension`. `prefix` is configured below, `hostname` is the hostname of the machine (can also be configured). - - **opentsdb** (`telnet interface`, used by **OpenTSDB**, **InfluxDB**, **KairosDB**, etc) + - **opentsdb** (`telnet or HTTP interfaces`, used by **OpenTSDB**, **InfluxDB**, **KairosDB**, etc) metrics are sent to opentsdb as `prefix.chart.dimension` with tag `host=hostname`. @@ -32,6 +32,12 @@ X seconds (though, it can send them per second if you need it to). - **prometheus** is described at [prometheus page](prometheus/) since it pulls data from netdata. 
+ - **prometheus remote write** (a binary snappy-compressed protocol buffer encoding over HTTP used by + a lot of [storage providers](https://prometheus.io/docs/operating/integrations/#remote-endpoints-and-storage)) + + metrics are labeled in the same format that Netdata uses for the [plaintext prometheus protocol](prometheus/). + Notes on using the remote write backend are [here](prometheus/remote_write/). + - **AWS Kinesis Data Streams** metrics are sent to the service in `JSON` format. @@ -70,7 +76,7 @@ of `netdata.conf` from your netdata): ``` [backend] enabled = yes | no - type = graphite | opentsdb | json | kinesis + type = graphite | opentsdb:telnet | opentsdb:http | opentsdb:https | prometheus_remote_write | json | kinesis host tags = list of TAG=VALUE destination = space separated list of [PROTOCOL:]HOST[:PORT] - the first working will be used, or a region for kinesis data source = average | sum | as collected @@ -86,7 +92,7 @@ of `netdata.conf` from your netdata): - `enabled = yes | no`, enables or disables sending data to a backend -- `type = graphite | opentsdb | json | kinesis`, selects the backend type +- `type = graphite | opentsdb:telnet | opentsdb:http | opentsdb:https | prometheus_remote_write | json | kinesis`, selects the backend type - `destination = host1 host2 host3 ...`, accepts **a space separated list** of hostnames, IPs (IPv4 and IPv6) and ports to connect to. diff --git a/backends/aws_kinesis/README.md b/backends/aws_kinesis/README.md index a9cc77d6..42497909 100644 --- a/backends/aws_kinesis/README.md +++ b/backends/aws_kinesis/README.md @@ -4,7 +4,11 @@ To use AWS Kinesis as a backend AWS SDK for C++ should be [installed](https://docs.aws.amazon.com/en_us/sdk-for-cpp/v1/developer-guide/setup.html) first. `libcrypto`, `libssl`, and `libcurl` are also required to compile netdata with Kinesis support enabled. Next, netdata should be re-installed from the source. The installer will detect that the required libraries are now available. 
-If AWS SDK for C++ is being installed from sources, it is useful to set `-DBUILD_ONLY="kinesis"`. Otherwise, the building process could take a very long time. +If the AWS SDK for C++ is being installed from source, it is useful to set `-DBUILD_ONLY="kinesis"`. Otherwise, the building process could take a very long time. Note that the default installation path for the libraries is `/usr/local/lib64`. Many Linux distributions don't include this path as the default one for a library search, so it is advisable to use the following options to `cmake` while building the AWS SDK: + +``` +cmake -DCMAKE_INSTALL_LIBDIR=/usr/lib -DCMAKE_INSTALL_INCLUDEDIR=/usr/include -DBUILD_SHARED_LIBS=OFF -DBUILD_ONLY=kinesis <aws-sdk-cpp sources> +``` ## Configuration diff --git a/backends/backends.c b/backends/backends.c index 0e791891..15a0cb41 100644 --- a/backends/backends.c +++ b/backends/backends.c @@ -246,6 +246,194 @@ static void backends_main_cleanup(void *ptr) { static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; } +/** + * Set Kinesis variables + * + * Set the variables necessary to work with this specific backend. + * + * @param default_port the default port of the backend + * @param brc function called to check the result. + * @param brf function called to format the message to the backend + * @param type the backend string selector. + */ +void backend_set_kinesis_variables(int *default_port, + backend_response_checker_t brc, + backend_request_formatter_t brf) +{ + (void)default_port; +#ifndef HAVE_KINESIS + (void)brc; + (void)brf; +#endif + +#if HAVE_KINESIS + *brc = process_json_response; + if (BACKEND_OPTIONS_DATA_SOURCE(global_backend_options) == BACKEND_SOURCE_DATA_AS_COLLECTED) + *brf = format_dimension_collected_json_plaintext; + else + *brf = format_dimension_stored_json_plaintext; +#endif +} + +/** + * Set Prometheus variables + * + * Set the variables necessary to work with this specific backend. 
+ * + * @param default_port the default port of the backend + * @param brc function called to check the result. + * @param brf function called to format the message to the backend + * @param type the backend string selector. + */ +void backend_set_prometheus_variables(int *default_port, + backend_response_checker_t brc, + backend_request_formatter_t brf) +{ + (void)default_port; + (void)brf; +#ifndef ENABLE_PROMETHEUS_REMOTE_WRITE + (void)brc; +#endif + +#if ENABLE_PROMETHEUS_REMOTE_WRITE + *brc = process_prometheus_remote_write_response; +#endif /* ENABLE_PROMETHEUS_REMOTE_WRITE */ +} + +/** + * Set JSON variables + * + * Set the variables necessary to work with this specific backend. + * + * @param default_port the default port of the backend + * @param brc function called to check the result. + * @param brf function called to format the message to the backend + * @param type the backend string selector. + */ +void backend_set_json_variables(int *default_port, + backend_response_checker_t brc, + backend_request_formatter_t brf) +{ + *default_port = 5448; + *brc = process_json_response; + + if (BACKEND_OPTIONS_DATA_SOURCE(global_backend_options) == BACKEND_SOURCE_DATA_AS_COLLECTED) + *brf = format_dimension_collected_json_plaintext; + else + *brf = format_dimension_stored_json_plaintext; +} + +/** + * Set OpenTSDB HTTP variables + * + * Set the variables necessary to work with this specific backend. + * + * @param default_port the default port of the backend + * @param brc function called to check the result. + * @param brf function called to format the message to the backend + * @param type the backend string selector. 
+ */ +void backend_set_opentsdb_http_variables(int *default_port, + backend_response_checker_t brc, + backend_request_formatter_t brf) +{ + *default_port = 4242; + *brc = process_opentsdb_response; + + if(BACKEND_OPTIONS_DATA_SOURCE(global_backend_options) == BACKEND_SOURCE_DATA_AS_COLLECTED) + *brf = format_dimension_collected_opentsdb_http; + else + *brf = format_dimension_stored_opentsdb_http; + +} + +/** + * Set OpenTSDB Telnet variables + * + * Set the variables necessary to work with this specific backend. + * + * @param default_port the default port of the backend + * @param brc function called to check the result. + * @param brf function called to format the message to the backend + * @param type the backend string selector. + */ +void backend_set_opentsdb_telnet_variables(int *default_port, + backend_response_checker_t brc, + backend_request_formatter_t brf) +{ + *default_port = 4242; + *brc = process_opentsdb_response; + + if(BACKEND_OPTIONS_DATA_SOURCE(global_backend_options) == BACKEND_SOURCE_DATA_AS_COLLECTED) + *brf = format_dimension_collected_opentsdb_telnet; + else + *brf = format_dimension_stored_opentsdb_telnet; +} + +/** + * Set Graphite variables + * + * Set the variables necessary to work with this specific backend. + * + * @param default_port the default port of the backend + * @param brc function called to check the result. + * @param brf function called to format the message to the backend + * @param type the backend string selector. 
+ */ +void backend_set_graphite_variables(int *default_port, + backend_response_checker_t brc, + backend_request_formatter_t brf) +{ + *default_port = 2003; + *brc = process_graphite_response; + + if(BACKEND_OPTIONS_DATA_SOURCE(global_backend_options) == BACKEND_SOURCE_DATA_AS_COLLECTED) + *brf = format_dimension_collected_graphite_plaintext; + else + *brf = format_dimension_stored_graphite_plaintext; +} + +/** + * Select Type + * + * Select the backend type based on the user input + * + * @param type is the string that defines the backend type + * + * @return It returns the backend id. + */ +BACKEND_TYPE backend_select_type(const char *type) { + if(!strcmp(type, "graphite") || !strcmp(type, "graphite:plaintext")) { + return BACKEND_TYPE_GRAPHITE; + } + else if(!strcmp(type, "opentsdb") || !strcmp(type, "opentsdb:telnet")) { + return BACKEND_TYPE_OPENTSDB_USING_TELNET; + } + else if(!strcmp(type, "opentsdb:http") || !strcmp(type, "opentsdb:https")) { + return BACKEND_TYPE_OPENTSDB_USING_HTTP; + } + else if (!strcmp(type, "json") || !strcmp(type, "json:plaintext")) { + return BACKEND_TYPE_JSON; + } + else if (!strcmp(type, "prometheus_remote_write")) { + return BACKEND_TYPE_PROMETEUS; + } + else if (!strcmp(type, "kinesis") || !strcmp(type, "kinesis:plaintext")) { + return BACKEND_TYPE_KINESIS; + } + + return BACKEND_TYPE_UNKNOWN; +} + +/** + * Backend main + * + * The main thread used to control the backends. + * + * @param ptr a pointer to netdata_static_structure. + * + * @return It always returns NULL. 
+ */ void *backends_main(void *ptr) { netdata_thread_cleanup_push(backends_main_cleanup, ptr); @@ -260,6 +448,15 @@ void *backends_main(void *ptr) { char *kinesis_auth_key_id = NULL, *kinesis_secure_key = NULL, *kinesis_stream_name = NULL; #endif +#if ENABLE_PROMETHEUS_REMOTE_WRITE + int do_prometheus_remote_write = 0; + BUFFER *http_request_header = buffer_create(1); +#endif + +#ifdef ENABLE_HTTPS + struct netdata_ssl opentsdb_ssl = {NULL , NETDATA_SSL_START}; +#endif + // ------------------------------------------------------------------------ // collect configuration options @@ -285,6 +482,10 @@ void *backends_main(void *ptr) { charts_pattern = simple_pattern_create(config_get(CONFIG_SECTION_BACKEND, "send charts matching", "*"), NULL, SIMPLE_PATTERN_EXACT); hosts_pattern = simple_pattern_create(config_get(CONFIG_SECTION_BACKEND, "send hosts matching", "localhost *"), NULL, SIMPLE_PATTERN_EXACT); +#if ENABLE_PROMETHEUS_REMOTE_WRITE + const char *remote_write_path = config_get(CONFIG_SECTION_BACKEND, "remote write URL path", "/receive"); +#endif + // ------------------------------------------------------------------------ // validate configuration options // and prepare for sending data to our backend @@ -303,90 +504,95 @@ void *backends_main(void *ptr) { // ------------------------------------------------------------------------ // select the backend type - - if(!strcmp(type, "graphite") || !strcmp(type, "graphite:plaintext")) { - - default_port = 2003; - backend_response_checker = process_graphite_response; - - if(BACKEND_OPTIONS_DATA_SOURCE(global_backend_options) == BACKEND_SOURCE_DATA_AS_COLLECTED) - backend_request_formatter = format_dimension_collected_graphite_plaintext; - else - backend_request_formatter = format_dimension_stored_graphite_plaintext; - - } - else if(!strcmp(type, "opentsdb") || !strcmp(type, "opentsdb:telnet")) { - - default_port = 4242; - backend_response_checker = process_opentsdb_response; - - 
if(BACKEND_OPTIONS_DATA_SOURCE(global_backend_options) == BACKEND_SOURCE_DATA_AS_COLLECTED) - backend_request_formatter = format_dimension_collected_opentsdb_telnet; - else - backend_request_formatter = format_dimension_stored_opentsdb_telnet; - + BACKEND_TYPE work_type = backend_select_type(type); + if (work_type == BACKEND_TYPE_UNKNOWN) { + error("BACKEND: Unknown backend type '%s'", type); + goto cleanup; } - else if (!strcmp(type, "json") || !strcmp(type, "json:plaintext")) { - default_port = 5448; - backend_response_checker = process_json_response; - - if (BACKEND_OPTIONS_DATA_SOURCE(global_backend_options) == BACKEND_SOURCE_DATA_AS_COLLECTED) - backend_request_formatter = format_dimension_collected_json_plaintext; - else - backend_request_formatter = format_dimension_stored_json_plaintext; + switch (work_type) { + case BACKEND_TYPE_OPENTSDB_USING_HTTP: { +#ifdef ENABLE_HTTPS + if (!strcmp(type, "opentsdb:https")) { + security_start_ssl(NETDATA_SSL_CONTEXT_OPENTSDB); + } +#endif + backend_set_opentsdb_http_variables(&default_port,&backend_response_checker,&backend_request_formatter); + break; + } + case BACKEND_TYPE_PROMETEUS: { +#if ENABLE_PROMETHEUS_REMOTE_WRITE + do_prometheus_remote_write = 1; - } + init_write_request(); +#else + error("BACKEND: Prometheus remote write support isn't compiled"); +#endif // ENABLE_PROMETHEUS_REMOTE_WRITE + backend_set_prometheus_variables(&default_port,&backend_response_checker,&backend_request_formatter); + break; + } + case BACKEND_TYPE_KINESIS: { #if HAVE_KINESIS - else if (!strcmp(type, "kinesis") || !strcmp(type, "kinesis:plaintext")) { + do_kinesis = 1; - do_kinesis = 1; + if(unlikely(read_kinesis_conf(netdata_configured_user_config_dir, &kinesis_auth_key_id, &kinesis_secure_key, &kinesis_stream_name))) { + error("BACKEND: kinesis backend type is set but cannot read its configuration from %s/aws_kinesis.conf", netdata_configured_user_config_dir); + goto cleanup; + } - 
if(unlikely(read_kinesis_conf(netdata_configured_user_config_dir, &kinesis_auth_key_id, &kinesis_secure_key, &kinesis_stream_name))) { - error("BACKEND: kinesis backend type is set but cannot read its configuration from %s/aws_kinesis.conf", netdata_configured_user_config_dir); - goto cleanup; + kinesis_init(destination, kinesis_auth_key_id, kinesis_secure_key, timeout.tv_sec * 1000 + timeout.tv_usec / 1000); +#else + error("BACKEND: AWS Kinesis support isn't compiled"); +#endif // HAVE_KINESIS + backend_set_kinesis_variables(&default_port,&backend_response_checker,&backend_request_formatter); + break; + } + case BACKEND_TYPE_GRAPHITE: { + backend_set_graphite_variables(&default_port,&backend_response_checker,&backend_request_formatter); + break; + } + case BACKEND_TYPE_OPENTSDB_USING_TELNET: { + backend_set_opentsdb_telnet_variables(&default_port,&backend_response_checker,&backend_request_formatter); + break; + } + case BACKEND_TYPE_JSON: { + backend_set_json_variables(&default_port,&backend_response_checker,&backend_request_formatter); + break; + } + case BACKEND_TYPE_UNKNOWN: { + break; } - - kinesis_init(destination, kinesis_auth_key_id, kinesis_secure_key, timeout.tv_sec * 1000 + timeout.tv_usec / 1000); - - backend_response_checker = process_json_response; - if (BACKEND_OPTIONS_DATA_SOURCE(global_backend_options) == BACKEND_SOURCE_DATA_AS_COLLECTED) - backend_request_formatter = format_dimension_collected_json_plaintext; - else - backend_request_formatter = format_dimension_stored_json_plaintext; - - } -#endif /* HAVE_KINESIS */ - else { - error("BACKEND: Unknown backend type '%s'", type); - goto cleanup; } +#if ENABLE_PROMETHEUS_REMOTE_WRITE + if((backend_request_formatter == NULL && !do_prometheus_remote_write) || backend_response_checker == NULL) { +#else if(backend_request_formatter == NULL || backend_response_checker == NULL) { +#endif error("BACKEND: backend is misconfigured - disabling it."); goto cleanup; } - // 
------------------------------------------------------------------------ - // prepare the charts for monitoring the backend operation +// ------------------------------------------------------------------------ +// prepare the charts for monitoring the backend operation struct rusage thread; collected_number - chart_buffered_metrics = 0, - chart_lost_metrics = 0, - chart_sent_metrics = 0, - chart_buffered_bytes = 0, - chart_received_bytes = 0, - chart_sent_bytes = 0, - chart_receptions = 0, - chart_transmission_successes = 0, - chart_transmission_failures = 0, - chart_data_lost_events = 0, - chart_lost_bytes = 0, - chart_backend_reconnects = 0; - // chart_backend_latency = 0; + chart_buffered_metrics = 0, + chart_lost_metrics = 0, + chart_sent_metrics = 0, + chart_buffered_bytes = 0, + chart_received_bytes = 0, + chart_sent_bytes = 0, + chart_receptions = 0, + chart_transmission_successes = 0, + chart_transmission_failures = 0, + chart_data_lost_events = 0, + chart_lost_bytes = 0, + chart_backend_reconnects = 0; + // chart_backend_latency = 0; RRDSET *chart_metrics = rrdset_create_localhost("netdata", "backend_metrics", NULL, "backend", NULL, "Netdata Buffered Metrics", "metrics", "backends", NULL, 130600, global_backend_update_every, RRDSET_TYPE_LINE); rrddim_add(chart_metrics, "buffered", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); @@ -407,12 +613,12 @@ void *backends_main(void *ptr) { rrddim_add(chart_ops, "read", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); /* - * this is misleading - we can only measure the time we need to send data - * this time is not related to the time required for the data to travel to - * the backend database and the time that server needed to process them - * - * issue #1432 and https://www.softlab.ntua.gr/facilities/documentation/unix/unix-socket-faq/unix-socket-faq-2.html - * + * this is misleading - we can only measure the time we need to send data + * this time is not related to the time required for the data to travel to + * the backend database 
and the time that server needed to process them + * + * issue #1432 and https://www.softlab.ntua.gr/facilities/documentation/unix/unix-socket-faq/unix-socket-faq-2.html + * RRDSET *chart_latency = rrdset_create_localhost("netdata", "backend_latency", NULL, "backend", NULL, "Netdata Backend Latency", "ms", "backends", NULL, 130620, global_backend_update_every, RRDSET_TYPE_AREA); rrddim_add(chart_latency, "latency", NULL, 1, 1000, RRD_ALGORITHM_ABSOLUTE); */ @@ -451,6 +657,9 @@ void *backends_main(void *ptr) { size_t count_charts_total = 0; size_t count_dims_total = 0; +#if ENABLE_PROMETHEUS_REMOTE_WRITE + clear_write_request(); +#endif rrd_rdlock(); RRDHOST *host; rrdhost_foreach_read(host) { @@ -478,26 +687,45 @@ void *backends_main(void *ptr) { const char *__hostname = (host == localhost)?hostname:host->hostname; - RRDSET *st; - rrdset_foreach_read(st, host) { - if(likely(backends_can_send_rrdset(global_backend_options, st))) { - rrdset_rdlock(st); - - count_charts++; - - RRDDIM *rd; - rrddim_foreach_read(rd, st) { - if (likely(rd->last_collected_time.tv_sec >= after)) { - chart_buffered_metrics += backend_request_formatter(b, global_backend_prefix, host, __hostname, st, rd, after, before, global_backend_options); - count_dims++; - } - else { - debug(D_BACKEND, "BACKEND: not sending dimension '%s' of chart '%s' from host '%s', its last data collection (%lu) is not within our timeframe (%lu to %lu)", rd->id, st->id, __hostname, (unsigned long)rd->last_collected_time.tv_sec, (unsigned long)after, (unsigned long)before); - count_dims_skipped++; +#if ENABLE_PROMETHEUS_REMOTE_WRITE + if(do_prometheus_remote_write) { + rrd_stats_remote_write_allmetrics_prometheus( + host + , __hostname + , global_backend_prefix + , global_backend_options + , after + , before + , &count_charts + , &count_dims + , &count_dims_skipped + ); + chart_buffered_metrics += count_dims; + } + else +#endif + { + RRDSET *st; + rrdset_foreach_read(st, host) { + 
if(likely(backends_can_send_rrdset(global_backend_options, st))) { + rrdset_rdlock(st); + + count_charts++; + + RRDDIM *rd; + rrddim_foreach_read(rd, st) { + if (likely(rd->last_collected_time.tv_sec >= after)) { + chart_buffered_metrics += backend_request_formatter(b, global_backend_prefix, host, __hostname, st, rd, after, before, global_backend_options); + count_dims++; + } + else { + debug(D_BACKEND, "BACKEND: not sending dimension '%s' of chart '%s' from host '%s', its last data collection (%lu) is not within our timeframe (%lu to %lu)", rd->id, st->id, __hostname, (unsigned long)rd->last_collected_time.tv_sec, (unsigned long)after, (unsigned long)before); + count_dims_skipped++; + } } - } - rrdset_unlock(st); + rrdset_unlock(st); + } } } @@ -621,7 +849,16 @@ void *backends_main(void *ptr) { while(sock != -1 && errno != EWOULDBLOCK) { buffer_need_bytes(response, 4096); - ssize_t r = recv(sock, &response->buffer[response->len], response->size - response->len, MSG_DONTWAIT); + ssize_t r; +#ifdef ENABLE_HTTPS + if(opentsdb_ssl.conn && !opentsdb_ssl.flags) { + r = SSL_read(opentsdb_ssl.conn, &response->buffer[response->len], response->size - response->len); + } else { + r = recv(sock, &response->buffer[response->len], response->size - response->len, MSG_DONTWAIT); + } +#else + r = recv(sock, &response->buffer[response->len], response->size - response->len, MSG_DONTWAIT); +#endif if(likely(r > 0)) { // we received some data response->len += r; @@ -654,7 +891,37 @@ void *backends_main(void *ptr) { size_t reconnects = 0; sock = connect_to_one_of(destination, default_port, &timeout, &reconnects, NULL, 0); +#ifdef ENABLE_HTTPS + if(sock != -1) { + if(netdata_opentsdb_ctx) { + if(!opentsdb_ssl.conn) { + opentsdb_ssl.conn = SSL_new(netdata_opentsdb_ctx); + if(!opentsdb_ssl.conn) { + error("Failed to allocate SSL structure %d.", sock); + opentsdb_ssl.flags = NETDATA_SSL_NO_HANDSHAKE; + } + } else { + SSL_clear(opentsdb_ssl.conn); + } + } + if(opentsdb_ssl.conn) { + 
if(SSL_set_fd(opentsdb_ssl.conn, sock) != 1) { + error("Failed to set the socket to the SSL on socket fd %d.", host->rrdpush_sender_socket); + opentsdb_ssl.flags = NETDATA_SSL_NO_HANDSHAKE; + } else { + opentsdb_ssl.flags = NETDATA_SSL_HANDSHAKE_COMPLETE; + SSL_set_connect_state(opentsdb_ssl.conn); + int err = SSL_connect(opentsdb_ssl.conn); + if (err != 1) { + err = SSL_get_error(opentsdb_ssl.conn, err); + error("SSL cannot connect with the server: %s ", ERR_error_string((long)SSL_get_error(opentsdb_ssl.conn, err), NULL)); + opentsdb_ssl.flags = NETDATA_SSL_NO_HANDSHAKE; + } //TODO: check certificate here + } + } + } +#endif chart_backend_reconnects += reconnects; // chart_backend_latency += now_monotonic_usec() - start_ut; } @@ -672,7 +939,54 @@ void *backends_main(void *ptr) { flags += MSG_NOSIGNAL; #endif - ssize_t written = send(sock, buffer_tostring(b), len, flags); +#if ENABLE_PROMETHEUS_REMOTE_WRITE + if(do_prometheus_remote_write) { + size_t data_size = get_write_request_size(); + + if(unlikely(!data_size)) { + error("BACKEND: write request size is out of range"); + continue; + } + + buffer_flush(b); + buffer_need_bytes(b, data_size); + if(unlikely(pack_write_request(b->buffer, &data_size))) { + error("BACKEND: cannot pack write request"); + continue; + } + b->len = data_size; + chart_buffered_bytes = (collected_number)buffer_strlen(b); + + buffer_flush(http_request_header); + buffer_sprintf(http_request_header, + "POST %s HTTP/1.1\r\n" + "Host: %s\r\n" + "Accept: */*\r\n" + "Content-Length: %zu\r\n" + "Content-Type: application/x-www-form-urlencoded\r\n\r\n", + remote_write_path, + hostname, + data_size + ); + + len = buffer_strlen(http_request_header); + send(sock, buffer_tostring(http_request_header), len, flags); + + len = data_size; + } +#endif + + ssize_t written; +#ifdef ENABLE_HTTPS + if(opentsdb_ssl.conn && !opentsdb_ssl.flags) { + written = SSL_write(opentsdb_ssl.conn, buffer_tostring(b), len); + } else { + written = send(sock, 
buffer_tostring(b), len, flags); + } +#else + written = send(sock, buffer_tostring(b), len, flags); +#endif + // chart_backend_latency += now_monotonic_usec() - start_ut; if(written != -1 && (size_t)written == len) { // we sent the data successfully @@ -711,6 +1025,16 @@ void *backends_main(void *ptr) { } } + +#if ENABLE_PROMETHEUS_REMOTE_WRITE + if(failures) { + (void) buffer_on_failures; + failures = 0; + chart_lost_bytes = chart_buffered_bytes = get_write_request_size(); // estimated write request size + chart_data_lost_events++; + chart_lost_metrics = chart_buffered_metrics; + } +#else if(failures > buffer_on_failures) { // too bad! we are going to lose data chart_lost_bytes += buffer_strlen(b); @@ -720,6 +1044,7 @@ void *backends_main(void *ptr) { chart_data_lost_events++; chart_lost_metrics = chart_buffered_metrics; } +#endif /* ENABLE_PROMETHEUS_REMOTE_WRITE */ if(unlikely(netdata_exit)) break; @@ -775,12 +1100,27 @@ cleanup: } #endif +#if ENABLE_PROMETHEUS_REMOTE_WRITE + if(do_prometheus_remote_write) { + buffer_free(http_request_header); + protocol_buffers_shutdown(); + } +#endif + if(sock != -1) close(sock); buffer_free(b); buffer_free(response); +#ifdef ENABLE_HTTPS + if(netdata_opentsdb_ctx) { + if(opentsdb_ssl.conn) { + SSL_free(opentsdb_ssl.conn); + } + } +#endif + netdata_thread_cleanup_pop(1); return NULL; } diff --git a/backends/backends.h b/backends/backends.h index 11866549..8d0bda41 100644 --- a/backends/backends.h +++ b/backends/backends.h @@ -15,6 +15,20 @@ typedef enum backend_options { BACKEND_OPTION_SEND_NAMES = (1 << 16) } BACKEND_OPTIONS; +typedef enum backend_types { + BACKEND_TYPE_UNKNOWN, //Invalid type + BACKEND_TYPE_GRAPHITE, //Send plain text to Graphite + BACKEND_TYPE_OPENTSDB_USING_TELNET, //Send data to OpenTSDB using telnet API + BACKEND_TYPE_OPENTSDB_USING_HTTP, //Send data to OpenTSDB using HTTP API + BACKEND_TYPE_JSON, //Stores the data using JSON. 
+ BACKEND_TYPE_PROMETEUS, //The user selected to use Prometheus backend + BACKEND_TYPE_KINESIS //Send message to AWS Kinesis +} BACKEND_TYPE; + + +typedef int (**backend_response_checker_t)(BUFFER *); +typedef int (**backend_request_formatter_t)(BUFFER *, const char *, RRDHOST *, const char *, RRDSET *, RRDDIM *, time_t, time_t, BACKEND_OPTIONS); + #define BACKEND_OPTIONS_SOURCE_BITS (BACKEND_SOURCE_DATA_AS_COLLECTED|BACKEND_SOURCE_DATA_AVERAGE|BACKEND_SOURCE_DATA_SUM) #define BACKEND_OPTIONS_DATA_SOURCE(backend_options) (backend_options & BACKEND_OPTIONS_SOURCE_BITS) @@ -53,4 +67,8 @@ extern int discard_response(BUFFER *b, const char *backend); #include "backends/aws_kinesis/aws_kinesis.h" #endif +#if ENABLE_PROMETHEUS_REMOTE_WRITE +#include "backends/prometheus/remote_write/remote_write.h" +#endif + #endif /* NETDATA_BACKENDS_H */ diff --git a/backends/opentsdb/README.md b/backends/opentsdb/README.md new file mode 100644 index 00000000..3d57e2e1 --- /dev/null +++ b/backends/opentsdb/README.md @@ -0,0 +1,26 @@ +# OpenTSDB with HTTP + +Since version 1.16, Netdata can communicate with OpenTSDB using the HTTP API. To enable this channel, +it is necessary to set the following options in your netdata.conf: + +``` +[backend] + type = opentsdb:http + destination = localhost:4242 +``` + +In this example we assume that OpenTSDB is running on its default port (4242). + +## HTTPS + +Netdata also supports sending the metrics using SSL/TLS, but OpenTSDB does not support secure connections itself, +so it is necessary to configure a reverse proxy to enable HTTPS communication. After configuring your proxy, the +following changes must be made in netdata.conf: + +``` +[backend] + type = opentsdb:https + destination = localhost:8082 +``` + +In this example we used port 8082 for our reverse proxy. 
diff --git a/backends/opentsdb/opentsdb.c b/backends/opentsdb/opentsdb.c index 6e3a31ab..6ee559db 100644 --- a/backends/opentsdb/opentsdb.c +++ b/backends/opentsdb/opentsdb.c @@ -80,6 +80,7 @@ int format_dimension_stored_opentsdb_telnet( return 1; } + return 0; } @@ -87,4 +88,118 @@ int process_opentsdb_response(BUFFER *b) { return discard_response(b, "opentsdb"); } +static inline void opentsdb_build_message(BUFFER *b, char *message, const char *hostname, int length) { + buffer_sprintf( + b + , "POST /api/put HTTP/1.1\r\n" + "Host: %s\r\n" + "Content-Type: application/json\r\n" + "Content-Length: %d\r\n" + "\r\n" + "%s" + , hostname + , length + , message + ); +} + +int format_dimension_collected_opentsdb_http( + BUFFER *b // the buffer to write data to + , const char *prefix // the prefix to use + , RRDHOST *host // the host this chart comes from + , const char *hostname // the hostname (to override host->hostname) + , RRDSET *st // the chart + , RRDDIM *rd // the dimension + , time_t after // the start timestamp + , time_t before // the end timestamp + , BACKEND_OPTIONS backend_options // BACKEND_SOURCE_* bitmap +) { + (void)host; + (void)after; + (void)before; + + char message[1024]; + char chart_name[RRD_ID_LENGTH_MAX + 1]; + char dimension_name[RRD_ID_LENGTH_MAX + 1]; + backend_name_copy(chart_name, (backend_options & BACKEND_OPTION_SEND_NAMES && st->name)?st->name:st->id, RRD_ID_LENGTH_MAX); + backend_name_copy(dimension_name, (backend_options & BACKEND_OPTION_SEND_NAMES && rd->name)?rd->name:rd->id, RRD_ID_LENGTH_MAX); + + int length = snprintfz(message + , sizeof(message) + , "{" + " \"metric\": \"%s.%s.%s\"," + " \"timestamp\": %llu," + " \"value\": "COLLECTED_NUMBER_FORMAT "," + " \"tags\": {" + " \"host\": \"%s%s%s\"" + " }" + "}" + , prefix + , chart_name + , dimension_name + , (unsigned long long)rd->last_collected_time.tv_sec + , rd->last_collected_value + , hostname + , (host->tags)?" 
":"" + , (host->tags)?host->tags:"" + ); + + if(length > 0) { + opentsdb_build_message(b, message, hostname, length); + } + + return 1; +} +int format_dimension_stored_opentsdb_http( + BUFFER *b // the buffer to write data to + , const char *prefix // the prefix to use + , RRDHOST *host // the host this chart comes from + , const char *hostname // the hostname (to override host->hostname) + , RRDSET *st // the chart + , RRDDIM *rd // the dimension + , time_t after // the start timestamp + , time_t before // the end timestamp + , BACKEND_OPTIONS backend_options // BACKEND_SOURCE_* bitmap +) { + (void)host; + + time_t first_t = after, last_t = before; + calculated_number value = backend_calculate_value_from_stored_data(st, rd, after, before, backend_options, &first_t, &last_t); + + if(!isnan(value)) { + char chart_name[RRD_ID_LENGTH_MAX + 1]; + char dimension_name[RRD_ID_LENGTH_MAX + 1]; + backend_name_copy(chart_name, (backend_options & BACKEND_OPTION_SEND_NAMES && st->name)?st->name:st->id, RRD_ID_LENGTH_MAX); + backend_name_copy(dimension_name, (backend_options & BACKEND_OPTION_SEND_NAMES && rd->name)?rd->name:rd->id, RRD_ID_LENGTH_MAX); + + char message[1024]; + int length = snprintfz(message + , sizeof(message) + , "{" + " \"metric\": \"%s.%s.%s\"," + " \"timestamp\": %llu," + " \"value\": "CALCULATED_NUMBER_FORMAT "," + " \"tags\": {" + " \"host\": \"%s%s%s\"" + " }" + "}" + , prefix + , chart_name + , dimension_name + , (unsigned long long)last_t + , value + , hostname + , (host->tags)?" 
":"" + , (host->tags)?host->tags:"" + ); + + if(length > 0) { + opentsdb_build_message(b, message, hostname, length); + } + + return 1; + } + + return 0; +} diff --git a/backends/opentsdb/opentsdb.h b/backends/opentsdb/opentsdb.h index fc83b39c..b9372d91 100644 --- a/backends/opentsdb/opentsdb.h +++ b/backends/opentsdb/opentsdb.h @@ -31,5 +31,28 @@ extern int format_dimension_stored_opentsdb_telnet( extern int process_opentsdb_response(BUFFER *b); +int format_dimension_collected_opentsdb_http( + BUFFER *b // the buffer to write data to + , const char *prefix // the prefix to use + , RRDHOST *host // the host this chart comes from + , const char *hostname // the hostname (to override host->hostname) + , RRDSET *st // the chart + , RRDDIM *rd // the dimension + , time_t after // the start timestamp + , time_t before // the end timestamp + , BACKEND_OPTIONS backend_options // BACKEND_SOURCE_* bitmap +); + +int format_dimension_stored_opentsdb_http( + BUFFER *b // the buffer to write data to + , const char *prefix // the prefix to use + , RRDHOST *host // the host this chart comes from + , const char *hostname // the hostname (to override host->hostname) + , RRDSET *st // the chart + , RRDDIM *rd // the dimension + , time_t after // the start timestamp + , time_t before // the end timestamp + , BACKEND_OPTIONS backend_options // BACKEND_SOURCE_* bitmap +); #endif //NETDATA_BACKEND_OPENTSDB_H diff --git a/backends/prometheus/Makefile.am b/backends/prometheus/Makefile.am index 19554bed..e5f74851 100644 --- a/backends/prometheus/Makefile.am +++ b/backends/prometheus/Makefile.am @@ -3,6 +3,10 @@ AUTOMAKE_OPTIONS = subdir-objects MAINTAINERCLEANFILES = $(srcdir)/Makefile.in +SUBDIRS = \ + remote_write \ + $(NULL) + dist_noinst_DATA = \ README.md \ $(NULL) diff --git a/backends/prometheus/backend_prometheus.c b/backends/prometheus/backend_prometheus.c index 3641b07c..67342ea7 100644 --- a/backends/prometheus/backend_prometheus.c +++ b/backends/prometheus/backend_prometheus.c 
@@ -153,6 +153,8 @@ static inline char *prometheus_units_copy(char *d, const char *s, size_t usable, #define PROMETHEUS_LABELS_MAX 1024 #define PROMETHEUS_VARIABLE_MAX 256 +#define PROMETHEUS_LABELS_MAX_NUMBER 128 + struct host_variables_callback_options { RRDHOST *host; BUFFER *wb; @@ -307,7 +309,7 @@ static void rrd_stats_api_v1_charts_allmetrics_prometheus(RRDHOST *host, BUFFER int as_collected = (BACKEND_OPTIONS_DATA_SOURCE(backend_options) == BACKEND_SOURCE_DATA_AS_COLLECTED); int homogeneous = 1; if(as_collected) { - if(rrdset_flag_check(st, RRDSET_FLAG_HOMEGENEOUS_CHECK)) + if(rrdset_flag_check(st, RRDSET_FLAG_HOMOGENEOUS_CHECK)) rrdset_update_heterogeneous_flag(st); if(rrdset_flag_check(st, RRDSET_FLAG_HETEROGENEOUS)) @@ -537,6 +539,177 @@ static void rrd_stats_api_v1_charts_allmetrics_prometheus(RRDHOST *host, BUFFER rrdhost_unlock(host); } +#if ENABLE_PROMETHEUS_REMOTE_WRITE +inline static void remote_write_split_words(char *str, char **words, int max_words) { + char *s = str; + int i = 0; + + while(*s && i < max_words - 1) { + while(*s && isspace(*s)) s++; // skip spaces to the begining of a tag name + + if(*s) + words[i] = s; + + while(*s && !isspace(*s) && *s != '=') s++; // find the end of the tag name + + if(*s != '=') { + words[i] = NULL; + break; + } + *s = '\0'; + s++; + i++; + + while(*s && isspace(*s)) s++; // skip spaces to the begining of a tag value + + if(*s && *s == '"') s++; // strip an opening quote + if(*s) + words[i] = s; + + while(*s && !isspace(*s) && *s != ',') s++; // find the end of the tag value + + if(*s && *s != ',') { + words[i] = NULL; + break; + } + if(s != words[i] && *(s - 1) == '"') *(s - 1) = '\0'; // strip a closing quote + if(*s != '\0') { + *s = '\0'; + s++; + i++; + } + } +} + +void rrd_stats_remote_write_allmetrics_prometheus( + RRDHOST *host + , const char *__hostname + , const char *prefix + , BACKEND_OPTIONS backend_options + , time_t after + , time_t before + , size_t *count_charts + , size_t *count_dims + , 
size_t *count_dims_skipped +) { + char hostname[PROMETHEUS_ELEMENT_MAX + 1]; + prometheus_label_copy(hostname, __hostname, PROMETHEUS_ELEMENT_MAX); + + add_host_info("netdata_info", hostname, host->program_name, host->program_version, now_realtime_usec() / USEC_PER_MS); + + if(host->tags && *(host->tags)) { + char tags[PROMETHEUS_LABELS_MAX + 1]; + strncpy(tags, host->tags, PROMETHEUS_LABELS_MAX); + char *words[PROMETHEUS_LABELS_MAX_NUMBER] = {NULL}; + int i; + + remote_write_split_words(tags, words, PROMETHEUS_LABELS_MAX_NUMBER); + + add_host_info("netdata_host_tags_info", hostname, NULL, NULL, now_realtime_usec() / USEC_PER_MS); + + for(i = 0; words[i] != NULL && words[i + 1] != NULL && (i + 1) < PROMETHEUS_LABELS_MAX_NUMBER; i += 2) { + add_tag(words[i], words[i + 1]); + } + } + + // for each chart + RRDSET *st; + rrdset_foreach_read(st, host) { + char chart[PROMETHEUS_ELEMENT_MAX + 1]; + char context[PROMETHEUS_ELEMENT_MAX + 1]; + char family[PROMETHEUS_ELEMENT_MAX + 1]; + char units[PROMETHEUS_ELEMENT_MAX + 1] = ""; + + prometheus_label_copy(chart, (backend_options & BACKEND_OPTION_SEND_NAMES && st->name)?st->name:st->id, PROMETHEUS_ELEMENT_MAX); + prometheus_label_copy(family, st->family, PROMETHEUS_ELEMENT_MAX); + prometheus_name_copy(context, st->context, PROMETHEUS_ELEMENT_MAX); + + if(likely(backends_can_send_rrdset(backend_options, st))) { + rrdset_rdlock(st); + + (*count_charts)++; + + int as_collected = (BACKEND_OPTIONS_DATA_SOURCE(backend_options) == BACKEND_SOURCE_DATA_AS_COLLECTED); + int homogeneous = 1; + if(as_collected) { + if(rrdset_flag_check(st, RRDSET_FLAG_HOMOGENEOUS_CHECK)) + rrdset_update_heterogeneous_flag(st); + + if(rrdset_flag_check(st, RRDSET_FLAG_HETEROGENEOUS)) + homogeneous = 0; + } + else { + if(BACKEND_OPTIONS_DATA_SOURCE(backend_options) == BACKEND_SOURCE_DATA_AVERAGE) + prometheus_units_copy(units, st->units, PROMETHEUS_ELEMENT_MAX, 0); + } + + // for each dimension + RRDDIM *rd; + rrddim_foreach_read(rd, st) { + 
if(rd->collections_counter && !rrddim_flag_check(rd, RRDDIM_FLAG_OBSOLETE)) { + char name[PROMETHEUS_LABELS_MAX + 1]; + char dimension[PROMETHEUS_ELEMENT_MAX + 1]; + char *suffix = ""; + + if (as_collected) { + // we need as-collected / raw data + + if(unlikely(rd->last_collected_time.tv_sec < after)) { + debug(D_BACKEND, "BACKEND: not sending dimension '%s' of chart '%s' from host '%s', its last data collection (%lu) is not within our timeframe (%lu to %lu)", rd->id, st->id, __hostname, (unsigned long)rd->last_collected_time.tv_sec, (unsigned long)after, (unsigned long)before); + (*count_dims_skipped)++; + continue; + } + + if(homogeneous) { + // all the dimensions of the chart, has the same algorithm, multiplier and divisor + // we add all dimensions as labels + + prometheus_label_copy(dimension, (backend_options & BACKEND_OPTION_SEND_NAMES && rd->name) ? rd->name : rd->id, PROMETHEUS_ELEMENT_MAX); + snprintf(name, PROMETHEUS_LABELS_MAX, "%s_%s%s", prefix, context, suffix); + + add_metric(name, chart, family, dimension, hostname, rd->last_collected_value, timeval_msec(&rd->last_collected_time)); + (*count_dims)++; + } + else { + // the dimensions of the chart, do not have the same algorithm, multiplier or divisor + // we create a metric per dimension + + prometheus_name_copy(dimension, (backend_options & BACKEND_OPTION_SEND_NAMES && rd->name) ? 
rd->name : rd->id, PROMETHEUS_ELEMENT_MAX); + snprintf(name, PROMETHEUS_LABELS_MAX, "%s_%s_%s%s", prefix, context, dimension, suffix); + + add_metric(name, chart, family, NULL, hostname, rd->last_collected_value, timeval_msec(&rd->last_collected_time)); + (*count_dims)++; + } + } + else { + // we need average or sum of the data + + time_t first_t = after, last_t = before; + calculated_number value = backend_calculate_value_from_stored_data(st, rd, after, before, backend_options, &first_t, &last_t); + + if(!isnan(value) && !isinf(value)) { + + if(BACKEND_OPTIONS_DATA_SOURCE(backend_options) == BACKEND_SOURCE_DATA_AVERAGE) + suffix = "_average"; + else if(BACKEND_OPTIONS_DATA_SOURCE(backend_options) == BACKEND_SOURCE_DATA_SUM) + suffix = "_sum"; + + prometheus_label_copy(dimension, (backend_options & BACKEND_OPTION_SEND_NAMES && rd->name) ? rd->name : rd->id, PROMETHEUS_ELEMENT_MAX); + snprintf(name, PROMETHEUS_LABELS_MAX, "%s_%s%s%s", prefix, context, units, suffix); + + add_metric(name, chart, family, dimension, hostname, rd->last_collected_value, timeval_msec(&rd->last_collected_time)); + (*count_dims)++; + } + } + } + } + + rrdset_unlock(st); + } + } +} +#endif /* ENABLE_PROMETHEUS_REMOTE_WRITE */ + static inline time_t prometheus_preparation(RRDHOST *host, BUFFER *wb, BACKEND_OPTIONS backend_options, const char *server, time_t now, PROMETHEUS_OUTPUT_OPTIONS output_options) { if(!server || !*server) server = "default"; @@ -599,3 +772,26 @@ void rrd_stats_api_v1_charts_allmetrics_prometheus_all_hosts(RRDHOST *host, BUFF } rrd_unlock(); } + +#if ENABLE_PROMETHEUS_REMOTE_WRITE +int process_prometheus_remote_write_response(BUFFER *b) { + if(unlikely(!b)) return 1; + + const char *s = buffer_tostring(b); + int len = buffer_strlen(b); + + // do nothing with HTTP response 200 + + while(!isspace(*s) && len) { + s++; + len--; + } + s++; + len--; + + if(likely(len > 4 && !strncmp(s, "200 ", 4))) + return 0; + else + return discard_response(b, "prometheus remote write"); +} 
+#endif diff --git a/backends/prometheus/backend_prometheus.h b/backends/prometheus/backend_prometheus.h index 72b65a22..d58d2400 100644 --- a/backends/prometheus/backend_prometheus.h +++ b/backends/prometheus/backend_prometheus.h @@ -19,4 +19,19 @@ typedef enum prometheus_output_flags { extern void rrd_stats_api_v1_charts_allmetrics_prometheus_single_host(RRDHOST *host, BUFFER *wb, const char *server, const char *prefix, BACKEND_OPTIONS backend_options, PROMETHEUS_OUTPUT_OPTIONS output_options); extern void rrd_stats_api_v1_charts_allmetrics_prometheus_all_hosts(RRDHOST *host, BUFFER *wb, const char *server, const char *prefix, BACKEND_OPTIONS backend_options, PROMETHEUS_OUTPUT_OPTIONS output_options); +#if ENABLE_PROMETHEUS_REMOTE_WRITE +extern void rrd_stats_remote_write_allmetrics_prometheus( + RRDHOST *host + , const char *__hostname + , const char *prefix + , BACKEND_OPTIONS backend_options + , time_t after + , time_t before + , size_t *count_charts + , size_t *count_dims + , size_t *count_dims_skipped +); +extern int process_prometheus_remote_write_response(BUFFER *b); +#endif + #endif //NETDATA_BACKEND_PROMETHEUS_H diff --git a/backends/prometheus/remote_write/Makefile.am b/backends/prometheus/remote_write/Makefile.am new file mode 100644 index 00000000..5f8f9d4c --- /dev/null +++ b/backends/prometheus/remote_write/Makefile.am @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-3.0-or-later + +AUTOMAKE_OPTIONS = subdir-objects +MAINTAINERCLEANFILES = $(srcdir)/Makefile.in + +CLEANFILES = \ + remote_write.pb.cc \ + remote_write.pb.h \ + $(NULL) + +dist_noinst_DATA = \ + remote_write.proto \ + README.md \ + $(NULL) diff --git a/backends/prometheus/remote_write/README.md b/backends/prometheus/remote_write/README.md new file mode 100644 index 00000000..73cb1daf --- /dev/null +++ b/backends/prometheus/remote_write/README.md @@ -0,0 +1,30 @@ +# Prometheus remote write backend + +## Prerequisites + +To use the prometheus remote write API with [storage 
providers](https://prometheus.io/docs/operating/integrations/#remote-endpoints-and-storage) [protobuf](https://developers.google.com/protocol-buffers/) and [snappy](https://github.com/google/snappy) libraries should be installed first. Next, netdata should be re-installed from the source. The installer will detect that the required libraries and utilities are now available. + +## Configuration + +An additional option in the backend configuration section is available for the remote write backend: + +``` +[backend] + remote write URL path = /receive +``` + +The default value is `/receive`. `remote write URL path` is used to set an endpoint path for the remote write protocol. For example, if your endpoint is `http://example.domain:example_port/storage/read` you should set + +``` +[backend] + destination = example.domain:example_port + remote write URL path = /storage/read +``` + +`buffered` and `lost` dimensions in the Netdata Backend Data Size operation monitoring chart estimate uncompressed buffer size on failures. 
+ +## Notes + +The remote write backend does not support `buffer on failures` + +[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fbackends%2Fprometheus%2Fremote_write%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() diff --git a/backends/prometheus/remote_write/remote_write.cc b/backends/prometheus/remote_write/remote_write.cc new file mode 100644 index 00000000..91d4305b --- /dev/null +++ b/backends/prometheus/remote_write/remote_write.cc @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include <snappy.h> +#include "remote_write.pb.h" +#include "remote_write.h" + +using namespace prometheus; + + +google::protobuf::Arena arena; +WriteRequest *write_request; + +void init_write_request() { + GOOGLE_PROTOBUF_VERIFY_VERSION; + write_request = google::protobuf::Arena::CreateMessage<WriteRequest>(&arena); +} + +void clear_write_request() { + write_request->clear_timeseries(); +} + +void add_host_info(const char *name, const char *instance, const char *application, const char *version, const int64_t timestamp) { + TimeSeries *timeseries; + Sample *sample; + Label *label; + + timeseries = write_request->add_timeseries(); + + label = timeseries->add_labels(); + label->set_name("__name__"); + label->set_value(name); + + label = timeseries->add_labels(); + label->set_name("instance"); + label->set_value(instance); + + if(application) { + label = timeseries->add_labels(); + label->set_name("application"); + label->set_value(application); + } + + if(version) { + label = timeseries->add_labels(); + label->set_name("version"); + label->set_value(version); + } + + sample = timeseries->add_samples(); + sample->set_value(1); + sample->set_timestamp(timestamp); +} + +// adds tag to the last created timeseries +void add_tag(char *tag, char *value) { + TimeSeries *timeseries; + Label *label; + + timeseries 
= write_request->mutable_timeseries(write_request->timeseries_size() - 1); + + label = timeseries->add_labels(); + label->set_name(tag); + label->set_value(value); +} + +void add_metric(const char *name, const char *chart, const char *family, const char *dimension, const char *instance, const double value, const int64_t timestamp) { + TimeSeries *timeseries; + Sample *sample; + Label *label; + + timeseries = write_request->add_timeseries(); + + label = timeseries->add_labels(); + label->set_name("__name__"); + label->set_value(name); + + label = timeseries->add_labels(); + label->set_name("chart"); + label->set_value(chart); + + label = timeseries->add_labels(); + label->set_name("family"); + label->set_value(family); + + if(dimension) { + label = timeseries->add_labels(); + label->set_name("dimension"); + label->set_value(dimension); + } + + label = timeseries->add_labels(); + label->set_name("instance"); + label->set_value(instance); + + sample = timeseries->add_samples(); + sample->set_value(value); + sample->set_timestamp(timestamp); +} + +size_t get_write_request_size(){ + size_t size = (size_t)snappy::MaxCompressedLength(write_request->ByteSize()); + + return (size < INT_MAX)?size:0; +} + +int pack_write_request(char *buffer, size_t *size) { + std::string uncompressed_write_request; + if(write_request->SerializeToString(&uncompressed_write_request) == false) return 1; + + snappy::RawCompress(uncompressed_write_request.data(), uncompressed_write_request.size(), buffer, size); + + return 0; +} + +void protocol_buffers_shutdown() { + google::protobuf::ShutdownProtobufLibrary(); +} diff --git a/backends/prometheus/remote_write/remote_write.h b/backends/prometheus/remote_write/remote_write.h new file mode 100644 index 00000000..edcc477b --- /dev/null +++ b/backends/prometheus/remote_write/remote_write.h @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_BACKEND_PROMETHEUS_REMOTE_WRITE_H +#define 
NETDATA_BACKEND_PROMETHEUS_REMOTE_WRITE_H + +#ifdef __cplusplus +extern "C" { +#endif + +void init_write_request(); + +void clear_write_request(); + +void add_host_info(const char *name, const char *instance, const char *application, const char *version, const int64_t timestamp); + +void add_tag(char *tag, char *value); + +void add_metric(const char *name, const char *chart, const char *family, const char *dimension, const char *instance, const double value, const int64_t timestamp); + +size_t get_write_request_size(); + +int pack_write_request(char *buffer, size_t *size); + +void protocol_buffers_shutdown(); + +#ifdef __cplusplus +} +#endif + +#endif //NETDATA_BACKEND_PROMETHEUS_REMOTE_WRITE_H diff --git a/backends/prometheus/remote_write/remote_write.proto b/backends/prometheus/remote_write/remote_write.proto new file mode 100644 index 00000000..dfde254e --- /dev/null +++ b/backends/prometheus/remote_write/remote_write.proto @@ -0,0 +1,29 @@ +syntax = "proto3"; +package prometheus; + +option cc_enable_arenas = true; + +import "google/protobuf/descriptor.proto"; + +message WriteRequest { + repeated TimeSeries timeseries = 1 [(nullable) = false]; +} + +message TimeSeries { + repeated Label labels = 1 [(nullable) = false]; + repeated Sample samples = 2 [(nullable) = false]; +} + +message Label { + string name = 1; + string value = 2; +} + +message Sample { + double value = 1; + int64 timestamp = 2; +} + +extend google.protobuf.FieldOptions { + bool nullable = 65001; +} diff --git a/collectors/Makefile.am b/collectors/Makefile.am index 87a037e7..fe62ba01 100644 --- a/collectors/Makefile.am +++ b/collectors/Makefile.am @@ -18,6 +18,7 @@ SUBDIRS = \ macos.plugin \ nfacct.plugin \ xenstat.plugin \ + perf.plugin \ node.d.plugin \ proc.plugin \ python.d.plugin \ diff --git a/collectors/README.md b/collectors/README.md index 154d193e..72521388 100644 --- a/collectors/README.md +++ b/collectors/README.md @@ -37,6 +37,7 @@ plugin|lang|O/S|runs as|modular|description 
[macos.plugin](macos.plugin/)|`C`|macos|internal|yes|collects resource usage and performance data on MacOS systems [nfacct.plugin](nfacct.plugin/)|`C`|linux|external|-|collects netfilter firewall, connection tracker and accounting metrics using `libmnl` and `libnetfilter_acct` [xenstat.plugin](xenstat.plugin/)|`C`|linux|external|-|collects XenServer and XCP-ng metrics using `libxenstat` +[perf.plugin](perf.plugin/)|`C`|linux|external|-|collects CPU performance metrics using performance monitoring units (PMU). [node.d.plugin](node.d.plugin/)|`node.js`|any|external|yes|a **plugin orchestrator** for data collection modules written in `node.js`. [plugins.d](plugins.d/)|`C`|any|internal|-|implements the **external plugins** API and serves external plugins [proc.plugin](proc.plugin/)|`C`|linux|internal|yes|collects resource usage and performance data on Linux systems diff --git a/collectors/apps.plugin/apps_groups.conf b/collectors/apps.plugin/apps_groups.conf index 7eba72f6..ab167ddb 100644 --- a/collectors/apps.plugin/apps_groups.conf +++ b/collectors/apps.plugin/apps_groups.conf @@ -77,6 +77,7 @@ freeipmi.plugin: freeipmi.plugin nfacct.plugin: nfacct.plugin cups.plugin: cups.plugin xenstat.plugin: xenstat.plugin +perf.plugin: perf.plugin charts.d.plugin: *charts.d.plugin* node.d.plugin: *node.d.plugin* python.d.plugin: *python.d.plugin* @@ -88,7 +89,7 @@ go.d.plugin: *go.d.plugin* # ----------------------------------------------------------------------------- # authentication/authorization related servers -auth: radius* openldap* ldap* +auth: radius* openldap* ldap* slapd fail2ban: fail2ban* # ----------------------------------------------------------------------------- @@ -188,7 +189,7 @@ print: cups* lpd lpq # ----------------------------------------------------------------------------- # time servers and clients -time: ntp* systemd-timesyncd chronyd +time: ntp* systemd-timesyn* chronyd # ----------------------------------------------------------------------------- 
# dhcp servers and clients @@ -301,3 +302,5 @@ ipfs: ipfs node: node* factorio: factorio + +p4: p4* diff --git a/collectors/cgroups.plugin/README.md b/collectors/cgroups.plugin/README.md index 3fffbd17..c01f9ec0 100644 --- a/collectors/cgroups.plugin/README.md +++ b/collectors/cgroups.plugin/README.md @@ -101,6 +101,15 @@ The whole point for the additional pattern list, is to limit the number of times The above pattern list is matched against the path of the cgroup. For matched cgroups, netdata calls the script [cgroup-name.sh](cgroup-name.sh.in) to get its name. This script queries `docker`, or applies heuristics to find a name for the cgroup. +### charts with zero metrics + +By default, Netdata will enable monitoring metrics only when they are not zero. If they are constantly zero they are ignored. Metrics that will start having values, after netdata is started, will be detected and charts will be automatically added to the dashboard (a refresh of the dashboard is needed for them to appear though). Set `yes` for a chart instead of `auto` to enable it permanently. For example: + +``` +[plugin:cgroups] + enable memory (used mem including cache) = yes +``` + ### alarms CPU and memory limits are watched and used to raise alarms. Memory usage for every cgroup is checked against `ram` and `ram+swap` limits. CPU usage for every cgroup is checked against `cpuset.cpus` and `cpu.cfs_period_us` + `cpu.cfs_quota_us` pair assigned for the cgroup. Configuration for the alarms is available in `health.d/cgroups.conf` file. 
diff --git a/collectors/cgroups.plugin/sys_fs_cgroup.c b/collectors/cgroups.plugin/sys_fs_cgroup.c index 40b48557..4300788d 100644 --- a/collectors/cgroups.plugin/sys_fs_cgroup.c +++ b/collectors/cgroups.plugin/sys_fs_cgroup.c @@ -2831,17 +2831,16 @@ void update_cgroup_charts(int update_every) { , RRDSET_TYPE_LINE ); - rrddim_add(cg->st_cpu_limit, "used", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + if(!(cg->options & CGROUP_OPTIONS_IS_UNIFIED)) + rrddim_add(cg->st_cpu_limit, "used", NULL, 1, system_hz, RRD_ALGORITHM_ABSOLUTE); + else + rrddim_add(cg->st_cpu_limit, "used", NULL, 1, 1000000, RRD_ALGORITHM_ABSOLUTE); } else rrdset_next(cg->st_cpu_limit); calculated_number cpu_usage = 0; - if(!(cg->options & CGROUP_OPTIONS_IS_UNIFIED)) - cpu_usage = (calculated_number)(cg->cpuacct_stat.user + cg->cpuacct_stat.system) * 100 / system_hz; - else - cpu_usage = (calculated_number)(cg->cpuacct_stat.user + cg->cpuacct_stat.system) * 100 / 1000000; - + cpu_usage = (calculated_number)(cg->cpuacct_stat.user + cg->cpuacct_stat.system) * 100; calculated_number cpu_used = 100 * (cpu_usage - cg->prev_cpu_usage) / (value * update_every); rrdset_isnot_obsolete(cg->st_cpu_limit); diff --git a/collectors/charts.d.plugin/apcupsd/apcupsd.chart.sh b/collectors/charts.d.plugin/apcupsd/apcupsd.chart.sh index f5c4d795..31ff9316 100644 --- a/collectors/charts.d.plugin/apcupsd/apcupsd.chart.sh +++ b/collectors/charts.d.plugin/apcupsd/apcupsd.chart.sh @@ -47,8 +47,8 @@ apcupsd_check() { error "cannot get information for apcupsd server ${host} on ${apcupsd_sources[${host}]}." 
failed=$((failed + 1)) else - apcupsd_status = "$(apcupsd_get ${apcupsd_sources[${host}]} | awk '/^STATUS.*/{ print $3 }')" - if [ ${apcupsd_status} != "ONLINE" ] && [ ${apcupsd_status} != "ONBATT" ]; then + apcupsd_status="$(apcupsd_get ${apcupsd_sources[${host}]} | awk '/^STATUS.*/{ print $3 }')" + if [ "${apcupsd_status}" != "ONLINE" ] && [ "${apcupsd_status}" != "ONBATT" ]; then error "APC UPS ${host} on ${apcupsd_sources[${host}]} is not online." failed=$((failed + 1)) else diff --git a/collectors/charts.d.plugin/charts.d.conf b/collectors/charts.d.plugin/charts.d.conf index acb2a6fa..94c40cf6 100644 --- a/collectors/charts.d.plugin/charts.d.conf +++ b/collectors/charts.d.plugin/charts.d.conf @@ -34,6 +34,8 @@ # BY DEFAULT ENABLED MODULES # ap=yes +# apcupsd=yes +# libreswan=yes # nut=yes # opensips=yes diff --git a/collectors/charts.d.plugin/charts.d.plugin.in b/collectors/charts.d.plugin/charts.d.plugin.in index a3f0aa95..0df6c30c 100755 --- a/collectors/charts.d.plugin/charts.d.plugin.in +++ b/collectors/charts.d.plugin/charts.d.plugin.in @@ -304,7 +304,7 @@ run() { printf " --- END TRACE ---\n" } >&2 fi - rm "${TMP_DIR}/run.${pid}" + rm -f "${TMP_DIR}/run.${pid}" return ${ret} } diff --git a/collectors/diskspace.plugin/README.md b/collectors/diskspace.plugin/README.md index d743312c..8f859e35 100644 --- a/collectors/diskspace.plugin/README.md +++ b/collectors/diskspace.plugin/README.md @@ -10,6 +10,9 @@ Two charts are available for every mount: Simple patterns can be used to exclude mounts from showed statistics based on path or filesystem. By default read-only mounts are not displayed. To display them `yes` should be set for a chart instead of `auto`. +By default, Netdata will enable monitoring metrics only when they are not zero. If they are constantly zero they are ignored. 
Metrics that will start having values, after netdata is started, will be detected and charts will be automatically added to the dashboard (a refresh of the dashboard is needed for them to appear though). Set `yes` for a chart instead of `auto` to enable it permanently. + + ``` [plugin:proc:diskspace] # remove charts of unmounted disks = yes diff --git a/collectors/freeipmi.plugin/freeipmi_plugin.c b/collectors/freeipmi.plugin/freeipmi_plugin.c index 82a04ecd..ba1fbffa 100644 --- a/collectors/freeipmi.plugin/freeipmi_plugin.c +++ b/collectors/freeipmi.plugin/freeipmi_plugin.c @@ -338,7 +338,8 @@ static void netdata_mark_as_not_updated() { } static void send_chart_to_netdata_for_units(int units) { - struct sensor *sn; + struct sensor *sn, *sn_stored; + int dupfound, multiplier; switch(units) { case IPMI_MONITORING_SENSOR_UNITS_CELSIUS: @@ -398,29 +399,44 @@ static void send_chart_to_netdata_for_units(int units) { } for(sn = sensors_root; sn; sn = sn->next) { + dupfound = 0; if(sn->sensor_units == units && sn->updated && !sn->ignore) { sn->exposed = 1; + multiplier = 1; switch(sn->sensor_reading_type) { + case IPMI_MONITORING_SENSOR_READING_TYPE_DOUBLE: + multiplier = 1000; + // fallthrough case IPMI_MONITORING_SENSOR_READING_TYPE_UNSIGNED_INTEGER8_BOOL: case IPMI_MONITORING_SENSOR_READING_TYPE_UNSIGNED_INTEGER32: - printf("DIMENSION i%d_n%d_r%d '%s i%d' absolute 1 1\n" - , sn->sensor_number - , sn->record_id - , sn->sensor_reading_type - , sn->sensor_name - , sn->sensor_number - ); - break; - - case IPMI_MONITORING_SENSOR_READING_TYPE_DOUBLE: - printf("DIMENSION i%d_n%d_r%d '%s i%d' absolute 1 1000\n" - , sn->sensor_number - , sn->record_id - , sn->sensor_reading_type - , sn->sensor_name - , sn->sensor_number - ); + for (sn_stored = sensors_root; sn_stored; sn_stored = sn_stored->next) { + if (sn_stored == sn) continue; + // If the name is a duplicate, append the sensor number + if ( !strcmp(sn_stored->sensor_name, sn->sensor_name) ) { + dupfound = 1; + 
printf("DIMENSION i%d_n%d_r%d '%s i%d' absolute 1 %d\n" + , sn->sensor_number + , sn->record_id + , sn->sensor_reading_type + , sn->sensor_name + , sn->sensor_number + , multiplier + ); + break; + } + } + // No duplicate name was found, display it just with Name + if (!dupfound) { + // display without ID + printf("DIMENSION i%d_n%d_r%d '%s' absolute 1 %d\n" + , sn->sensor_number + , sn->record_id + , sn->sensor_reading_type + , sn->sensor_name + , multiplier + ); + } break; default: diff --git a/collectors/nfacct.plugin/plugin_nfacct.c b/collectors/nfacct.plugin/plugin_nfacct.c index 54589982..feec34b8 100644 --- a/collectors/nfacct.plugin/plugin_nfacct.c +++ b/collectors/nfacct.plugin/plugin_nfacct.c @@ -835,12 +835,12 @@ int main(int argc, char **argv) { error("update frequency %d seconds is too small for NFACCT. Using %d.", freq, netdata_update_every); #ifdef DO_NFACCT - if(debug) fprintf(stderr, "freeipmi.plugin: calling nfacct_init()\n"); + if(debug) fprintf(stderr, "nfacct.plugin: calling nfacct_init()\n"); int nfacct = !nfacct_init(netdata_update_every); #endif #ifdef DO_NFSTAT - if(debug) fprintf(stderr, "freeipmi.plugin: calling nfstat_init()\n"); + if(debug) fprintf(stderr, "nfacct.plugin: calling nfstat_init()\n"); int nfstat = !nfstat_init(netdata_update_every); #endif diff --git a/collectors/perf.plugin/Makefile.am b/collectors/perf.plugin/Makefile.am new file mode 100644 index 00000000..19554bed --- /dev/null +++ b/collectors/perf.plugin/Makefile.am @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-3.0-or-later + +AUTOMAKE_OPTIONS = subdir-objects +MAINTAINERCLEANFILES = $(srcdir)/Makefile.in + +dist_noinst_DATA = \ + README.md \ + $(NULL) diff --git a/collectors/perf.plugin/README.md b/collectors/perf.plugin/README.md new file mode 100644 index 00000000..ce696b06 --- /dev/null +++ b/collectors/perf.plugin/README.md @@ -0,0 +1,72 @@ +# perf.plugin + +`perf.plugin` collects system-wide CPU performance statistics from Performance Monitoring Units (PMU) 
using +the `perf_event_open()` system call. + +## Important Notes + +Accessing hardware PMUs requires root permissions, so the plugin is setuid to root. + +Keep in mind that the number of PMUs in a system is usually quite limited and every hardware monitoring +event for every CPU core needs a separate file descriptor to be opened. + +## Charts + +The plugin provides statistics for general hardware and software performance monitoring events: + +Hardware events: +1. CPU cycles +2. Instructions +3. Branch instructions +4. Cache operations +5. Bus cycles +6. Stalled frontend and backend cycles + +Software events: +1. CPU migrations +2. Alignment faults +3. Emulation faults + +Hardware cache events: +1. L1D cache operations +2. L1D prefetch cache operations +3. L1I cache operations +4. LL cache operations +5. DTLB cache operations +6. ITLB cache operations +7. PBU cache operations + +## Configuration + +The plugin is disabled by default because the number of PMUs is usually quite limited and it is undesirable to +let Netdata silently compete for PMUs and interfere with other performance monitoring software. If you need to +enable the perf plugin, edit /etc/netdata/netdata.conf and set: + +```raw +[plugins] + perf = yes +``` + +```raw +[plugin:perf] + update every = 1 + command options = all +``` + +You can use the `command options` parameter to pick what data should be collected and which charts should be +displayed. If `all` is used, all general performance monitoring counters are probed and corresponding charts +are enabled for the available counters. You can also define a particular set of enabled charts using the +following keywords: `cycles`, `instructions`, `branch`, `cache`, `bus`, `stalled`, `migrations`, `alighnment`, +`emulation`, `L1D`, `L1D-prefetch`, `L1I`, `LL`, `DTLB`, `ITLB`, `PBU`. 
+ +## Debugging + +You can run the plugin by hand: + +```raw +sudo /usr/libexec/netdata/plugins.d/perf.plugin 1 all debug +``` + +You will get verbose output on what the plugin does. + +[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fcollectors%2Fperf.plugin%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() diff --git a/collectors/perf.plugin/perf_plugin.c b/collectors/perf.plugin/perf_plugin.c new file mode 100644 index 00000000..c645c279 --- /dev/null +++ b/collectors/perf.plugin/perf_plugin.c @@ -0,0 +1,1348 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "../../libnetdata/libnetdata.h" + +#include <linux/perf_event.h> + +#define PLUGIN_PERF_NAME "perf.plugin" + +// Hardware counters +#define NETDATA_CHART_PRIO_PERF_CPU_CYCLES 8800 +#define NETDATA_CHART_PRIO_PERF_INSTRUCTIONS 8801 +#define NETDATA_CHART_PRIO_PERF_BRANCH_INSTRUSTIONS 8802 +#define NETDATA_CHART_PRIO_PERF_CACHE 8803 +#define NETDATA_CHART_PRIO_PERF_BUS_CYCLES 8804 +#define NETDATA_CHART_PRIO_PERF_FRONT_BACK_CYCLES 8805 + +// Software counters +#define NETDATA_CHART_PRIO_PERF_MIGRATIONS 8810 +#define NETDATA_CHART_PRIO_PERF_ALIGNMENT 8811 +#define NETDATA_CHART_PRIO_PERF_EMULATION 8812 + +// Hardware cache counters +#define NETDATA_CHART_PRIO_PERF_L1D 8820 +#define NETDATA_CHART_PRIO_PERF_L1D_PREFETCH 8821 +#define NETDATA_CHART_PRIO_PERF_L1I 8822 +#define NETDATA_CHART_PRIO_PERF_LL 8823 +#define NETDATA_CHART_PRIO_PERF_DTLB 8824 +#define NETDATA_CHART_PRIO_PERF_ITLB 8825 +#define NETDATA_CHART_PRIO_PERF_PBU 8826 + +// callback required by fatal() +void netdata_cleanup_and_exit(int ret) { + exit(ret); +} + +void send_statistics( const char *action, const char *action_result, const char *action_data) { + (void) action; + (void) action_result; + (void) action_data; + return; +} + +// callbacks required by popen() +void 
signals_block(void) {}; +void signals_unblock(void) {}; +void signals_reset(void) {}; + +// callback required by eval() +int health_variable_lookup(const char *variable, uint32_t hash, struct rrdcalc *rc, calculated_number *result) { + (void)variable; + (void)hash; + (void)rc; + (void)result; + return 0; +}; + +// required by get_system_cpus() +char *netdata_configured_host_prefix = ""; + +// Variables + +#define RRD_TYPE_PERF "perf" +#define RRD_FAMILY_HW "hardware" +#define RRD_FAMILY_SW "software" +#define RRD_FAMILY_CACHE "cache" + +#define NO_FD -1 +#define ALL_PIDS -1 +#define RUNNING_THRESHOLD 100 + +static int debug = 0; + +static int update_every = 1; +static int freq = 0; + +typedef enum perf_event_id { + // Hardware counters + EV_ID_CPU_CYCLES, + EV_ID_INSTRUCTIONS, + EV_ID_CACHE_REFERENCES, + EV_ID_CACHE_MISSES, + EV_ID_BRANCH_INSTRUCTIONS, + EV_ID_BRANCH_MISSES, + EV_ID_BUS_CYCLES, + EV_ID_STALLED_CYCLES_FRONTEND, + EV_ID_STALLED_CYCLES_BACKEND, + EV_ID_REF_CPU_CYCLES, + + // Software counters + // EV_ID_CPU_CLOCK, + // EV_ID_TASK_CLOCK, + // EV_ID_PAGE_FAULTS, + // EV_ID_CONTEXT_SWITCHES, + EV_ID_CPU_MIGRATIONS, + // EV_ID_PAGE_FAULTS_MIN, + // EV_ID_PAGE_FAULTS_MAJ, + EV_ID_ALIGNMENT_FAULTS, + EV_ID_EMULATION_FAULTS, + + // Hardware cache counters + EV_ID_L1D_READ_ACCESS, + EV_ID_L1D_READ_MISS, + EV_ID_L1D_WRITE_ACCESS, + EV_ID_L1D_WRITE_MISS, + EV_ID_L1D_PREFETCH_ACCESS, + + EV_ID_L1I_READ_ACCESS, + EV_ID_L1I_READ_MISS, + + EV_ID_LL_READ_ACCESS, + EV_ID_LL_READ_MISS, + EV_ID_LL_WRITE_ACCESS, + EV_ID_LL_WRITE_MISS, + + EV_ID_DTLB_READ_ACCESS, + EV_ID_DTLB_READ_MISS, + EV_ID_DTLB_WRITE_ACCESS, + EV_ID_DTLB_WRITE_MISS, + + EV_ID_ITLB_READ_ACCESS, + EV_ID_ITLB_READ_MISS, + + EV_ID_PBU_READ_ACCESS, + + EV_ID_END +} perf_event_id_t; + +enum perf_event_group { + EV_GROUP_CYCLES, + EV_GROUP_INSTRUCTIONS_AND_CACHE, + EV_GROUP_SOFTWARE, + EV_GROUP_CACHE_L1D, + EV_GROUP_CACHE_L1I_LL_DTLB, + EV_GROUP_CACHE_ITLB_BPU, + + EV_GROUP_NUM +}; + +static int 
number_of_cpus; + +static int *group_leader_fds[EV_GROUP_NUM]; + +static struct perf_event { + perf_event_id_t id; + + int type; + int config; + + int **group_leader_fd; + int *fd; + + int disabled; + int updated; + + uint64_t value; + + uint64_t *prev_value; + uint64_t *prev_time_enabled; + uint64_t *prev_time_running; +} perf_events[] = { + // Hardware counters + {EV_ID_CPU_CYCLES, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES, &group_leader_fds[EV_GROUP_CYCLES], NULL, 1, 0, 0, NULL, NULL, NULL}, + {EV_ID_INSTRUCTIONS, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, &group_leader_fds[EV_GROUP_INSTRUCTIONS_AND_CACHE], NULL, 1, 0, 0, NULL, NULL, NULL}, + {EV_ID_CACHE_REFERENCES, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES, &group_leader_fds[EV_GROUP_INSTRUCTIONS_AND_CACHE], NULL, 1, 0, 0, NULL, NULL, NULL}, + {EV_ID_CACHE_MISSES, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES, &group_leader_fds[EV_GROUP_INSTRUCTIONS_AND_CACHE], NULL, 1, 0, 0, NULL, NULL, NULL}, + {EV_ID_BRANCH_INSTRUCTIONS, PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_INSTRUCTIONS, &group_leader_fds[EV_GROUP_INSTRUCTIONS_AND_CACHE], NULL, 1, 0, 0, NULL, NULL, NULL}, + {EV_ID_BRANCH_MISSES, PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES, &group_leader_fds[EV_GROUP_INSTRUCTIONS_AND_CACHE], NULL, 1, 0, 0, NULL, NULL, NULL}, + {EV_ID_BUS_CYCLES, PERF_TYPE_HARDWARE, PERF_COUNT_HW_BUS_CYCLES, &group_leader_fds[EV_GROUP_CYCLES], NULL, 1, 0, 0, NULL, NULL, NULL}, + {EV_ID_STALLED_CYCLES_FRONTEND, PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, &group_leader_fds[EV_GROUP_CYCLES], NULL, 1, 0, 0, NULL, NULL, NULL}, + {EV_ID_STALLED_CYCLES_BACKEND, PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND, &group_leader_fds[EV_GROUP_CYCLES], NULL, 1, 0, 0, NULL, NULL, NULL}, + {EV_ID_REF_CPU_CYCLES, PERF_TYPE_HARDWARE, PERF_COUNT_HW_REF_CPU_CYCLES, &group_leader_fds[EV_GROUP_CYCLES], NULL, 1, 0, 0, NULL, NULL, NULL}, + + // Software counters + // {EV_ID_CPU_CLOCK, PERF_TYPE_SOFTWARE, 
PERF_COUNT_SW_CPU_CLOCK, &group_leader_fds[EV_GROUP_SOFTWARE], NULL, 1, 0, 0, NULL, NULL, NULL}, + // {EV_ID_TASK_CLOCK, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_TASK_CLOCK, &group_leader_fds[EV_GROUP_SOFTWARE], NULL, 1, 0, 0, NULL, NULL, NULL}, + // {EV_ID_PAGE_FAULTS, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS, &group_leader_fds[EV_GROUP_SOFTWARE], NULL, 1, 0, 0, NULL, NULL, NULL}, + // {EV_ID_CONTEXT_SWITCHES, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CONTEXT_SWITCHES, &group_leader_fds[EV_GROUP_SOFTWARE], NULL, 1, 0, 0, NULL, NULL, NULL}, + {EV_ID_CPU_MIGRATIONS, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CPU_MIGRATIONS, &group_leader_fds[EV_GROUP_SOFTWARE], NULL, 1, 0, 0, NULL, NULL, NULL}, + // {EV_ID_PAGE_FAULTS_MIN, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS_MIN, &group_leader_fds[EV_GROUP_SOFTWARE], NULL, 1, 0, 0, NULL, NULL, NULL}, + // {EV_ID_PAGE_FAULTS_MAJ, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS_MAJ, &group_leader_fds[EV_GROUP_SOFTWARE], NULL, 1, 0, 0, NULL, NULL, NULL}, + {EV_ID_ALIGNMENT_FAULTS, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_ALIGNMENT_FAULTS, &group_leader_fds[EV_GROUP_SOFTWARE], NULL, 1, 0, 0, NULL, NULL, NULL}, + {EV_ID_EMULATION_FAULTS, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_EMULATION_FAULTS, &group_leader_fds[EV_GROUP_SOFTWARE], NULL, 1, 0, 0, NULL, NULL, NULL}, + + // Hardware cache counters + { + EV_ID_L1D_READ_ACCESS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_L1D) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16), + &group_leader_fds[EV_GROUP_CACHE_L1D], NULL, 1, 0, 0, NULL, NULL, NULL + }, { + EV_ID_L1D_READ_MISS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_L1D) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), + &group_leader_fds[EV_GROUP_CACHE_L1D], NULL, 1, 0, 0, NULL, NULL, NULL + }, { + EV_ID_L1D_WRITE_ACCESS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_L1D) | (PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16), + &group_leader_fds[EV_GROUP_CACHE_L1D], NULL, 1, 
0, 0, NULL, NULL, NULL + }, { + EV_ID_L1D_WRITE_MISS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_L1D) | (PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), + &group_leader_fds[EV_GROUP_CACHE_L1D], NULL, 1, 0, 0, NULL, NULL, NULL + }, { + EV_ID_L1D_PREFETCH_ACCESS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_L1D) | (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16), + &group_leader_fds[EV_GROUP_CACHE_L1D], NULL, 1, 0, 0, NULL, NULL, NULL + }, + + { + EV_ID_L1I_READ_ACCESS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_L1I) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16), + &group_leader_fds[EV_GROUP_CACHE_L1I_LL_DTLB], NULL, 1, 0, 0, NULL, NULL, NULL + }, { + EV_ID_L1I_READ_MISS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_L1I) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), + &group_leader_fds[EV_GROUP_CACHE_L1I_LL_DTLB], NULL, 1, 0, 0, NULL, NULL, NULL + }, + + { + EV_ID_LL_READ_ACCESS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_LL) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16), + &group_leader_fds[EV_GROUP_CACHE_L1I_LL_DTLB], NULL, 1, 0, 0, NULL, NULL, NULL + }, { + EV_ID_LL_READ_MISS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_LL) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), + &group_leader_fds[EV_GROUP_CACHE_L1I_LL_DTLB], NULL, 1, 0, 0, NULL, NULL, NULL + }, { + EV_ID_LL_WRITE_ACCESS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_LL) | (PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16), + &group_leader_fds[EV_GROUP_CACHE_L1I_LL_DTLB], NULL, 1, 0, 0, NULL, NULL, NULL + }, { + EV_ID_LL_WRITE_MISS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_LL) | (PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), + &group_leader_fds[EV_GROUP_CACHE_L1I_LL_DTLB], NULL, 1, 0, 0, NULL, NULL, NULL + }, + + { + EV_ID_DTLB_READ_ACCESS, 
PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_DTLB) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16), + &group_leader_fds[EV_GROUP_CACHE_L1I_LL_DTLB], NULL, 1, 0, 0, NULL, NULL, NULL + }, { + EV_ID_DTLB_READ_MISS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_DTLB) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), + &group_leader_fds[EV_GROUP_CACHE_L1I_LL_DTLB], NULL, 1, 0, 0, NULL, NULL, NULL + }, { + EV_ID_DTLB_WRITE_ACCESS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_DTLB) | (PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16), + &group_leader_fds[EV_GROUP_CACHE_L1I_LL_DTLB], NULL, 1, 0, 0, NULL, NULL, NULL + }, { + EV_ID_DTLB_WRITE_MISS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_DTLB) | (PERF_COUNT_HW_CACHE_OP_WRITE << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), + &group_leader_fds[EV_GROUP_CACHE_ITLB_BPU], NULL, 1, 0, 0, NULL, NULL, NULL + }, + + { + EV_ID_ITLB_READ_ACCESS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_ITLB) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16), + &group_leader_fds[EV_GROUP_CACHE_ITLB_BPU], NULL, 1, 0, 0, NULL, NULL, NULL + }, { + EV_ID_ITLB_READ_MISS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_ITLB) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), + &group_leader_fds[EV_GROUP_CACHE_ITLB_BPU], NULL, 1, 0, 0, NULL, NULL, NULL + }, + + { + EV_ID_PBU_READ_ACCESS, PERF_TYPE_HW_CACHE, + (PERF_COUNT_HW_CACHE_BPU) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16), + &group_leader_fds[EV_GROUP_CACHE_ITLB_BPU], NULL, 1, 0, 0, NULL, NULL, NULL + }, + + {EV_ID_END, 0, 0, NULL, NULL, 0, 0, 0, NULL, NULL, NULL} +}; + +static int perf_init() { + int cpu, group; + struct perf_event_attr perf_event_attr; + struct perf_event *current_event = NULL; + unsigned long flags = 0; + + number_of_cpus = (int)get_system_cpus(); + + // initialize all perf event file descriptors + 
for(current_event = &perf_events[0]; current_event->id != EV_ID_END; current_event++) { + current_event->fd = mallocz(number_of_cpus * sizeof(int)); + memset(current_event->fd, NO_FD, number_of_cpus * sizeof(int)); + + current_event->prev_value = mallocz(number_of_cpus * sizeof(uint64_t)); + memset(current_event->prev_value, 0, number_of_cpus * sizeof(uint64_t)); + + current_event->prev_time_enabled = mallocz(number_of_cpus * sizeof(uint64_t)); + memset(current_event->prev_time_enabled, 0, number_of_cpus * sizeof(uint64_t)); + + current_event->prev_time_running = mallocz(number_of_cpus * sizeof(uint64_t)); + memset(current_event->prev_time_running, 0, number_of_cpus * sizeof(uint64_t)); + } + + for(group = 0; group < EV_GROUP_NUM; group++) { + group_leader_fds[group] = mallocz(number_of_cpus * sizeof(int)); + memset(group_leader_fds[group], NO_FD, number_of_cpus * sizeof(int)); + } + + memset(&perf_event_attr, 0, sizeof(perf_event_attr)); + + for(cpu = 0; cpu < number_of_cpus; cpu++) { + for(current_event = &perf_events[0]; current_event->id != EV_ID_END; current_event++) { + if(unlikely(current_event->disabled)) continue; + + perf_event_attr.type = current_event->type; + perf_event_attr.config = current_event->config; + perf_event_attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING; + + int fd, group_leader_fd = *(*current_event->group_leader_fd + cpu); + + fd = syscall( + __NR_perf_event_open, + &perf_event_attr, + ALL_PIDS, + cpu, + group_leader_fd, + flags + ); + + if(unlikely(group_leader_fd == NO_FD)) group_leader_fd = fd; + + if(unlikely(fd < 0)) { + switch errno { + case EACCES: + error("Cannot access to the PMU: Permission denied"); + break; + case EBUSY: + error("Another event already has exclusive access to the PMU"); + break; + default: + error("Cannot open perf event"); + } + error("Disabling event %u", current_event->id); + current_event->disabled = 1; + } + + *(current_event->fd + cpu) = fd; + 
*(*current_event->group_leader_fd + cpu) = group_leader_fd; + + if(unlikely(debug)) fprintf(stderr, "perf.plugin: event id = %u, cpu = %d, fd = %d, leader_fd = %d\n", current_event->id, cpu, fd, group_leader_fd); + } + } + + return 0; +} + +static void perf_free(void) { + int cpu, group; + struct perf_event *current_event = NULL; + + for(current_event = &perf_events[0]; current_event->id != EV_ID_END; current_event++) { + for(cpu = 0; cpu < number_of_cpus; cpu++) + if(*(current_event->fd + cpu) != NO_FD) close(*(current_event->fd + cpu)); + + free(current_event->fd); + free(current_event->prev_value); + free(current_event->prev_time_enabled); + free(current_event->prev_time_running); + } + + for(group = 0; group < EV_GROUP_NUM; group++) + free(group_leader_fds[group]); +} + +static void reenable_events() { + int group, cpu; + + for(group = 0; group < EV_GROUP_NUM; group++) { + for(cpu = 0; cpu < number_of_cpus; cpu++) { + int current_fd = *(group_leader_fds[group] + cpu); + + if(unlikely(current_fd == NO_FD)) continue; + + if(ioctl(current_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP) == -1 + || ioctl(current_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) + { + error("Cannot reenable event group"); + } + } + } +} + +static int perf_collect() { + int cpu; + struct perf_event *current_event = NULL; + static uint64_t prev_cpu_cycles_value = 0; + struct { + uint64_t value; + uint64_t time_enabled; + uint64_t time_running; + } read_result; + + for(current_event = &perf_events[0]; current_event->id != EV_ID_END; current_event++) { + current_event->updated = 0; + current_event->value = 0; + + if(unlikely(current_event->disabled)) continue; + + for(cpu = 0; cpu < number_of_cpus; cpu++) { + + ssize_t read_size = read(current_event->fd[cpu], &read_result, sizeof(read_result)); + + if(likely(read_size == sizeof(read_result))) { + if (likely(read_result.time_running + && read_result.time_running != *(current_event->prev_time_running + cpu) + && 
(read_result.time_enabled / read_result.time_running < RUNNING_THRESHOLD))) { + current_event->value += (read_result.value - *(current_event->prev_value + cpu)) \ + * (read_result.time_enabled - *(current_event->prev_time_enabled + cpu)) \ + / (read_result.time_running - *(current_event->prev_time_running + cpu)); + } + + *(current_event->prev_value + cpu) = read_result.value; + *(current_event->prev_time_enabled + cpu) = read_result.time_enabled; + *(current_event->prev_time_running + cpu) = read_result.time_running; + + current_event->updated = 1; + } + else { + error("Cannot update value for event %u", current_event->id); + return 1; + } + } + + if(unlikely(debug)) fprintf(stderr, "perf.plugin: successfully read event id = %u, value = %lu\n", current_event->id, current_event->value); + } + + if(unlikely(perf_events[EV_ID_CPU_CYCLES].value == prev_cpu_cycles_value)) + reenable_events(); + prev_cpu_cycles_value = perf_events[EV_ID_CPU_CYCLES].value; + + return 0; +} + +static void perf_send_metrics() { + static int // Hardware counters + cpu_cycles_chart_generated = 0, + instructions_chart_generated = 0, + branch_chart_generated = 0, + cache_chart_generated = 0, + bus_cycles_chart_generated = 0, + stalled_cycles_chart_generated = 0, + + // Software counters + migrations_chart_generated = 0, + alighnment_chart_generated = 0, + emulation_chart_generated = 0, + + // Hardware cache counters + L1D_chart_generated = 0, + L1D_prefetch_chart_generated = 0, + L1I_chart_generated = 0, + LL_chart_generated = 0, + DTLB_chart_generated = 0, + ITLB_chart_generated = 0, + PBU_chart_generated = 0; + + // ------------------------------------------------------------------------ + + if(likely(perf_events[EV_ID_CPU_CYCLES].updated || perf_events[EV_ID_REF_CPU_CYCLES].updated)) { + if(unlikely(!cpu_cycles_chart_generated)) { + cpu_cycles_chart_generated = 1; + + printf("CHART %s.%s '' 'CPU cycles' 'cycles/s' %s '' line %d %d %s\n" + , RRD_TYPE_PERF + , "cpu_cycles" + , RRD_FAMILY_HW + 
, NETDATA_CHART_PRIO_PERF_CPU_CYCLES + , update_every + , PLUGIN_PERF_NAME + ); + printf("DIMENSION %s '' absolute 1 1\n", "cpu"); + printf("DIMENSION %s '' absolute 1 1\n", "ref_cpu"); + } + + printf( + "BEGIN %s.%s\n" + , RRD_TYPE_PERF + , "cpu_cycles" + ); + if(likely(perf_events[EV_ID_CPU_CYCLES].updated)) { + printf( + "SET %s = %lld\n" + , "cpu" + , (collected_number) perf_events[EV_ID_CPU_CYCLES].value + ); + } + if(likely(perf_events[EV_ID_REF_CPU_CYCLES].updated)) { + printf( + "SET %s = %lld\n" + , "ref_cpu" + , (collected_number) perf_events[EV_ID_REF_CPU_CYCLES].value + ); + } + printf("END\n"); + } + + // ------------------------------------------------------------------------ + + if(likely(perf_events[EV_ID_INSTRUCTIONS].updated)) { + if(unlikely(!instructions_chart_generated)) { + instructions_chart_generated = 1; + + printf("CHART %s.%s '' 'Instructions' 'instructions/s' %s '' line %d %d %s\n" + , RRD_TYPE_PERF + , "instructions" + , RRD_FAMILY_HW + , NETDATA_CHART_PRIO_PERF_INSTRUCTIONS + , update_every + , PLUGIN_PERF_NAME + ); + printf("DIMENSION %s '' absolute 1 1\n", "instructions"); + } + + printf( + "BEGIN %s.%s\n" + , RRD_TYPE_PERF + , "instructions" + ); + printf( + "SET %s = %lld\n" + , "instructions" + , (collected_number) perf_events[EV_ID_INSTRUCTIONS].value + ); + printf("END\n"); + } + + // ------------------------------------------------------------------------ + + if(likely(perf_events[EV_ID_BRANCH_INSTRUCTIONS].updated || perf_events[EV_ID_BRANCH_MISSES].updated)) { + if(unlikely(!branch_chart_generated)) { + branch_chart_generated = 1; + + printf("CHART %s.%s '' 'Branch instructions' 'instructions/s' %s '' line %d %d %s\n" + , RRD_TYPE_PERF + , "branch_instructions" + , RRD_FAMILY_HW + , NETDATA_CHART_PRIO_PERF_BRANCH_INSTRUSTIONS + , update_every + , PLUGIN_PERF_NAME + ); + printf("DIMENSION %s '' absolute 1 1\n", "instructions"); + printf("DIMENSION %s '' absolute 1 1\n", "misses"); + } + + printf( + "BEGIN %s.%s\n" + , 
RRD_TYPE_PERF + , "branch_instructions" + ); + if(likely(perf_events[EV_ID_BRANCH_INSTRUCTIONS].updated)) { + printf( + "SET %s = %lld\n" + , "instructions" + , (collected_number) perf_events[EV_ID_BRANCH_INSTRUCTIONS].value + ); + } + if(likely(perf_events[EV_ID_BRANCH_MISSES].updated)) { + printf( + "SET %s = %lld\n" + , "misses" + , (collected_number) perf_events[EV_ID_BRANCH_MISSES].value + ); + } + printf("END\n"); + } + + // ------------------------------------------------------------------------ + + if(likely(perf_events[EV_ID_CACHE_REFERENCES].updated || perf_events[EV_ID_CACHE_MISSES].updated)) { + if(unlikely(!cache_chart_generated)) { + cache_chart_generated = 1; + + printf("CHART %s.%s '' 'Cache operations' 'operations/s' %s '' line %d %d %s\n" + , RRD_TYPE_PERF + , "cache" + , RRD_FAMILY_HW + , NETDATA_CHART_PRIO_PERF_CACHE + , update_every + , PLUGIN_PERF_NAME + ); + printf("DIMENSION %s '' absolute 1 1\n", "references"); + printf("DIMENSION %s '' absolute 1 1\n", "misses"); + } + + printf( + "BEGIN %s.%s\n" + , RRD_TYPE_PERF + , "cache" + ); + if(likely(perf_events[EV_ID_CACHE_REFERENCES].updated)) { + printf( + "SET %s = %lld\n" + , "references" + , (collected_number) perf_events[EV_ID_CACHE_REFERENCES].value + ); + } + if(likely(perf_events[EV_ID_CACHE_MISSES].updated)) { + printf( + "SET %s = %lld\n" + , "misses" + , (collected_number) perf_events[EV_ID_CACHE_MISSES].value + ); + } + printf("END\n"); + } + + // ------------------------------------------------------------------------ + + if(likely(perf_events[EV_ID_BUS_CYCLES].updated)) { + if(unlikely(!bus_cycles_chart_generated)) { + bus_cycles_chart_generated = 1; + + printf("CHART %s.%s '' 'Bus cycles' 'cycles/s' %s '' line %d %d %s\n" + , RRD_TYPE_PERF + , "bus_cycles" + , RRD_FAMILY_HW + , NETDATA_CHART_PRIO_PERF_BUS_CYCLES + , update_every + , PLUGIN_PERF_NAME + ); + printf("DIMENSION %s '' absolute 1 1\n", "bus"); + } + + printf( + "BEGIN %s.%s\n" + , RRD_TYPE_PERF + , "bus_cycles" + ); + 
printf( + "SET %s = %lld\n" + , "bus" + , (collected_number) perf_events[EV_ID_BUS_CYCLES].value + ); + printf("END\n"); + } + + // ------------------------------------------------------------------------ + + if(likely(perf_events[EV_ID_STALLED_CYCLES_FRONTEND].updated || perf_events[EV_ID_STALLED_CYCLES_BACKEND].updated)) { + if(unlikely(!stalled_cycles_chart_generated)) { + stalled_cycles_chart_generated = 1; + + printf("CHART %s.%s '' 'Stalled frontend and backend cycles' 'cycles/s' %s '' line %d %d %s\n" + , RRD_TYPE_PERF + , "stalled_cycles" + , RRD_FAMILY_HW + , NETDATA_CHART_PRIO_PERF_FRONT_BACK_CYCLES + , update_every + , PLUGIN_PERF_NAME + ); + printf("DIMENSION %s '' absolute 1 1\n", "frontend"); + printf("DIMENSION %s '' absolute 1 1\n", "backend"); + } + + printf( + "BEGIN %s.%s\n" + , RRD_TYPE_PERF + , "stalled_cycles" + ); + if(likely(perf_events[EV_ID_STALLED_CYCLES_FRONTEND].updated)) { + printf( + "SET %s = %lld\n" + , "frontend" + , (collected_number) perf_events[EV_ID_STALLED_CYCLES_FRONTEND].value + ); + } + if(likely(perf_events[EV_ID_STALLED_CYCLES_BACKEND].updated)) { + printf( + "SET %s = %lld\n" + , "backend" + , (collected_number) perf_events[EV_ID_STALLED_CYCLES_BACKEND].value + ); + } + printf("END\n"); + } + + // ------------------------------------------------------------------------ + + if(likely(perf_events[EV_ID_CPU_MIGRATIONS].updated)) { + if(unlikely(!migrations_chart_generated)) { + migrations_chart_generated = 1; + + printf("CHART %s.%s '' 'CPU migrations' 'migrations' %s '' line %d %d %s\n" + , RRD_TYPE_PERF + , "migrations" + , RRD_FAMILY_SW + , NETDATA_CHART_PRIO_PERF_MIGRATIONS + , update_every + , PLUGIN_PERF_NAME + ); + printf("DIMENSION %s '' absolute 1 1\n", "migrations"); + } + + printf( + "BEGIN %s.%s\n" + , RRD_TYPE_PERF + , "migrations" + ); + printf( + "SET %s = %lld\n" + , "migrations" + , (collected_number) perf_events[EV_ID_CPU_MIGRATIONS].value + ); + printf("END\n"); + } + + // 
------------------------------------------------------------------------ + + if(likely(perf_events[EV_ID_ALIGNMENT_FAULTS].updated)) { + if(unlikely(!alighnment_chart_generated)) { + alighnment_chart_generated = 1; + + printf("CHART %s.%s '' 'Alighnment faults' 'faults' %s '' line %d %d %s\n" + , RRD_TYPE_PERF + , "alighnment_faults" + , RRD_FAMILY_SW + , NETDATA_CHART_PRIO_PERF_ALIGNMENT + , update_every + , PLUGIN_PERF_NAME + ); + printf("DIMENSION %s '' absolute 1 1\n", "faults"); + } + + printf( + "BEGIN %s.%s\n" + , RRD_TYPE_PERF + , "alighnment_faults" + ); + printf( + "SET %s = %lld\n" + , "faults" + , (collected_number) perf_events[EV_ID_ALIGNMENT_FAULTS].value + ); + printf("END\n"); + } + + // ------------------------------------------------------------------------ + + if(likely(perf_events[EV_ID_EMULATION_FAULTS].updated)) { + if(unlikely(!emulation_chart_generated)) { + emulation_chart_generated = 1; + + printf("CHART %s.%s '' 'Emulation faults' 'faults' %s '' line %d %d %s\n" + , RRD_TYPE_PERF + , "emulation_faults" + , RRD_FAMILY_SW + , NETDATA_CHART_PRIO_PERF_EMULATION + , update_every + , PLUGIN_PERF_NAME + ); + printf("DIMENSION %s '' absolute 1 1\n", "faults"); + } + + printf( + "BEGIN %s.%s\n" + , RRD_TYPE_PERF + , "emulation_faults" + ); + printf( + "SET %s = %lld\n" + , "faults" + , (collected_number) perf_events[EV_ID_EMULATION_FAULTS].value + ); + printf("END\n"); + } + + // ------------------------------------------------------------------------ + + if(likely(perf_events[EV_ID_L1D_READ_ACCESS].updated || perf_events[EV_ID_L1D_READ_MISS].updated + || perf_events[EV_ID_L1D_WRITE_ACCESS].updated || perf_events[EV_ID_L1D_WRITE_MISS].updated)) { + if(unlikely(!L1D_chart_generated)) { + L1D_chart_generated = 1; + + printf("CHART %s.%s '' 'L1D cache operations' 'events/s' %s '' line %d %d %s\n" + , RRD_TYPE_PERF + , "l1d_cache" + , RRD_FAMILY_CACHE + , NETDATA_CHART_PRIO_PERF_L1D + , update_every + , PLUGIN_PERF_NAME + ); + printf("DIMENSION %s '' 
absolute 1 1\n", "read_access"); + printf("DIMENSION %s '' absolute 1 1\n", "read_misses"); + printf("DIMENSION %s '' absolute -1 1\n", "write_access"); + printf("DIMENSION %s '' absolute -1 1\n", "write_misses"); + } + + printf( + "BEGIN %s.%s\n" + , RRD_TYPE_PERF + , "l1d_cache" + ); + if(likely(perf_events[EV_ID_L1D_READ_ACCESS].updated)) { + printf( + "SET %s = %lld\n" + , "read_access" + , (collected_number) perf_events[EV_ID_L1D_READ_ACCESS].value + ); + } + if(likely(perf_events[EV_ID_L1D_READ_MISS].updated)) { + printf( + "SET %s = %lld\n" + , "read_misses" + , (collected_number) perf_events[EV_ID_L1D_READ_MISS].value + ); + } + if(likely(perf_events[EV_ID_L1D_WRITE_ACCESS].updated)) { + printf( + "SET %s = %lld\n" + , "write_access" + , (collected_number) perf_events[EV_ID_L1D_WRITE_ACCESS].value + ); + } + if(likely(perf_events[EV_ID_L1D_WRITE_MISS].updated)) { + printf( + "SET %s = %lld\n" + , "write_misses" + , (collected_number) perf_events[EV_ID_L1D_WRITE_MISS].value + ); + } + printf("END\n"); + } + + // ------------------------------------------------------------------------ + + if(likely(perf_events[EV_ID_L1D_PREFETCH_ACCESS].updated)) { + if(unlikely(!L1D_prefetch_chart_generated)) { + L1D_prefetch_chart_generated = 1; + + printf("CHART %s.%s '' 'L1D prefetch cache operations' 'prefetches/s' %s '' line %d %d %s\n" + , RRD_TYPE_PERF + , "l1d_cache_prefetch" + , RRD_FAMILY_CACHE + , NETDATA_CHART_PRIO_PERF_L1D_PREFETCH + , update_every + , PLUGIN_PERF_NAME + ); + printf("DIMENSION %s '' absolute 1 1\n", "prefetches"); + } + + printf( + "BEGIN %s.%s\n" + , RRD_TYPE_PERF + , "l1d_cache_prefetch" + ); + printf( + "SET %s = %lld\n" + , "prefetches" + , (collected_number) perf_events[EV_ID_L1D_PREFETCH_ACCESS].value + ); + printf("END\n"); + } + + // ------------------------------------------------------------------------ + + if(likely(perf_events[EV_ID_L1I_READ_ACCESS].updated || perf_events[EV_ID_L1I_READ_MISS].updated)) { + 
if(unlikely(!L1I_chart_generated)) { + L1I_chart_generated = 1; + + printf("CHART %s.%s '' 'L1I cache operations' 'events/s' %s '' line %d %d %s\n" + , RRD_TYPE_PERF + , "l1i_cache" + , RRD_FAMILY_CACHE + , NETDATA_CHART_PRIO_PERF_L1I + , update_every + , PLUGIN_PERF_NAME + ); + printf("DIMENSION %s '' absolute 1 1\n", "read_access"); + printf("DIMENSION %s '' absolute 1 1\n", "read_misses"); + } + + printf( + "BEGIN %s.%s\n" + , RRD_TYPE_PERF + , "l1i_cache" + ); + if(likely(perf_events[EV_ID_L1I_READ_ACCESS].updated)) { + printf( + "SET %s = %lld\n" + , "read_access" + , (collected_number) perf_events[EV_ID_L1I_READ_ACCESS].value + ); + } + if(likely(perf_events[EV_ID_L1I_READ_MISS].updated)) { + printf( + "SET %s = %lld\n" + , "read_misses" + , (collected_number) perf_events[EV_ID_L1I_READ_MISS].value + ); + } + printf("END\n"); + } + + // ------------------------------------------------------------------------ + + if(likely(perf_events[EV_ID_LL_READ_ACCESS].updated || perf_events[EV_ID_LL_READ_MISS].updated + || perf_events[EV_ID_LL_WRITE_ACCESS].updated || perf_events[EV_ID_LL_WRITE_MISS].updated)) { + if(unlikely(!LL_chart_generated)) { + LL_chart_generated = 1; + + printf("CHART %s.%s '' 'LL cache operations' 'events/s' %s '' line %d %d %s\n" + , RRD_TYPE_PERF + , "ll_cache" + , RRD_FAMILY_CACHE + , NETDATA_CHART_PRIO_PERF_LL + , update_every + , PLUGIN_PERF_NAME + ); + printf("DIMENSION %s '' absolute 1 1\n", "read_access"); + printf("DIMENSION %s '' absolute 1 1\n", "read_misses"); + printf("DIMENSION %s '' absolute -1 1\n", "write_access"); + printf("DIMENSION %s '' absolute -1 1\n", "write_misses"); + } + + printf( + "BEGIN %s.%s\n" + , RRD_TYPE_PERF + , "ll_cache" + ); + if(likely(perf_events[EV_ID_LL_READ_ACCESS].updated)) { + printf( + "SET %s = %lld\n" + , "read_access" + , (collected_number) perf_events[EV_ID_LL_READ_ACCESS].value + ); + } + if(likely(perf_events[EV_ID_LL_READ_MISS].updated)) { + printf( + "SET %s = %lld\n" + , "read_misses" + , 
(collected_number) perf_events[EV_ID_LL_READ_MISS].value + ); + } + if(likely(perf_events[EV_ID_LL_WRITE_ACCESS].updated)) { + printf( + "SET %s = %lld\n" + , "write_access" + , (collected_number) perf_events[EV_ID_LL_WRITE_ACCESS].value + ); + } + if(likely(perf_events[EV_ID_LL_WRITE_MISS].updated)) { + printf( + "SET %s = %lld\n" + , "write_misses" + , (collected_number) perf_events[EV_ID_LL_WRITE_MISS].value + ); + } + printf("END\n"); + } + + // ------------------------------------------------------------------------ + + if(likely(perf_events[EV_ID_DTLB_READ_ACCESS].updated || perf_events[EV_ID_DTLB_READ_MISS].updated + || perf_events[EV_ID_DTLB_WRITE_ACCESS].updated || perf_events[EV_ID_DTLB_WRITE_MISS].updated)) { + if(unlikely(!DTLB_chart_generated)) { + DTLB_chart_generated = 1; + + printf("CHART %s.%s '' 'DTLB cache operations' 'events/s' %s '' line %d %d %s\n" + , RRD_TYPE_PERF + , "dtlb_cache" + , RRD_FAMILY_CACHE + , NETDATA_CHART_PRIO_PERF_DTLB + , update_every + , PLUGIN_PERF_NAME + ); + printf("DIMENSION %s '' absolute 1 1\n", "read_access"); + printf("DIMENSION %s '' absolute 1 1\n", "read_misses"); + printf("DIMENSION %s '' absolute -1 1\n", "write_access"); + printf("DIMENSION %s '' absolute -1 1\n", "write_misses"); + } + + printf( + "BEGIN %s.%s\n" + , RRD_TYPE_PERF + , "dtlb_cache" + ); + if(likely(perf_events[EV_ID_DTLB_READ_ACCESS].updated)) { + printf( + "SET %s = %lld\n" + , "read_access" + , (collected_number) perf_events[EV_ID_DTLB_READ_ACCESS].value + ); + } + if(likely(perf_events[EV_ID_DTLB_READ_MISS].updated)) { + printf( + "SET %s = %lld\n" + , "read_misses" + , (collected_number) perf_events[EV_ID_DTLB_READ_MISS].value + ); + } + if(likely(perf_events[EV_ID_DTLB_WRITE_ACCESS].updated)) { + printf( + "SET %s = %lld\n" + , "write_access" + , (collected_number) perf_events[EV_ID_DTLB_WRITE_ACCESS].value + ); + } + if(likely(perf_events[EV_ID_DTLB_WRITE_MISS].updated)) { + printf( + "SET %s = %lld\n" + , "write_misses" + , 
(collected_number) perf_events[EV_ID_DTLB_WRITE_MISS].value + ); + } + printf("END\n"); + } + + // ------------------------------------------------------------------------ + + if(likely(perf_events[EV_ID_ITLB_READ_ACCESS].updated || perf_events[EV_ID_ITLB_READ_MISS].updated)) { + if(unlikely(!ITLB_chart_generated)) { + ITLB_chart_generated = 1; + + printf("CHART %s.%s '' 'ITLB cache operations' 'events/s' %s '' line %d %d %s\n" + , RRD_TYPE_PERF + , "itlb_cache" + , RRD_FAMILY_CACHE + , NETDATA_CHART_PRIO_PERF_ITLB + , update_every + , PLUGIN_PERF_NAME + ); + printf("DIMENSION %s '' absolute 1 1\n", "read_access"); + printf("DIMENSION %s '' absolute 1 1\n", "read_misses"); + } + + printf( + "BEGIN %s.%s\n" + , RRD_TYPE_PERF + , "itlb_cache" + ); + if(likely(perf_events[EV_ID_ITLB_READ_ACCESS].updated)) { + printf( + "SET %s = %lld\n" + , "read_access" + , (collected_number) perf_events[EV_ID_ITLB_READ_ACCESS].value + ); + } + if(likely(perf_events[EV_ID_ITLB_READ_MISS].updated)) { + printf( + "SET %s = %lld\n" + , "read_misses" + , (collected_number) perf_events[EV_ID_ITLB_READ_MISS].value + ); + } + printf("END\n"); + } + + // ------------------------------------------------------------------------ + + if(likely(perf_events[EV_ID_PBU_READ_ACCESS].updated)) { + if(unlikely(!PBU_chart_generated)) { + PBU_chart_generated = 1; + + printf("CHART %s.%s '' 'PBU cache operations' 'events/s' %s '' line %d %d %s\n" + , RRD_TYPE_PERF + , "pbu_cache" + , RRD_FAMILY_CACHE + , NETDATA_CHART_PRIO_PERF_PBU + , update_every + , PLUGIN_PERF_NAME + ); + printf("DIMENSION %s '' absolute 1 1\n", "read_access"); + } + + printf( + "BEGIN %s.%s\n" + , RRD_TYPE_PERF + , "pbu_cache" + ); + printf( + "SET %s = %lld\n" + , "read_access" + , (collected_number) perf_events[EV_ID_PBU_READ_ACCESS].value + ); + printf("END\n"); + } +} + +void parse_command_line(int argc, char **argv) { + int i, plugin_enabled = 0; + + for(i = 1; i < argc ; i++) { + if(isdigit(*argv[i]) && !freq) { + int n = 
str2i(argv[i]); + if(n > 0 && n < 86400) { + freq = n; + continue; + } + } + else if(strcmp("version", argv[i]) == 0 || strcmp("-version", argv[i]) == 0 || strcmp("--version", argv[i]) == 0 || strcmp("-v", argv[i]) == 0 || strcmp("-V", argv[i]) == 0) { + printf("perf.plugin %s\n", VERSION); + exit(0); + } + else if(strcmp("all", argv[i]) == 0) { + struct perf_event *current_event = NULL; + + for(current_event = &perf_events[0]; current_event->id != EV_ID_END; current_event++) + current_event->disabled = 0; + + plugin_enabled = 1; + continue; + } + else if(strcmp("cycles", argv[i]) == 0) { + perf_events[EV_ID_CPU_CYCLES].disabled = 0; + perf_events[EV_ID_REF_CPU_CYCLES].disabled = 0; + plugin_enabled = 1; + continue; + } + else if(strcmp("instructions", argv[i]) == 0) { + perf_events[EV_ID_INSTRUCTIONS].disabled = 0; + plugin_enabled = 1; + continue; + } + else if(strcmp("branch", argv[i]) == 0) { + perf_events[EV_ID_BRANCH_INSTRUCTIONS].disabled = 0; + perf_events[EV_ID_BRANCH_MISSES].disabled = 0; + plugin_enabled = 1; + continue; + } + else if(strcmp("cache", argv[i]) == 0) { + perf_events[EV_ID_CACHE_REFERENCES].disabled = 0; + perf_events[EV_ID_CACHE_MISSES].disabled = 0; + plugin_enabled = 1; + continue; + } + else if(strcmp("bus", argv[i]) == 0) { + perf_events[EV_ID_BUS_CYCLES].disabled = 0; + plugin_enabled = 1; + continue; + } + else if(strcmp("stalled", argv[i]) == 0) { + perf_events[EV_ID_STALLED_CYCLES_FRONTEND].disabled = 0; + perf_events[EV_ID_STALLED_CYCLES_BACKEND].disabled = 0; + plugin_enabled = 1; + continue; + } + else if(strcmp("migrations", argv[i]) == 0) { + perf_events[EV_ID_CPU_MIGRATIONS].disabled = 0; + plugin_enabled = 1; + continue; + } + else if(strcmp("alighnment", argv[i]) == 0) { + perf_events[EV_ID_ALIGNMENT_FAULTS].disabled = 0; + plugin_enabled = 1; + continue; + } + else if(strcmp("emulation", argv[i]) == 0) { + perf_events[EV_ID_EMULATION_FAULTS].disabled = 0; + plugin_enabled = 1; + continue; + } + else if(strcmp("L1D", 
argv[i]) == 0) { + perf_events[EV_ID_L1D_READ_ACCESS].disabled = 0; + perf_events[EV_ID_L1D_READ_MISS].disabled = 0; + perf_events[EV_ID_L1D_WRITE_ACCESS].disabled = 0; + perf_events[EV_ID_L1D_WRITE_MISS].disabled = 0; + plugin_enabled = 1; + continue; + } + else if(strcmp("L1D-prefetch", argv[i]) == 0) { + perf_events[EV_ID_L1D_PREFETCH_ACCESS].disabled = 0; + plugin_enabled = 1; + continue; + } + else if(strcmp("L1I", argv[i]) == 0) { + perf_events[EV_ID_L1I_READ_ACCESS].disabled = 0; + perf_events[EV_ID_L1I_READ_MISS].disabled = 0; + plugin_enabled = 1; + continue; + } + else if(strcmp("LL", argv[i]) == 0) { + perf_events[EV_ID_LL_READ_ACCESS].disabled = 0; + perf_events[EV_ID_LL_READ_MISS].disabled = 0; + perf_events[EV_ID_LL_WRITE_ACCESS].disabled = 0; + perf_events[EV_ID_LL_WRITE_MISS].disabled = 0; + plugin_enabled = 1; + continue; + } + else if(strcmp("DTLB", argv[i]) == 0) { + perf_events[EV_ID_DTLB_READ_ACCESS].disabled = 0; + perf_events[EV_ID_DTLB_READ_MISS].disabled = 0; + perf_events[EV_ID_DTLB_WRITE_ACCESS].disabled = 0; + perf_events[EV_ID_DTLB_WRITE_MISS].disabled = 0; + plugin_enabled = 1; + continue; + } + else if(strcmp("ITLB", argv[i]) == 0) { + perf_events[EV_ID_ITLB_READ_ACCESS].disabled = 0; + perf_events[EV_ID_ITLB_READ_MISS].disabled = 0; + plugin_enabled = 1; + continue; + } + else if(strcmp("PBU", argv[i]) == 0) { + perf_events[EV_ID_PBU_READ_ACCESS].disabled = 0; + plugin_enabled = 1; + continue; + } + else if(strcmp("debug", argv[i]) == 0) { + debug = 1; + continue; + } + else if(strcmp("-h", argv[i]) == 0 || strcmp("--help", argv[i]) == 0) { + fprintf(stderr, + "\n" + " netdata perf.plugin %s\n" + " Copyright (C) 2019 Netdata Inc.\n" + " Released under GNU General Public License v3 or later.\n" + " All rights reserved.\n" + "\n" + " This program is a data collector plugin for netdata.\n" + "\n" + " Available command line options:\n" + "\n" + " COLLECTION_FREQUENCY data collection frequency in seconds\n" + " minimum: %d\n" + "\n" + " 
all enable all charts\n" + "\n" + " cycles enable CPU cycles chart\n" + "\n" + " instructions enable Instructions chart\n" + "\n" + " branch enable Branch instructions chart\n" + "\n" + " cache enable Cache operations chart\n" + "\n" + " bus enable Bus cycles chart\n" + "\n" + " stalled enable Stalled frontend and backend cycles chart\n" + "\n" + " migrations enable CPU migrations chart\n" + "\n" + " alighnment enable Alignment faults chart\n" + "\n" + " emulation enable Emulation faults chart\n" + "\n" + " L1D enable L1D cache operations chart\n" + "\n" + " L1D-prefetch enable L1D prefetch cache operations chart\n" + "\n" + " L1I enable L1I cache operations chart\n" + "\n" + " LL enable LL cache operations chart\n" + "\n" + " DTLB enable DTLB cache operations chart\n" + "\n" + " ITLB enable ITLB cache operations chart\n" + "\n" + " PBU enable PBU cache operations chart\n" + "\n" + " debug enable verbose output\n" + " default: disabled\n" + "\n" + " -v\n" + " -V\n" + " --version print version and exit\n" + "\n" + " -h\n" + " --help print this message and exit\n" + "\n" + " For more information:\n" + " https://github.com/netdata/netdata/tree/master/collectors/perf.plugin\n" + "\n" + , VERSION + , update_every + ); + exit(1); + } + + error("ignoring parameter '%s'", argv[i]); + } + + if(!plugin_enabled){ + info("no charts enabled - nothing to do."); + printf("DISABLE\n"); + exit(1); + } +} + +int main(int argc, char **argv) { + + // ------------------------------------------------------------------------ + // initialization of netdata plugin + + program_name = "perf.plugin"; + + // disable syslog + error_log_syslog = 0; + + // set errors flood protection to 100 logs per hour + error_log_errors_per_period = 100; + error_log_throttle_period = 3600; + + parse_command_line(argc, argv); + + errno = 0; + + if(freq >= update_every) + update_every = freq; + else if(freq) + error("update frequency %d seconds is too small for PERF. 
Using %d.", freq, update_every); + + if(unlikely(debug)) fprintf(stderr, "perf.plugin: calling perf_init()\n"); + int perf = !perf_init(); + + // ------------------------------------------------------------------------ + // the main loop + + if(unlikely(debug)) fprintf(stderr, "perf.plugin: starting data collection\n"); + + time_t started_t = now_monotonic_sec(); + + size_t iteration; + usec_t step = update_every * USEC_PER_SEC; + + heartbeat_t hb; + heartbeat_init(&hb); + for(iteration = 0; 1; iteration++) { + usec_t dt = heartbeat_next(&hb, step); + + if(unlikely(netdata_exit)) break; + + if(unlikely(debug && iteration)) + fprintf(stderr, "perf.plugin: iteration %zu, dt %llu usec\n" + , iteration + , dt + ); + + if(likely(perf)) { + if(unlikely(debug)) fprintf(stderr, "perf.plugin: calling perf_collect()\n"); + perf = !perf_collect(); + + if(likely(perf)) { + if(unlikely(debug)) fprintf(stderr, "perf.plugin: calling perf_send_metrics()\n"); + perf_send_metrics(); + } + } + + fflush(stdout); + + // restart check (14400 seconds) + if(now_monotonic_sec() - started_t > 14400) break; + } + + info("process exiting"); + perf_free(); +} diff --git a/collectors/plugins.d/README.md b/collectors/plugins.d/README.md index 105a60eb..9134d516 100644 --- a/collectors/plugins.d/README.md +++ b/collectors/plugins.d/README.md @@ -15,6 +15,7 @@ plugin|language|O/S|description [freeipmi.plugin](../freeipmi.plugin/)|`C`|linux|collects metrics from enterprise hardware sensors, on Linux servers. [nfacct.plugin](../nfacct.plugin/)|`C`|linux|collects netfilter firewall, connection tracker and accounting metrics using `libmnl` and `libnetfilter_acct`. [xenstat.plugin](../xenstat.plugin/)|`C`|linux|collects XenServer and XCP-ng metrics using `lxenstat`. +[perf.plugin](../perf.plugin/)|`C`|linux|collects CPU performance metrics using performance monitoring units (PMU). 
[node.d.plugin](../node.d.plugin/)|`node.js`|all|a **plugin orchestrator** for data collection modules written in `node.js`. [python.d.plugin](../python.d.plugin/)|`python`|all|a **plugin orchestrator** for data collection modules written in `python` v2 or v3 (both are supported). diff --git a/collectors/plugins.d/plugins_d.c b/collectors/plugins.d/plugins_d.c index 024dd292..66ec5d0e 100644 --- a/collectors/plugins.d/plugins_d.c +++ b/collectors/plugins.d/plugins_d.c @@ -154,7 +154,12 @@ inline size_t pluginsd_process(RRDHOST *host, struct plugind *cd, FILE *fp, int char *r = fgets(line, PLUGINSD_LINE_MAX, fp); if(unlikely(!r)) { - error("read failed"); + if(feof(fp)) + error("read failed: end of file"); + else if(ferror(fp)) + error("read failed: input error"); + else + error("read failed: unknown error"); break; } diff --git a/collectors/proc.plugin/README.md b/collectors/proc.plugin/README.md index 37dc19f8..cacde84f 100644 --- a/collectors/proc.plugin/README.md +++ b/collectors/proc.plugin/README.md @@ -75,7 +75,7 @@ netdata will automatically set the name of disks on the dashboard, from the moun ### performance metrics -By default netdata will enable monitoring metrics only when they are not zero. If they are constantly zero they are ignored. Metrics that will start having values, after netdata is started, will be detected and charts will be automatically added to the dashboard (a refresh of the dashboard is needed for them to appear though). +By default, Netdata will enable monitoring metrics only when they are not zero. If they are constantly zero they are ignored. Metrics that will start having values, after netdata is started, will be detected and charts will be automatically added to the dashboard (a refresh of the dashboard is needed for them to appear though). Set `yes` for a chart instead of `auto` to enable it permanently. 
netdata categorizes all block devices in 3 categories: diff --git a/collectors/proc.plugin/proc_mdstat.c b/collectors/proc.plugin/proc_mdstat.c index d0925ec3..5c29d31c 100644 --- a/collectors/proc.plugin/proc_mdstat.c +++ b/collectors/proc.plugin/proc_mdstat.c @@ -13,7 +13,7 @@ struct raid { unsigned long long failed_disks;
RRDSET *st_disks;
- RRDDIM *rd_total;
+ RRDDIM *rd_down;
RRDDIM *rd_inuse;
unsigned long long total_disks;
unsigned long long inuse_disks;
@@ -439,11 +439,11 @@ int do_proc_mdstat(int update_every, usec_t dt) { if(unlikely(!raid->rd_inuse && !(raid->rd_inuse = rrddim_find(raid->st_disks, "inuse"))))
raid->rd_inuse = rrddim_add(raid->st_disks, "inuse", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
- if(unlikely(!raid->rd_total && !(raid->rd_total = rrddim_find(raid->st_disks, "total"))))
- raid->rd_total = rrddim_add(raid->st_disks, "total", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+ if(unlikely(!raid->rd_down && !(raid->rd_down = rrddim_find(raid->st_disks, "down"))))
+ raid->rd_down = rrddim_add(raid->st_disks, "down", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
rrddim_set_by_pointer(raid->st_disks, raid->rd_inuse, raid->inuse_disks);
- rrddim_set_by_pointer(raid->st_disks, raid->rd_total, raid->total_disks);
+ rrddim_set_by_pointer(raid->st_disks, raid->rd_down, raid->failed_disks);
rrdset_done(raid->st_disks);
}
diff --git a/collectors/python.d.plugin/Makefile.am b/collectors/python.d.plugin/Makefile.am index 652a35da..ad72cfae 100644 --- a/collectors/python.d.plugin/Makefile.am +++ b/collectors/python.d.plugin/Makefile.am @@ -87,6 +87,7 @@ include rabbitmq/Makefile.inc include redis/Makefile.inc include rethinkdbs/Makefile.inc include retroshare/Makefile.inc +include riakkv/Makefile.inc include samba/Makefile.inc include sensors/Makefile.inc include smartd_log/Makefile.inc diff --git a/collectors/python.d.plugin/README.md b/collectors/python.d.plugin/README.md index 8955197a..32437c6d 100644 --- a/collectors/python.d.plugin/README.md +++ b/collectors/python.d.plugin/README.md @@ -150,7 +150,7 @@ Classes implement `_get_raw_data` which should be used to grab raw data. This me _This is last resort class, if a new module cannot be written by using other framework class this one can be used._ -_Example: `mysql`, `sensors`_ +_Example: `ceph`, `sensors`_ It is the lowest-level class which implements most of module logic, like: - threading diff --git a/collectors/python.d.plugin/adaptec_raid/adaptec_raid.chart.py b/collectors/python.d.plugin/adaptec_raid/adaptec_raid.chart.py index 052c9314..3fcb5fda 100644 --- a/collectors/python.d.plugin/adaptec_raid/adaptec_raid.chart.py +++ b/collectors/python.d.plugin/adaptec_raid/adaptec_raid.chart.py @@ -56,8 +56,8 @@ GOOD_PD_STATUS = ( ) RE_LD = re.compile( - r'Logical device number\s+([0-9]+).*?' - r'Status of logical device\s+: ([a-zA-Z]+)' + r'Logical [dD]evice number\s+([0-9]+).*?' 
+ r'Status of [lL]ogical [dD]evice\s+: ([a-zA-Z]+)' ) diff --git a/collectors/python.d.plugin/dns_query_time/dns_query_time.chart.py b/collectors/python.d.plugin/dns_query_time/dns_query_time.chart.py index 47a7d23f..7fe86031 100644 --- a/collectors/python.d.plugin/dns_query_time/dns_query_time.chart.py +++ b/collectors/python.d.plugin/dns_query_time/dns_query_time.chart.py @@ -8,11 +8,6 @@ from socket import getaddrinfo, gaierror from threading import Thread try: - from time import monotonic as time -except ImportError: - from time import time - -try: import dns.message import dns.query import dns.name @@ -89,13 +84,15 @@ def dns_request(server_list, timeout, domains): request = dns.message.make_query(domain, dns.rdatatype.A) try: - dns_start = time() - dns.query.udp(request, ns, timeout=t) - dns_end = time() - query_time = round((dns_end - dns_start) * 1000) - q.put({'_'.join(['ns', ns.replace('.', '_')]): query_time}) + resp = dns.query.udp(request, ns, timeout=t) + if (resp.rcode() == dns.rcode.NOERROR and resp.answer): + query_time = resp.time * 1000 + else: + query_time = -100 except dns.exception.Timeout: - q.put({'_'.join(['ns', ns.replace('.', '_')]): -100}) + query_time = -100 + finally: + q.put({'_'.join(['ns', ns.replace('.', '_')]): query_time}) for server in server_list: th = Thread(target=dns_req, args=(server, timeout, que)) diff --git a/collectors/python.d.plugin/elasticsearch/elasticsearch.chart.py b/collectors/python.d.plugin/elasticsearch/elasticsearch.chart.py index 9b3c1284..20109c64 100644 --- a/collectors/python.d.plugin/elasticsearch/elasticsearch.chart.py +++ b/collectors/python.d.plugin/elasticsearch/elasticsearch.chart.py @@ -10,9 +10,9 @@ from collections import namedtuple from socket import gethostbyname, gaierror try: - from queue import Queue + from queue import Queue except ImportError: - from Queue import Queue + from Queue import Queue from bases.FrameworkServices.UrlService import UrlService @@ -83,11 +83,11 @@ NODE_STATS = [ ] 
CLUSTER_STATS = [ - 'nodes.count.data_only', - 'nodes.count.master_data', + 'nodes.count.data', + 'nodes.count.master', 'nodes.count.total', - 'nodes.count.master_only', - 'nodes.count.client', + 'nodes.count.coordinating_only', + 'nodes.count.ingest', 'indices.docs.count', 'indices.query_cache.hit_count', 'indices.query_cache.miss_count', @@ -371,7 +371,7 @@ CHARTS = { }, 'cluster_health_nodes': { 'options': [None, 'Nodes Statistics', 'nodes', 'cluster health API', - 'elastic.cluster_health_nodes', 'stacked'], + 'elastic.cluster_health_nodes', 'area'], 'lines': [ ['number_of_nodes', 'nodes', 'absolute'], ['number_of_data_nodes', 'data_nodes', 'absolute'], @@ -417,13 +417,13 @@ CHARTS = { }, 'cluster_stats_nodes': { 'options': [None, 'Nodes Statistics', 'nodes', 'cluster stats API', - 'elastic.cluster_nodes', 'stacked'], + 'elastic.cluster_nodes', 'area'], 'lines': [ - ['nodes_count_data_only', 'data_only', 'absolute'], - ['nodes_count_master_data', 'master_data', 'absolute'], + ['nodes_count_data', 'data', 'absolute'], + ['nodes_count_master', 'master', 'absolute'], ['nodes_count_total', 'total', 'absolute'], - ['nodes_count_master_only', 'master_only', 'absolute'], - ['nodes_count_client', 'client', 'absolute'] + ['nodes_count_ingest', 'ingest', 'absolute'], + ['nodes_count_coordinating_only', 'coordinating_only', 'absolute'] ] }, 'cluster_stats_query_cache': { diff --git a/collectors/python.d.plugin/monit/monit.chart.py b/collectors/python.d.plugin/monit/monit.chart.py index 3ac0032c..9f327057 100644 --- a/collectors/python.d.plugin/monit/monit.chart.py +++ b/collectors/python.d.plugin/monit/monit.chart.py @@ -4,23 +4,49 @@ # SPDX-License-Identifier: GPL-3.0-or-later import xml.etree.ElementTree as ET + +from collections import namedtuple + from bases.FrameworkServices.UrlService import UrlService -# see enum State_Type from monit.h (https://bitbucket.org/tildeslash/monit/src/master/src/monit.h) -MONIT_SERVICE_NAMES = [ - 'Filesystem', - 'Directory', - 'File', - 
'Process', - 'Host', - 'System', - 'Fifo', - 'Program', - 'Net', -] +MonitType = namedtuple('MonitType', ('index', 'name')) + +# see enum Service_Type from monit.h (https://bitbucket.org/tildeslash/monit/src/master/src/monit.h) +# typedef enum { +# Service_Filesystem = 0, +# Service_Directory, +# Service_File, +# Service_Process, +# Service_Host, +# Service_System, +# Service_Fifo, +# Service_Program, +# Service_Net, +# Service_Last = Service_Net +# } __attribute__((__packed__)) Service_Type; -DEFAULT_SERVICES_IDS = [0, 1, 2, 3, 4, 6, 7, 8] +TYPE_FILESYSTEM = MonitType(0, 'filesystem') +TYPE_DIRECTORY = MonitType(1, 'directory') +TYPE_FILE = MonitType(2, 'file') +TYPE_PROCESS = MonitType(3, 'process') +TYPE_HOST = MonitType(4, 'host') +TYPE_SYSTEM = MonitType(5, 'system') +TYPE_FIFO = MonitType(6, 'fifo') +TYPE_PROGRAM = MonitType(7, 'program') +TYPE_NET = MonitType(8, 'net') + +TYPES = ( + TYPE_FILESYSTEM, + TYPE_DIRECTORY, + TYPE_FILE, + TYPE_PROCESS, + TYPE_HOST, + TYPE_SYSTEM, + TYPE_FIFO, + TYPE_PROGRAM, + TYPE_NET, +) # charts order (can be overridden if you want less charts, or different order) ORDER = [ @@ -38,6 +64,7 @@ ORDER = [ 'program', 'net' ] + CHARTS = { 'filesystem': { 'options': ['filesystems', 'Filesystems', 'filesystems', 'filesystem', 'monit.filesystems', 'line'], @@ -83,7 +110,7 @@ CHARTS = { 'lines': [] }, 'host_latency': { - 'options': ['hosts latency', 'Hosts latency', 'milliseconds/s', 'network', 'monit.host_latency', 'line'], + 'options': ['hosts latency', 'Hosts latency', 'milliseconds', 'network', 'monit.host_latency', 'line'], 'lines': [] }, 'net': { @@ -94,85 +121,224 @@ CHARTS = { } +class BaseMonitService(object): + def __init__(self, typ, name, status, monitor): + self.type = typ + self.name = name + self.status = status + self.monitor = monitor + + def __repr__(self): + return 'MonitService({0}:{1})'.format(self.type.name, self.name) + + def __eq__(self, other): + if not isinstance(other, BaseMonitService): + return False + return 
self.type == other.type and self.name == other.name + + def __ne__(self, other): + return not self == other + + def __hash__(self): + return hash(repr(self)) + + def is_running(self): + return self.status == '0' and self.monitor == '1' + + def key(self): + return '{0}_{1}'.format(self.type.name, self.name) + + def data(self): + return {self.key(): int(self.is_running())} + + +class ProcessMonitService(BaseMonitService): + def __init__(self, typ, name, status, monitor): + super(ProcessMonitService, self).__init__(typ, name, status, monitor) + self.uptime = None + self.threads = None + self.children = None + + def uptime_key(self): + return 'process_uptime_{0}'.format(self.name) + + def threads_key(self): + return 'process_threads_{0}'.format(self.name) + + def children_key(self): + return 'process_children_{0}'.format(self.name) + + def data(self): + base_data = super(ProcessMonitService, self).data() + # skipping bugged metrics with negative uptime (monit before v5.16) + uptime = self.uptime if self.uptime and int(self.uptime) >= 0 else None + data = { + self.uptime_key(): uptime, + self.threads_key(): self.threads, + self.children_key(): self.children, + } + data.update(base_data) + + return data + + +class HostMonitService(BaseMonitService): + def __init__(self, typ, name, status, monitor): + super(HostMonitService, self).__init__(typ, name, status, monitor) + self.latency = None + + def latency_key(self): + return 'host_latency_{0}'.format(self.name) + + def data(self): + base_data = super(HostMonitService, self).data() + latency = float(self.latency) * 1000000 if self.latency else None + data = {self.latency_key(): latency} + data.update(base_data) + + return data + + class Service(UrlService): def __init__(self, configuration=None, name=None): UrlService.__init__(self, configuration=configuration, name=name) self.order = ORDER self.definitions = CHARTS - base_url = self.configuration.get('url', 'http://localhost:2812') + base_url = 
self.configuration.get('url', "http://localhost:2812") self.url = '{0}/_status?format=xml&level=full'.format(base_url) + self.active_services = list() - def parse(self, data): + def parse(self, raw): try: - xml = ET.fromstring(data) + root = ET.fromstring(raw) except ET.ParseError: - self.error("URL {0} didn't return a vaild XML page. Please check your settings.".format(self.url)) + self.error("URL {0} didn't return a valid XML page. Please check your settings.".format(self.url)) + return None + return root + + def _get_data(self): + raw = self._get_raw_data() + if not raw: return None - return xml - def check(self): - self._manager = self._build_manager() + root = self.parse(raw) + if root is None: + return None - raw_data = self._get_raw_data() - if not raw_data: + services = self.get_services(root) + if not services: return None - return bool(self.parse(raw_data)) + if len(self.charts) > 0: + self.update_charts(services) - def _get_data(self): - raw_data = self._get_raw_data() + data = dict() - if not raw_data: - return None + for svc in services: + data.update(svc.data()) - xml = self.parse(raw_data) - if not xml: - return None + return data - data = {} - for service_id in DEFAULT_SERVICES_IDS: - service_category = MONIT_SERVICE_NAMES[service_id].lower() + def get_services(self, root): + services = list() - if service_category == 'system': - self.debug("Skipping service from 'System' category, because it's useless in graphs") + for typ in TYPES: + if typ == TYPE_SYSTEM: + self.debug("skipping service from '{0}' category, it's useless in graphs".format(TYPE_SYSTEM.name)) continue - xpath_query = "./service[@type='{0}']".format(service_id) - self.debug('Searching for {0} as {1}'.format(service_category, xpath_query)) - for service_node in xml.findall(xpath_query): - - service_name = service_node.find('name').text - service_status = service_node.find('status').text - service_monitoring = service_node.find('monitor').text - self.debug('=> found {0} with type={1}, 
status={2}, monitoring={3}'.format(service_name, - service_id, service_status, service_monitoring)) - - dimension_key = service_category + '_' + service_name - if dimension_key not in self.charts[service_category]: - self.charts[service_category].add_dimension([dimension_key, service_name, 'absolute']) - data[dimension_key] = 1 if service_status == '0' and service_monitoring == '1' else 0 - - if service_category == 'process': - for subnode in ('uptime', 'threads', 'children'): - subnode_value = service_node.find(subnode) - if subnode_value is None: - continue - if subnode == 'uptime' and int(subnode_value.text) < 0: - self.debug('Skipping bugged metrics with negative uptime (monit before v5.16') - continue - dimension_key = 'process_{0}_{1}'.format(subnode, service_name) - if dimension_key not in self.charts['process_' + subnode]: - self.charts['process_' + subnode].add_dimension([dimension_key, service_name, 'absolute']) - data[dimension_key] = int(subnode_value.text) - - if service_category == 'host': - subnode_value = service_node.find('./icmp/responsetime') - if subnode_value is None: - continue - dimension_key = 'host_latency_{0}'.format(service_name) - if dimension_key not in self.charts['host_latency']: - self.charts['host_latency'].add_dimension([dimension_key, service_name, - 'absolute', 1000, 1000000]) - data[dimension_key] = float(subnode_value.text) * 1000000 - - return data or None + xpath_query = "./service[@type='{0}']".format(typ.index) + self.debug('Searching for {0} as {1}'.format(typ.name, xpath_query)) + + for svc_root in root.findall(xpath_query): + svc = create_service(svc_root, typ) + self.debug('=> found {0} with type={1}, status={2}, monitoring={3}'.format( + svc.name, svc.type.name, svc.status, svc.monitor)) + + services.append(svc) + + return services + + def update_charts(self, services): + remove = [svc for svc in self.active_services if svc not in services] + add = [svc for svc in services if svc not in self.active_services] + + 
self.remove_services_from_charts(remove) + self.add_services_to_charts(add) + + self.active_services = services + + def add_services_to_charts(self, services): + for svc in services: + if svc.type == TYPE_HOST: + self.charts['host_latency'].add_dimension([svc.latency_key(), svc.name, 'absolute', 1000, 1000000]) + if svc.type == TYPE_PROCESS: + self.charts['process_uptime'].add_dimension([svc.uptime_key(), svc.name]) + self.charts['process_threads'].add_dimension([svc.threads_key(), svc.name]) + self.charts['process_children'].add_dimension([svc.children_key(), svc.name]) + self.charts[svc.type.name].add_dimension([svc.key(), svc.name]) + + def remove_services_from_charts(self, services): + for svc in services: + if svc.type == TYPE_HOST: + self.charts['host_latency'].del_dimension(svc.latency_key(), False) + if svc.type == TYPE_PROCESS: + self.charts['process_uptime'].del_dimension(svc.uptime_key(), False) + self.charts['process_threads'].del_dimension(svc.threads_key(), False) + self.charts['process_children'].del_dimension(svc.children_key(), False) + self.charts[svc.type.name].del_dimension(svc.key(), False) + + +def create_service(root, typ): + if typ == TYPE_HOST: + return create_host_service(root) + elif typ == TYPE_PROCESS: + return create_process_service(root) + return create_base_service(root, typ) + + +def create_host_service(root): + svc = HostMonitService( + TYPE_HOST, + root.find('name').text, + root.find('status').text, + root.find('monitor').text, + ) + + latency = root.find('./icmp/responsetime') + if latency is not None: + svc.latency = latency.text + + return svc + + +def create_process_service(root): + svc = ProcessMonitService( + TYPE_PROCESS, + root.find('name').text, + root.find('status').text, + root.find('monitor').text, + ) + + uptime = root.find('uptime') + if uptime is not None: + svc.uptime = uptime.text + + threads = root.find('threads') + if threads is not None: + svc.threads = threads.text + + children = root.find('children') + if 
children is not None: + svc.children = children.text + + return svc + + +def create_base_service(root, typ): + return BaseMonitService( + typ, + root.find('name').text, + root.find('status').text, + root.find('monitor').text, + ) diff --git a/collectors/python.d.plugin/mysql/README.md b/collectors/python.d.plugin/mysql/README.md index eba9d7a2..f7028ab6 100644 --- a/collectors/python.d.plugin/mysql/README.md +++ b/collectors/python.d.plugin/mysql/README.md @@ -218,6 +218,24 @@ It will produce following charts (if data is available): 45. **Flow Control** in ms * paused +46. **Users CPU time** in percentage + * users + +**Per user statistics:** + +1. **Rows Operations** in operations/s + * read + * send + * updated + * inserted + * deleted + +2. **Commands** in commands/s + * select + * update + * other + + ### configuration You can provide, per server, the following: @@ -234,7 +252,7 @@ You can provide, per server, the following: - ca: the path name of the Certificate Authority (CA) certificate file. This option, if used, must specify the same certificate used by the server. - capath: the path name of the directory that contains trusted SSL CA certificate files. - cipher: the list of permitted ciphers for SSL encryption. - + Here is an example for 3 servers: ```yaml @@ -260,6 +278,8 @@ remote: If no configuration is given, module will attempt to connect to mysql server via unix socket at `/var/run/mysqld/mysqld.sock` without password and with username `root` +`userstats` graph works only if you enable such plugin in MariaDB server and set proper mysql priviliges (SUPER or PROCESS). 
For more detail please check [MariaDB User Statistics page](https://mariadb.com/kb/en/library/user-statistics/) + --- [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fcollectors%2Fpython.d.plugin%2Fmysql%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() diff --git a/collectors/python.d.plugin/mysql/mysql.chart.py b/collectors/python.d.plugin/mysql/mysql.chart.py index 139fac15..82bd9079 100644 --- a/collectors/python.d.plugin/mysql/mysql.chart.py +++ b/collectors/python.d.plugin/mysql/mysql.chart.py @@ -11,6 +11,7 @@ from bases.FrameworkServices.MySQLService import MySQLService QUERY_GLOBAL = 'SHOW GLOBAL STATUS;' QUERY_SLAVE = 'SHOW SLAVE STATUS;' QUERY_VARIABLES = 'SHOW GLOBAL VARIABLES LIKE \'max_connections\';' +QUERY_USER_STATISTICS = 'SHOW USER_STATISTICS;' GLOBAL_STATS = [ 'Bytes_received', @@ -90,6 +91,7 @@ GLOBAL_STATS = [ 'Innodb_buffer_pool_write_requests', 'Innodb_buffer_pool_reads', 'Innodb_buffer_pool_wait_free', + 'Innodb_deadlocks', 'Qcache_hits', 'Qcache_lowmem_prunes', 'Qcache_inserts', @@ -149,6 +151,18 @@ SLAVE_STATS = [ ('Slave_IO_Running', slave_running) ] +USER_STATISTICS = [ + 'Select_commands', + 'Update_commands', + 'Other_commands', + 'Cpu_time', + 'Rows_read', + 'Rows_sent', + 'Rows_deleted', + 'Rows_inserted', + 'Rows_updated' +] + VARIABLES = [ 'max_connections' ] @@ -178,6 +192,7 @@ ORDER = [ 'innodb_os_log_fsync_writes', 'innodb_os_log_io', 'innodb_cur_row_lock', + 'innodb_deadlocks', 'innodb_rows', 'innodb_buffer_pool_pages', 'innodb_buffer_pool_flush_pages_requests', @@ -200,7 +215,8 @@ ORDER = [ 'galera_bytes', 'galera_queue', 'galera_conflicts', - 'galera_flow_control' + 'galera_flow_control', + 'userstats_cpu' ] CHARTS = { @@ -382,6 +398,13 @@ CHARTS = { ['Innodb_row_lock_current_waits', 'current_waits', 'absolute'] ] }, + 'innodb_deadlocks': { + 'options': [None, 
'InnoDB Deadlocks', 'operations/s', 'innodb', + 'mysql.innodb_deadlocks', 'area'], + 'lines': [ + ['Innodb_deadlocks', 'deadlocks', 'incremental'] + ] + }, 'innodb_rows': { 'options': [None, 'InnoDB Row Operations', 'operations/s', 'innodb', 'mysql.innodb_rows', 'area'], 'lines': [ @@ -570,10 +593,45 @@ CHARTS = { 'lines': [ ['wsrep_flow_control_paused_ns', 'paused', 'incremental', 1, 1000000], ] + }, + 'userstats_cpu': { + 'options': [None, 'Users CPU time', 'percentage', 'userstats', 'mysql.userstats_cpu', 'stacked'], + 'lines': [] } } +def userstats_chart_template(name): + order = [ + 'userstats_rows_{0}'.format(name), + 'userstats_commands_{0}'.format(name) + ] + family = 'userstats {0}'.format(name) + + charts = { + order[0]: { + 'options': [None, 'Rows Operations', 'operations/s', family, 'mysql.userstats_rows', 'stacked'], + 'lines': [ + ['userstats_{0}_Rows_read'.format(name), 'read', 'incremental'], + ['userstats_{0}_Rows_send'.format(name), 'send', 'incremental'], + ['userstats_{0}_Rows_updated'.format(name), 'updated', 'incremental'], + ['userstats_{0}_Rows_inserted'.format(name), 'inserted', 'incremental'], + ['userstats_{0}_Rows_deleted'.format(name), 'deleted', 'incremental'] + ] + }, + order[1]: { + 'options': [None, 'Commands', 'commands/s', family, 'mysql.userstats_commands', 'stacked'], + 'lines': [ + ['userstats_{0}_Select_commands'.format(name), 'select', 'incremental'], + ['userstats_{0}_Update_commands'.format(name), 'update', 'incremental'], + ['userstats_{0}_Other_commands'.format(name), 'other', 'incremental'] + ] + } + } + + return order, charts + + class Service(MySQLService): def __init__(self, configuration=None, name=None): MySQLService.__init__(self, configuration=configuration, name=name) @@ -583,6 +641,7 @@ class Service(MySQLService): global_status=QUERY_GLOBAL, slave_status=QUERY_SLAVE, variables=QUERY_VARIABLES, + user_statistics=QUERY_USER_STATISTICS, ) def _get_data(self): @@ -612,6 +671,12 @@ class Service(MySQLService): else: 
self.queries.pop('slave_status') + if 'user_statistics' in raw_data: + if raw_data['user_statistics'][0]: + to_netdata.update(self.get_userstats(raw_data)) + else: + self.queries.pop('user_statistics') + if 'variables' in raw_data: variables = dict(raw_data['variables'][0]) for key in VARIABLES: @@ -619,3 +684,70 @@ class Service(MySQLService): to_netdata[key] = variables[key] return to_netdata or None + + # raw_data['user_statistics'] contains the following data structure: + # ( + # ( + # ('netdata', 42L, 0L, 1264L, 3.111252999999968, 2.968510299999994, 110267L, 19741424L, 0L, 0L, 1265L, 0L, + # 0L, 0L, 3L, 0L, 1301L, 0L, 0L, 7633L, 0L, 83L, 44L, 0L, 0L), + # ('root', 60L, 0L, 184L, 0.22856499999999966, 0.1601419999999998, 11605L, 1516513L, 0L, 9L, 220L, 0L, 2L, 1L, + # 6L, 4L,127L, 0L, 0L, 45L, 0L, 45L, 0L, 0L, 0L) + # ), + # ( + # ('User', 253, 9, 128, 128, 0, 0), + # ('Total_connections', 3, 2, 11, 11, 0, 0), + # ('Concurrent_connections', 3, 1, 11, 11, 0, 0), + # ('Connected_time', 3, 4, 11, 11, 0, 0), + # ('Busy_time', 5, 21, 21, 21, 31, 0), + # ('Cpu_time', 5, 18, 21, 21, 31, 0), + # ('Bytes_received', 8, 6, 21, 21, 0, 0), + # ('Bytes_sent', 8, 8, 21, 21, 0, 0), + # ('Binlog_bytes_written', 8, 1, 21, 21, 0, 0), + # ('Rows_read', 8, 1, 21, 21, 0, 0), + # ('Rows_sent', 8, 4, 21, 21, 0, 0), + # ('Rows_deleted', 8, 1, 21, 21, 0, 0), + # ('Rows_inserted', 8, 1, 21, 21, 0, 0), + # ('Rows_updated', 8, 1, 21, 21, 0, 0), + # ('Select_commands', 8, 1, 21, 21, 0, 0), + # ('Update_commands', 8, 1, 21, 21, 0, 0), + # ('Other_commands', 8, 4, 21, 21, 0, 0), + # ('Commit_transactions', 8, 1, 21, 21, 0, 0), + # ('Rollback_transactions', 8, 1, 21, 21, 0, 0), + # ('Denied_connections', 8, 4, 21, 21, 0, 0), + # ('Lost_connections', 8, 1, 21, 21, 0, 0), + # ('Access_denied', 8, 2, 21, 21, 0, 0), + # ('Empty_queries', 8, 2, 21, 21, 0, 0), + # ('Total_ssl_connections', 8, 1, 21, 21, 0, 0), + # ('Max_statement_time_exceeded', 8, 1, 21, 21, 0, 0)), + # ) + def get_userstats(self, 
raw_data): + data = dict() + userstats_vars = [e[0] for e in raw_data['user_statistics'][1]] + for i, _ in enumerate(raw_data['user_statistics'][0]): + user_name = raw_data['user_statistics'][0][i][0] + userstats = dict(zip(userstats_vars, raw_data['user_statistics'][0][i])) + + if len(self.charts) > 0: + if ('userstats_{0}_Cpu_time'.format(user_name)) not in self.charts['userstats_cpu']: + self.add_userstats_dimensions(user_name) + self.create_new_userstats_charts(user_name) + + for key in USER_STATISTICS: + if key in userstats: + data['userstats_{0}_{1}'.format(user_name, key)] = userstats[key] + + return data + + def add_userstats_dimensions(self, name): + self.charts['userstats_cpu'].add_dimension(['userstats_{0}_Cpu_time'.format(name), name, 'incremental', 100, 1]) + + def create_new_userstats_charts(self, tube): + order, charts = userstats_chart_template(tube) + + for chart_name in order: + params = [chart_name] + charts[chart_name]['options'] + dimensions = charts[chart_name]['lines'] + + new_chart = self.charts.add_chart(params) + for dimension in dimensions: + new_chart.add_dimension(dimension) diff --git a/collectors/python.d.plugin/python.d.conf b/collectors/python.d.plugin/python.d.conf index 63eecbba..e2ee8eee 100644 --- a/collectors/python.d.plugin/python.d.conf +++ b/collectors/python.d.plugin/python.d.conf @@ -41,7 +41,7 @@ chrony: no # dockerd: yes # dovecot: yes # elasticsearch: yes -# energi: yes +# energid: yes # this is just an example example: no @@ -88,6 +88,7 @@ nginx_log: no # redis: yes # rethinkdbs: yes # retroshare: yes +# riakkv: yes # samba: yes # sensors: yes # smartd_log: yes @@ -101,4 +102,4 @@ unbound: no # uwsgi: yes # varnish: yes # w1sensor: yes -# web_log: yes
\ No newline at end of file +# web_log: yes diff --git a/collectors/python.d.plugin/python_modules/bases/FrameworkServices/UrlService.py b/collectors/python.d.plugin/python_modules/bases/FrameworkServices/UrlService.py index 43945665..b6f75bd5 100644 --- a/collectors/python.d.plugin/python_modules/bases/FrameworkServices/UrlService.py +++ b/collectors/python.d.plugin/python_modules/bases/FrameworkServices/UrlService.py @@ -6,6 +6,8 @@ import urllib3 +from distutils.version import StrictVersion as version + from bases.FrameworkServices.SimpleService import SimpleService try: @@ -14,9 +16,30 @@ except AttributeError: pass +# https://github.com/urllib3/urllib3/blob/master/CHANGES.rst#19-2014-07-04 +# New retry logic and urllib3.util.retry.Retry configuration object. (Issue https://github.com/urllib3/urllib3/pull/326) +URLLIB3_MIN_REQUIRED_VERSION = '1.9' +URLLIB3_VERSION = urllib3.__version__ +URLLIB3 = 'urllib3' + + +def version_check(): + if version(URLLIB3_VERSION) >= version(URLLIB3_MIN_REQUIRED_VERSION): + return + + err = '{0} version: {1}, minimum required version: {2}, please upgrade'.format( + URLLIB3, + URLLIB3_VERSION, + URLLIB3_MIN_REQUIRED_VERSION, + ) + raise Exception(err) + + class UrlService(SimpleService): def __init__(self, configuration=None, name=None): + version_check() SimpleService.__init__(self, configuration=configuration, name=name) + self.debug("{0} version: {1}".format(URLLIB3, URLLIB3_VERSION)) self.url = self.configuration.get('url') self.user = self.configuration.get('user') self.password = self.configuration.get('pass') diff --git a/collectors/python.d.plugin/riakkv/Makefile.inc b/collectors/python.d.plugin/riakkv/Makefile.inc new file mode 100644 index 00000000..87d29f82 --- /dev/null +++ b/collectors/python.d.plugin/riakkv/Makefile.inc @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-3.0-or-later + +# THIS IS NOT A COMPLETE Makefile +# IT IS INCLUDED BY ITS PARENT'S Makefile.am +# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE 
PARENT + +# install these files +dist_python_DATA += riakkv/riakkv.chart.py +dist_pythonconfig_DATA += riakkv/riakkv.conf + +# do not install these files, but include them in the distribution +dist_noinst_DATA += riakkv/README.md riakkv/Makefile.inc + diff --git a/collectors/python.d.plugin/riakkv/README.md b/collectors/python.d.plugin/riakkv/README.md new file mode 100644 index 00000000..0bcf22c5 --- /dev/null +++ b/collectors/python.d.plugin/riakkv/README.md @@ -0,0 +1,110 @@ +# riakkv + +Monitors one or more Riak KV servers. + +**Requirements:** + +* An accessible `/stats` endpoint. See [the Riak KV configuration reference] + documentation](https://docs.riak.com/riak/kv/2.2.3/configuring/reference/#client-interfaces) + for how to enable this. + +The following charts are included, which are mostly derived from the metrics +listed +[here](https://docs.riak.com/riak/kv/latest/using/reference/statistics-monitoring/index.html#riak-metrics-to-graph). + +1. **Throughput** in operations/s + * **KV operations** + * gets + * puts + + * **Data type updates** + * counters + * sets + * maps + + * **Search queries** + * queries + + * **Search documents** + * indexed + + * **Strong consistency operations** + * gets + * puts + +2. **Latency** in milliseconds + * **KV latency** of the past minute + * get (mean, median, 95th / 99th / 100th percentile) + * put (mean, median, 95th / 99th / 100th percentile) + + * **Data type latency** of the past minute + * counter_merge (mean, median, 95th / 99th / 100th percentile) + * set_merge (mean, median, 95th / 99th / 100th percentile) + * map_merge (mean, median, 95th / 99th / 100th percentile) + + * **Search latency** of the past minute + * query (median, min, max, 95th / 99th percentile) + * index (median, min, max, 95th / 99th percentile) + + * **Strong consistency latency** of the past minute + * get (mean, median, 95th / 99th / 100th percentile) + * put (mean, median, 95th / 99th / 100th percentile) + +3. 
**Erlang VM metrics** + * **System counters** + * processes + + * **Memory allocation** in MB + * processes.allocated + * processes.used + +4. **General load / health metrics** + * **Siblings encountered in KV operations** during the past minute + * get (mean, median, 95th / 99th / 100th percentile) + + * **Object size in KV operations** during the past minute in KB + * get (mean, median, 95th / 99th / 100th percentile) + + * **Message queue length** in unprocessed messages + * vnodeq_size (mean, median, 95th / 99th / 100th percentile) + + * **Index operations** encountered by Search + * errors + + * **Protocol buffer connections** + * active + + * **Repair operations coordinated by this node** + * read + + * **Active finite state machines by kind** + * get + * put + * secondary_index + * list_keys + + * **Rejected finite state machines** + * get + * put + + * **Number of writes to Search failed due to bad data format by reason** + * bad_entry + * extract_fail + + +### configuration + +The module needs to be passed the full URL to Riak's stats endpoint. +For example: + +```yaml +myriak: + url: http://myriak.example.com:8098/stats +``` + +With no explicit configuration given, the module will attempt to connect to +`http://localhost:8098/stats`. + +The default update frequency for the plugin is set to 2 seconds as Riak +internally updates the metrics every second. If we were to update the metrics +every second, the resulting graph would contain odd jitter. 
diff --git a/collectors/python.d.plugin/riakkv/riakkv.chart.py b/collectors/python.d.plugin/riakkv/riakkv.chart.py new file mode 100644 index 00000000..f81e177a --- /dev/null +++ b/collectors/python.d.plugin/riakkv/riakkv.chart.py @@ -0,0 +1,315 @@ +# -*- coding: utf-8 -*- +# Description: riak netdata python.d module +# +# See also: +# https://docs.riak.com/riak/kv/latest/using/reference/statistics-monitoring/index.html + +from json import loads + +from bases.FrameworkServices.UrlService import UrlService + +# Riak updates the metrics at the /stats endpoint every 1 second. +# If we use `update_every = 1` here, that means we might get weird jitter in the graph, +# so the default is set to 2 seconds to prevent it. +update_every = 2 + +# charts order (can be overridden if you want less charts, or different order) +ORDER = [ + # Throughput metrics + # https://docs.riak.com/riak/kv/latest/using/reference/statistics-monitoring/index.html#throughput-metrics + # Collected in totals. + "kv.node_operations", # K/V node operations. + "dt.vnode_updates", # Data type vnode updates. + "search.queries", # Search queries on the node. + "search.documents", # Documents indexed by Search. + "consistent.operations", # Consistent node operations. + + # Latency metrics + # https://docs.riak.com/riak/kv/latest/using/reference/statistics-monitoring/index.html#throughput-metrics + # Collected for the past minute in milliseconds, + # returned from riak in microseconds. + "kv.latency.get", # K/V GET FSM traversal latency. + "kv.latency.put", # K/V PUT FSM traversal latency. + "dt.latency.counter", # Update Counter Data type latency. + "dt.latency.set", # Update Set Data type latency. + "dt.latency.map", # Update Map Data type latency. + "search.latency.query", # Search query latency. + "search.latency.index", # Time it takes for search to index a new document. + "consistent.latency.get", # Strong consistent read latency. + "consistent.latency.put", # Strong consistent write latency. 
+ + # Erlang resource usage metrics + # https://docs.riak.com/riak/kv/latest/using/reference/statistics-monitoring/index.html#erlang-resource-usage-metrics + # Processes collected as a gauge, + # memory collected as Megabytes, returned as bytes from Riak. + "vm.processes", # Number of processes currently running in the Erlang VM. + "vm.memory.processes", # Total amount of memory allocated & used for Erlang processes. + + # General Riak Load / Health metrics + # https://docs.riak.com/riak/kv/latest/using/reference/statistics-monitoring/index.html#general-riak-load-health-metrics + # The following are collected by Riak over the past minute: + "kv.siblings_encountered.get", # Siblings encountered during GET operations by this node. + "kv.objsize.get", # Object size encountered by this node. + "search.vnodeq_size", # Number of unprocessed messages in the vnode message queues (Search). + # The following are calculated in total, or as gauges: + "search.index_errors", # Errors of the search subsystem while indexing documents. + "core.pbc", # Number of currently active protocol buffer connections. + "core.repairs", # Total read repair operations coordinated by this node. + "core.fsm_active", # Active finite state machines by kind. + "core.fsm_rejected", # Rejected finite state machines by kind. + + # General Riak Search Load / Health metrics + # https://docs.riak.com/riak/kv/latest/using/reference/statistics-monitoring/index.html#general-riak-search-load-health-metrics + # Reported as counters. + "search.errors", # Write and read errors of the Search subsystem. 
+] + +CHARTS = { + # Throughput metrics + "kv.node_operations": { + "options": [None, "Reads & writes coordinated by this node", "operations/s", "throughput", "riak.kv.throughput", "line"], + "lines": [ + ["node_gets_total", "gets", "incremental"], + ["node_puts_total", "puts", "incremental"] + ] + }, + "dt.vnode_updates": { + "options": [None, "Update operations coordinated by local vnodes by data type", "operations/s", "throughput", "riak.dt.vnode_updates", "line"], + "lines": [ + ["vnode_counter_update_total", "counters", "incremental"], + ["vnode_set_update_total", "sets", "incremental"], + ["vnode_map_update_total", "maps", "incremental"], + ] + }, + "search.queries": { + "options": [None, "Search queries on the node", "queries/s", "throughput", "riak.search", "line"], + "lines": [ + ["search_query_throughput_count", "queries", "incremental"] + ] + }, + "search.documents": { + "options": [None, "Documents indexed by search", "documents/s", "throughput", "riak.search.documents", "line"], + "lines": [ + ["search_index_throughput_count", "indexed", "incremental"] + ] + }, + "consistent.operations": { + "options": [None, "Consistent node operations", "operations/s", "throughput", "riak.consistent.operations", "line"], + "lines": [ + ["consistent_gets_total", "gets", "incremental"], + ["consistent_puts_total", "puts", "incremental"], + ] + }, + + # Latency metrics + "kv.latency.get": { + "options": [None, "Time between reception of a client GET request and subsequent response to client", "ms", "latency", "riak.kv.latency.get", "line"], + "lines": [ + ["node_get_fsm_time_mean", "mean", "absolute", 1, 1000], + ["node_get_fsm_time_median", "median", "absolute", 1, 1000], + ["node_get_fsm_time_95", "95", "absolute", 1, 1000], + ["node_get_fsm_time_99", "99", "absolute", 1, 1000], + ["node_get_fsm_time_100", "100", "absolute", 1, 1000], + ] + }, + "kv.latency.put": { + "options": [None, "Time between reception of a client PUT request and subsequent response to client", 
"ms", "latency", "riak.kv.latency.put", "line"], + "lines": [ + ["node_put_fsm_time_mean", "mean", "absolute", 1, 1000], + ["node_put_fsm_time_median", "median", "absolute", 1, 1000], + ["node_put_fsm_time_95", "95", "absolute", 1, 1000], + ["node_put_fsm_time_99", "99", "absolute", 1, 1000], + ["node_put_fsm_time_100", "100", "absolute", 1, 1000], + ] + }, + "dt.latency.counter": { + "options": [None, "Time it takes to perform an Update Counter operation", "ms", "latency", "riak.dt.latency.counter_merge", "line"], + "lines": [ + ["object_counter_merge_time_mean", "mean", "absolute", 1, 1000], + ["object_counter_merge_time_median", "median", "absolute", 1, 1000], + ["object_counter_merge_time_95", "95", "absolute", 1, 1000], + ["object_counter_merge_time_99", "99", "absolute", 1, 1000], + ["object_counter_merge_time_100", "100", "absolute", 1, 1000], + ] + }, + "dt.latency.set": { + "options": [None, "Time it takes to perform an Update Set operation", "ms", "latency", "riak.dt.latency.set_merge", "line"], + "lines": [ + ["object_set_merge_time_mean", "mean", "absolute", 1, 1000], + ["object_set_merge_time_median", "median", "absolute", 1, 1000], + ["object_set_merge_time_95", "95", "absolute", 1, 1000], + ["object_set_merge_time_99", "99", "absolute", 1, 1000], + ["object_set_merge_time_100", "100", "absolute", 1, 1000], + ] + }, + "dt.latency.map": { + "options": [None, "Time it takes to perform an Update Map operation", "ms", "latency", "riak.dt.latency.map_merge", "line"], + "lines": [ + ["object_map_merge_time_mean", "mean", "absolute", 1, 1000], + ["object_map_merge_time_median", "median", "absolute", 1, 1000], + ["object_map_merge_time_95", "95", "absolute", 1, 1000], + ["object_map_merge_time_99", "99", "absolute", 1, 1000], + ["object_map_merge_time_100", "100", "absolute", 1, 1000], + ] + }, + "search.latency.query": { + "options": [None, "Search query latency", "ms", "latency", "riak.search.latency.query", "line"], + "lines": [ + 
["search_query_latency_median", "median", "absolute", 1, 1000], + ["search_query_latency_min", "min", "absolute", 1, 1000], + ["search_query_latency_95", "95", "absolute", 1, 1000], + ["search_query_latency_99", "99", "absolute", 1, 1000], + ["search_query_latency_999", "999", "absolute", 1, 1000], + ["search_query_latency_max", "max", "absolute", 1, 1000], + ] + }, + "search.latency.index": { + "options": [None, "Time it takes Search to index a new document", "ms", "latency", "riak.search.latency.index", "line"], + "lines": [ + ["search_index_latency_median", "median", "absolute", 1, 1000], + ["search_index_latency_min", "min", "absolute", 1, 1000], + ["search_index_latency_95", "95", "absolute", 1, 1000], + ["search_index_latency_99", "99", "absolute", 1, 1000], + ["search_index_latency_999", "999", "absolute", 1, 1000], + ["search_index_latency_max", "max", "absolute", 1, 1000], + ] + }, + + # Riak Strong Consistency metrics + "consistent.latency.get": { + "options": [None, "Strongly consistent read latency", "ms", "latency", "riak.consistent.latency.get", "line"], + "lines": [ + ["consistent_get_time_mean", "mean", "absolute", 1, 1000], + ["consistent_get_time_median", "median", "absolute", 1, 1000], + ["consistent_get_time_95", "95", "absolute", 1, 1000], + ["consistent_get_time_99", "99", "absolute", 1, 1000], + ["consistent_get_time_100", "100", "absolute", 1, 1000], + ] + }, + "consistent.latency.put": { + "options": [None, "Strongly consistent write latency", "ms", "latency", "riak.consistent.latency.put", "line"], + "lines": [ + ["consistent_put_time_mean", "mean", "absolute", 1, 1000], + ["consistent_put_time_median", "median", "absolute", 1, 1000], + ["consistent_put_time_95", "95", "absolute", 1, 1000], + ["consistent_put_time_99", "99", "absolute", 1, 1000], + ["consistent_put_time_100", "100", "absolute", 1, 1000], + ] + }, + + # BEAM metrics + "vm.processes": { + "options": [None, "Total processes running in the Erlang VM", "total", "vm", "riak.vm", 
"line"], + "lines": [ + ["sys_process_count", "processes", "absolute"], + ] + }, + "vm.memory.processes": { + "options": [None, "Memory allocated & used by Erlang processes", "MB", "vm", "riak.vm.memory.processes", "line"], + "lines": [ + ["memory_processes", "allocated", "absolute", 1, 1024 * 1024], + ["memory_processes_used", "used", "absolute", 1, 1024 * 1024] + ] + }, + + # General Riak Load/Health metrics + "kv.siblings_encountered.get": { + "options": [None, "Number of siblings encountered during GET operations by this node during the past minute", "siblings", "load", "riak.kv.siblings_encountered.get", "line"], + "lines": [ + ["node_get_fsm_siblings_mean", "mean", "absolute"], + ["node_get_fsm_siblings_median", "median", "absolute"], + ["node_get_fsm_siblings_95", "95", "absolute"], + ["node_get_fsm_siblings_99", "99", "absolute"], + ["node_get_fsm_siblings_100", "100", "absolute"], + ] + }, + "kv.objsize.get": { + "options": [None, "Object size encountered by this node during the past minute", "KB", "load", "riak.kv.objsize.get", "line"], + "lines": [ + ["node_get_fsm_objsize_mean", "mean", "absolute", 1, 1024], + ["node_get_fsm_objsize_median", "median", "absolute", 1, 1024], + ["node_get_fsm_objsize_95", "95", "absolute", 1, 1024], + ["node_get_fsm_objsize_99", "99", "absolute", 1, 1024], + ["node_get_fsm_objsize_100", "100", "absolute", 1, 1024], + ] + }, + "search.vnodeq_size": { + "options": [None, "Number of unprocessed messages in the vnode message queues of Search on this node in the past minute", "messages", "load", "riak.search.vnodeq_size", "line"], + "lines": [ + ["riak_search_vnodeq_mean", "mean", "absolute"], + ["riak_search_vnodeq_median", "median", "absolute"], + ["riak_search_vnodeq_95", "95", "absolute"], + ["riak_search_vnodeq_99", "99", "absolute"], + ["riak_search_vnodeq_100", "100", "absolute"], + ] + }, + "search.index_errors": { + "options": [None, "Number of document index errors encountered by Search", "errors", "load", 
"riak.search.index", "line"], + "lines": [ + ["search_index_fail_count", "errors", "absolute"] + ] + }, + "core.pbc": { + "options": [None, "Protocol buffer connections by status", "connections", "load", "riak.core.protobuf_connections", "line"], + "lines": [ + ["pbc_active", "active", "absolute"], + # ["pbc_connects", "established_pastmin", "absolute"] + ] + }, + "core.repairs": { + "options": [None, "Number of repair operations this node has coordinated", "repairs", "load", "riak.core.repairs", "line"], + "lines": [ + ["read_repairs", "read", "absolute"] + ] + }, + "core.fsm_active": { + "options": [None, "Active finite state machines by kind", "fsms", "load", "riak.core.fsm_active", "line"], + "lines": [ + ["node_get_fsm_active", "get", "absolute"], + ["node_put_fsm_active", "put", "absolute"], + ["index_fsm_active", "secondary index", "absolute"], + ["list_fsm_active", "list keys", "absolute"] + ] + }, + "core.fsm_rejected": { + # Writing "Sidejob's" here seems to cause some weird issues: it results in this chart being rendered in + # its own context and additionally, moves the entire Riak graph all the way up to the top of the Netdata + # dashboard for some reason. 
+ "options": [None, "Finite state machines being rejected by Sidejobs overload protection", "fsms", "load", "riak.core.fsm_rejected", "line"], + "lines": [ + ["node_get_fsm_rejected", "get", "absolute"], + ["node_put_fsm_rejected", "put", "absolute"] + ] + }, + + # General Riak Search Load / Health metrics + "search.errors": { + "options": [None, "Number of writes to Search failed due to bad data format by reason", "writes", "load", "riak.search.index", "line"], + "lines": [ + ["search_index_bad_entry_count", "bad_entry", "absolute"], + ["search_index_extract_fail_count", "extract_fail", "absolute"], + ] + } +} + + +class Service(UrlService): + def __init__(self, configuration=None, name=None): + UrlService.__init__(self, configuration=configuration, name=name) + self.order = ORDER + self.definitions = CHARTS + + def _get_data(self): + """ + Format data received from http request + :return: dict + """ + raw = self._get_raw_data() + if not raw: + return None + + try: + return loads(raw) + except (TypeError, ValueError) as err: + self.error(err) + return None diff --git a/collectors/python.d.plugin/riakkv/riakkv.conf b/collectors/python.d.plugin/riakkv/riakkv.conf new file mode 100644 index 00000000..be01c48a --- /dev/null +++ b/collectors/python.d.plugin/riakkv/riakkv.conf @@ -0,0 +1,68 @@ +# netdata python.d.plugin configuration for riak +# +# This file is in YaML format. Generally the format is: +# +# name: value +# +# There are 2 sections: +# - global variables +# - one or more JOBS +# +# JOBS allow you to collect values from multiple sources. +# Each source will have its own set of charts. +# +# JOB parameters have to be indented (using spaces only, example below). + +# ---------------------------------------------------------------------- +# Global Variables +# These variables set the defaults for all JOBs, however each JOB +# may define its own, overriding the defaults. + +# update_every sets the default data collection frequency. 
+# If unset, the python.d.plugin default is used. +# update_every: 1 + +# priority controls the order of charts at the netdata dashboard. +# Lower numbers move the charts towards the top of the page. +# If unset, the default for python.d.plugin is used. +# priority: 60000 + +# penalty indicates whether to apply penalty to update_every in case of failures. +# Penalty will increase every 5 failed updates in a row. Maximum penalty is 10 minutes. +# penalty: yes + +# autodetection_retry sets the job re-check interval in seconds. +# The job is not deleted if check fails. +# Attempts to start the job are made once every autodetection_retry. +# This feature is disabled by default. +# autodetection_retry: 0 + +# ---------------------------------------------------------------------- +# JOBS (data collection sources) +# +# The default JOBS share the same *name*. JOBS with the same name +# are mutually exclusive. Only one of them will be allowed running at +# any time. This allows autodetection to try several alternatives and +# pick the one that works. +# +# Any number of jobs is supported. +# +# All python.d.plugin JOBS (for all its modules) support a set of +# predefined parameters. 
These are: +# +# job_name: +# name: myname # the JOB's name as it will appear at the +# # dashboard (by default is the job_name) +# # JOBs sharing a name are mutually exclusive +# update_every: 1 # the JOB's data collection frequency +# priority: 60000 # the JOB's order on the dashboard +# penalty: yes # the JOB's penalty +# autodetection_retry: 0 # the JOB's re-check interval in seconds +# +# +# ---------------------------------------------------------------------- +# AUTO-DETECTION JOBS +# only one of them will run (they have the same name) + +local: + url : 'http://localhost:8098/stats' diff --git a/collectors/python.d.plugin/smartd_log/README.md b/collectors/python.d.plugin/smartd_log/README.md index 3b0816fb..f6584be7 100644 --- a/collectors/python.d.plugin/smartd_log/README.md +++ b/collectors/python.d.plugin/smartd_log/README.md @@ -85,7 +85,11 @@ For this you need to set `smartd_opts` (or `SMARTD_ARGS`, check _smartd.service_ # dump smartd attrs info every 600 seconds smartd_opts="-A /var/log/smartd/ -i 600" ``` - +You may need to create the smartd directory before smartd will write to it: +``` +mkdir -p /var/log/smartd +``` +Otherwise, all the smartd `.csv` files may get written to `/var/lib/smartmontools` (default location). See also [https://linux.die.net/man/8/smartd](https://linux.die.net/man/8/smartd) for more info on the `-A --attributelog=PREFIX` command. `smartd` appends logs at every run. It's strongly recommended to use `logrotate` for smartd files. 
diff --git a/collectors/python.d.plugin/tomcat/tomcat.chart.py b/collectors/python.d.plugin/tomcat/tomcat.chart.py index 01578c56..ab300330 100644 --- a/collectors/python.d.plugin/tomcat/tomcat.chart.py +++ b/collectors/python.d.plugin/tomcat/tomcat.chart.py @@ -5,11 +5,17 @@ # SPDX-License-Identifier: GPL-3.0-or-later import xml.etree.ElementTree as ET +import re from bases.FrameworkServices.UrlService import UrlService MiB = 1 << 20 +# Regex fix for Tomcat single quote XML attributes +# affecting Tomcat < 8.5.24 & 9.0.2 running with Java > 9 +# cf. https://bz.apache.org/bugzilla/show_bug.cgi?id=61603 +single_quote_regex = re.compile(r"='([^']+)'([^']+)''") + ORDER = [ 'accesses', 'bandwidth', @@ -95,6 +101,32 @@ class Service(UrlService): self.definitions = CHARTS self.url = self.configuration.get('url', 'http://127.0.0.1:8080/manager/status?XML=true') self.connector_name = self.configuration.get('connector_name', None) + self.parse = self.xml_parse + + def xml_parse(self, data): + try: + return ET.fromstring(data) + except ET.ParseError: + self.debug('%s is not a valid XML page. Please add "?XML=true" to tomcat status page.' % self.url) + return None + + def xml_single_quote_fix_parse(self, data): + data = single_quote_regex.sub(r"='\g<1>\g<2>'", data) + return self.xml_parse(data) + + def check(self): + self._manager = self._build_manager() + + raw_data = self._get_raw_data() + if not raw_data: + return False + + if single_quote_regex.search(raw_data): + self.warning('Tomcat status page is returning invalid single quote XML, please consider upgrading ' + 'your Tomcat installation. See https://bz.apache.org/bugzilla/show_bug.cgi?id=61603') + self.parse = self.xml_single_quote_fix_parse + + return self.parse(raw_data) is not None def _get_data(self): """ @@ -104,11 +136,10 @@ class Service(UrlService): data = None raw_data = self._get_raw_data() if raw_data: - try: - xml = ET.fromstring(raw_data) - except ET.ParseError: - self.debug('%s is not a vaild XML page. 
Please add "?XML=true" to tomcat status page.' % self.url) + xml = self.parse(raw_data) + if xml is None: return None + data = {} jvm = xml.find('jvm') @@ -153,7 +184,7 @@ class Service(UrlService): data['metaspace_committed'] = pool.get('usageCommitted') data['metaspace_max'] = pool.get('usageMax') - if connector: + if connector is not None: thread_info = connector.find('threadInfo') data['currentThreadsBusy'] = thread_info.get('currentThreadsBusy') data['currentThreadCount'] = thread_info.get('currentThreadCount') diff --git a/collectors/python.d.plugin/varnish/varnish.chart.py b/collectors/python.d.plugin/varnish/varnish.chart.py index 70af50cc..58745e24 100644 --- a/collectors/python.d.plugin/varnish/varnish.chart.py +++ b/collectors/python.d.plugin/varnish/varnish.chart.py @@ -5,9 +5,8 @@ import re -from bases.collection import find_binary from bases.FrameworkServices.ExecutableService import ExecutableService - +from bases.collection import find_binary ORDER = [ 'session_connections', @@ -138,6 +137,18 @@ CHARTS = { VARNISHSTAT = 'varnishstat' +re_version = re.compile(r'varnish-(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)') + + +class VarnishVersion: + def __init__(self, major, minor, patch): + self.major = major + self.minor = minor + self.patch = patch + + def __str__(self): + return '{0}.{1}.{2}'.format(self.major, self.minor, self.patch) + class Parser: _backend_new = re.compile(r'VBE.([\d\w_.]+)\(.*?\).(beresp[\w_]+)\s+(\d+)') @@ -185,10 +196,32 @@ class Service(ExecutableService): self.error("can't locate '{0}' binary or binary is not executable by user netdata".format(VARNISHSTAT)) return False + command = [varnishstat, '-V'] + reply = self._get_raw_data(stderr=True, command=command) + if not reply: + self.error( + "no output from '{0}'. Is varnish running? 
Not enough privileges?".format(' '.join(command))) + return False + + ver = parse_varnish_version(reply) + if not ver: + self.error("failed to parse reply from '{0}', used regex :'{1}', reply : {2}".format( + ' '.join(command), + re_version.pattern, + reply, + )) + return False + if self.instance_name: - self.command = [varnishstat, '-1', '-n', self.instance_name, '-t', '1'] + self.command = [varnishstat, '-1', '-n', self.instance_name] else: - self.command = [varnishstat, '-1', '-t', '1'] + self.command = [varnishstat, '-1'] + + if ver.major > 4: + self.command.extend(['-t', '1']) + + self.info("varnish version: {0}, will use command: '{1}'".format(ver, ' '.join(self.command))) + return True def check(self): @@ -198,14 +231,14 @@ class Service(ExecutableService): # STDOUT is not empty reply = self._get_raw_data() if not reply: - self.error("No output from 'varnishstat'. Is it running? Not enough privileges?") + self.error("no output from '{0}'. Is it running? Not enough privileges?".format(' '.join(self.command))) return False self.parser.init(reply) # Output is parsable if not self.parser.re_default: - self.error('Cant parse the output...') + self.error('cant parse the output...') return False if self.parser.re_backend: @@ -260,3 +293,16 @@ class Service(ExecutableService): self.order.insert(0, chart_name) self.definitions.update(chart) + + +def parse_varnish_version(lines): + m = re_version.search(lines[0]) + if not m: + return None + + m = m.groupdict() + return VarnishVersion( + int(m['major']), + int(m['minor']), + int(m['patch']), + ) diff --git a/collectors/python.d.plugin/web_log/web_log.chart.py b/collectors/python.d.plugin/web_log/web_log.chart.py index 6d6a261c..fa5a8bc3 100644 --- a/collectors/python.d.plugin/web_log/web_log.chart.py +++ b/collectors/python.d.plugin/web_log/web_log.chart.py @@ -4,9 +4,8 @@ # SPDX-License-Identifier: GPL-3.0-or-later import bisect -import re import os - +import re from collections import namedtuple, defaultdict 
from copy import deepcopy @@ -660,7 +659,7 @@ class Web: r' (?P<bytes_sent>\d+)' r' (?P<resp_length>\d+)' r' (?P<resp_time>\d+\.\d+)' - r' (?P<resp_time_upstream>[\d.-]+) ') + r' (?P<resp_time_upstream>[\d.-]+)') nginx_ext_append = re.compile(r'(?P<address>[\da-f.:]+)' r' -.*?"(?P<request>[^"]*)"' diff --git a/collectors/tc.plugin/README.md b/collectors/tc.plugin/README.md index b54e8085..e71944e3 100644 --- a/collectors/tc.plugin/README.md +++ b/collectors/tc.plugin/README.md @@ -191,6 +191,7 @@ Add the following configuration option in `/etc/netdata.conf`: Finally, create `/etc/netdata/tc-qos-helper.conf` with this content: ```tc_show="class"``` +Please note that, by default, Netdata enables monitoring of metrics only when they are not zero. If they are constantly zero, they are ignored. Metrics that start having values after Netdata is started are detected, and their charts are automatically added to the dashboard (a refresh of the dashboard is needed for them to appear, though). Set `yes` for a chart instead of `auto` to enable it permanently. 
[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fcollectors%2Ftc.plugin%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() diff --git a/configs.signatures b/configs.signatures index afc8dbe5..b0ded05e 100644 --- a/configs.signatures +++ b/configs.signatures @@ -381,6 +381,7 @@ declare -A configs_signatures=( ['7deb236ec68a512b9bdd18e6a51d76f7']='python.d/mysql.conf' ['7e5fc1644aa7a54f9dbb1bd102521b09']='health.d/memcached.conf' ['7f13631183fbdf79c21c8e5a171e9b34']='health.d/zfs.conf' + ['ce285c90747428ee5da4efb547418dda']='health.d/dbengine.conf' ['7fb8184d56a27040e73261ed9c6fc76f']='health_alarm_notify.conf' ['80266bddd3df374923c750a6de91d120']='health.d/apache.conf' ['803a7f9dcb942eeac0fd764b9e3e38ca']='fping.conf' diff --git a/configure.ac b/configure.ac index d5a6556c..b922ad5b 100644 --- a/configure.ac +++ b/configure.ac @@ -76,7 +76,12 @@ AC_ARG_ENABLE( , [enable_backend_kinesis="detect"] ) - +AC_ARG_ENABLE( + [backend-prometheus-remote-write], + [AS_HELP_STRING([--enable-backend-prometheus-remote-write], [enable prometheus remote write backend @<:@default autodetect@:>@])], + , + [enable_backend_prometheus_remote_write="detect"] +) AC_ARG_ENABLE( [pedantic], [AS_HELP_STRING([--enable-pedantic], [enable pedantic compiler warnings @<:@default disabled@:>@])], @@ -132,12 +137,23 @@ AC_ARG_ENABLE( [enable_lto="detect"] ) AC_ARG_ENABLE( + [https], + [AS_HELP_STRING([--enable-https], [Enable SSL support @<:@default autodetect@:>@])], + , + [enable_https="detect"] +) +AC_ARG_ENABLE( [dbengine], [AS_HELP_STRING([--disable-dbengine], [disable netdata dbengine @<:@default autodetect@:>@])], , [enable_dbengine="detect"] ) - +AC_ARG_ENABLE( + [jsonc], + [AS_HELP_STRING([--enable-jsonc], [Enable JSON-C support @<:@default autodetect@:>@])], + , + [enable_jsonc="detect"] +) # 
----------------------------------------------------------------------------- # netdata required checks @@ -258,7 +274,7 @@ AC_CHECK_LIB( [UV_LIBS="-luv"] ) -OPTIONAL_UV_CLFAGS="${UV_CFLAGS}" +OPTIONAL_UV_CFLAGS="${UV_CFLAGS}" OPTIONAL_UV_LIBS="${UV_LIBS}" @@ -271,7 +287,7 @@ AC_CHECK_LIB( [LZ4_LIBS="-llz4"] ) -OPTIONAL_LZ4_CLFAGS="${LZ4_CFLAGS}" +OPTIONAL_LZ4_CFLAGS="${LZ4_CFLAGS}" OPTIONAL_LZ4_LIBS="${LZ4_LIBS}" @@ -284,7 +300,7 @@ AC_CHECK_LIB( [JUDY_LIBS="-lJudy"] ) -OPTIONAL_JUDY_CLFAGS="${JUDY_CFLAGS}" +OPTIONAL_JUDY_CFLAGS="${JUDY_CFLAGS}" OPTIONAL_JUDY_LIBS="${JUDY_LIBS}" @@ -334,11 +350,25 @@ AC_CHECK_LIB( [SSL_LIBS="-lcrypto -lssl"] ) -OPTIONAL_SSL_CLFAGS="${SSL_CFLAGS}" +OPTIONAL_SSL_CFLAGS="${SSL_CFLAGS}" OPTIONAL_SSL_LIBS="${SSL_LIBS}" # ----------------------------------------------------------------------------- -# DB engine +# JSON-C library + +PKG_CHECK_MODULES([JSON],[json-c],AC_CHECK_LIB( + [json-c], + [json_object_get_type], + [JSONC_LIBS="-ljson-c"]),AC_CHECK_LIB( + [json], + [json_object_get_type], + [JSONC_LIBS="-ljson"]) + ) + +OPTIONAL_JSONC_LIBS="${JSONC_LIBS}" + +# ----------------------------------------------------------------------------- +# DB engine and HTTPS test "${enable_dbengine}" = "yes" -a -z "${UV_LIBS}" && \ AC_MSG_ERROR([libuv required but not found. Try installing 'libuv1-dev' or 'libuv-devel'.]) @@ -348,7 +378,7 @@ test "${enable_dbengine}" = "yes" -a -z "${LZ4_LIBS}" && \ test "${enable_dbengine}" = "yes" -a -z "${JUDY_LIBS}" && \ AC_MSG_ERROR([libJudy required but not found. Try installing 'libjudy-dev' or 'Judy-devel'.]) -test "${enable_dbengine}" = "yes" -a -z "${SSL_LIBS}" && \ +{ test "${enable_dbengine}" = "yes" || test "${enable_https}" = "yes"; } && test -z "${SSL_LIBS}" && \ AC_MSG_ERROR([OpenSSL required but not found. 
Try installing 'libssl-dev' or 'openssl-devel'.]) AC_MSG_CHECKING([if netdata dbengine should be used]) @@ -361,6 +391,30 @@ fi AC_MSG_RESULT([${enable_dbengine}]) AM_CONDITIONAL([ENABLE_DBENGINE], [test "${enable_dbengine}" = "yes"]) +AC_MSG_CHECKING([if netdata https should be used]) +if test "${enable_https}" != "no" -a "${SSL_LIBS}"; then + enable_https="yes" + AC_DEFINE([ENABLE_HTTPS], [1], [netdata HTTPS usability]) +else + enable_https="no" +fi +AC_MSG_RESULT([${enable_https}]) +AM_CONDITIONAL([ENABLE_HTTPS], [test "${enable_https}" = "yes"]) + +# ----------------------------------------------------------------------------- +# JSON-C +test "${enable_jsonc}" = "yes" -a -z "${JSONC_LIBS}" && \ + AC_MSG_ERROR([JSON-C required but not found. Try installing 'libjson-c-dev' or 'json-c'.]) + +AC_MSG_CHECKING([if json-c should be used]) +if test "${enable_jsonc}" != "no" -a "${JSONC_LIBS}"; then + enable_jsonc="yes" + AC_DEFINE([ENABLE_JSONC], [1], [netdata json-c usability]) +else + enable_jsonc="no" +fi +AC_MSG_RESULT([${enable_jsonc}]) +AM_CONDITIONAL([ENABLE_JSONC], [test "${enable_jsonc}" = "yes"]) # ----------------------------------------------------------------------------- # compiler options @@ -553,7 +607,16 @@ AM_CONDITIONAL([ENABLE_PLUGIN_CUPS], [test "${enable_plugin_cups}" = "yes"]) # ----------------------------------------------------------------------------- # nfacct.plugin - libmnl, libnetfilter_acct -AC_CHECK_HEADERS_ONCE([linux/netfilter/nfnetlink_conntrack.h]) +AC_CHECK_HEADER( + [linux/netfilter/nfnetlink_conntrack.h], + [AC_CHECK_DECL( + [CTA_STATS_MAX], + [have_nfnetlink_conntrack=yes], + [have_nfnetlink_conntrack=no], + [#include <linux/netfilter/nfnetlink_conntrack.h>] + )], + [have_nfnetlink_conntrack=no] +) PKG_CHECK_MODULES( [NFACCT], @@ -579,6 +642,9 @@ PKG_CHECK_MODULES( [have_libmnl=no] ) +test "${enable_plugin_nfacct}" = "yes" -a "${have_nfnetlink_conntrack}" != "yes" && \ + AC_MSG_ERROR([nfnetlink_conntrack.h required but not found 
or too old]) + test "${enable_plugin_nfacct}" = "yes" -a "${have_libnetfilter_acct}" != "yes" && \ AC_MSG_ERROR([netfilter_acct required but not found]) @@ -586,7 +652,9 @@ test "${enable_plugin_nfacct}" = "yes" -a "${have_libmnl}" != "yes" && \ AC_MSG_ERROR([libmnl required but not found. Try installing 'libmnl-dev' or 'libmnl-devel']) AC_MSG_CHECKING([if nfacct.plugin should be enabled]) -if test "${enable_plugin_nfacct}" != "no" -a "${have_libnetfilter_acct}" = "yes" -a "${have_libmnl}" = "yes"; then +if test "${enable_plugin_nfacct}" != "no" -a "${have_libnetfilter_acct}" = "yes" \ + -a "${have_libmnl}" = "yes" \ + -a "${have_nfnetlink_conntrack}" = "yes"; then enable_plugin_nfacct="yes" AC_DEFINE([HAVE_LIBMNL], [1], [libmnl usability]) AC_DEFINE([HAVE_LIBNETFILTER_ACCT], [1], [libnetfilter_acct usability]) @@ -667,6 +735,30 @@ AM_CONDITIONAL([ENABLE_PLUGIN_XENSTAT], [test "${enable_plugin_xenstat}" = "yes" # ----------------------------------------------------------------------------- +# perf.plugin + +AC_CHECK_HEADER( + [linux/perf_event.h], + [AC_CHECK_DECL( + [PERF_COUNT_HW_REF_CPU_CYCLES], + [have_perf_event=yes], + [have_perf_event=no], + [#include <linux/perf_event.h>] + )], + [have_perf_event=no] +) + +AC_MSG_CHECKING([if perf.plugin should be enabled]) +if test "${build_target}" == "linux" -a "${have_perf_event}" = "yes"; then + enable_plugin_perf="yes" +else + enable_plugin_perf="no" +fi +AC_MSG_RESULT([${enable_plugin_perf}]) +AM_CONDITIONAL([ENABLE_PLUGIN_PERF], [test "${enable_plugin_perf}" = "yes"]) + + +# ----------------------------------------------------------------------------- # AWS Kinesis backend - libaws-cpp-sdk-kinesis, libaws-cpp-sdk-core, libssl, libcrypto, libcurl PKG_CHECK_MODULES( @@ -778,6 +870,65 @@ AM_CONDITIONAL([ENABLE_BACKEND_KINESIS], [test "${enable_backend_kinesis}" = "ye # ----------------------------------------------------------------------------- +# Prometheus remote write backend - libprotobuf, libsnappy, protoc + 
+PKG_CHECK_MODULES( + [PROTOBUF], + [protobuf], + [have_libprotobuf=yes], + [have_libprotobuf=no] +) + +PKG_CHECK_MODULES( + [SNAPPY], + [snappy], + [have_libsnappy=yes], + [have_libsnappy=no] +) + +AC_PATH_PROG([PROTOC], [protoc], [no]) +AS_IF( + [test x"${PROTOC}" = x"no"], + [have_protoc=no], + [have_protoc=yes] +) + +AC_PATH_PROG([CXX_BINARY], [${CXX}], [no]) +AS_IF( + [test x"${CXX_BINARY}" = x"no"], + [have_CXX_compiler=no], + [have_CXX_compiler=yes] +) + +test "${enable_backend_prometheus_remote_write}" = "yes" -a "${have_libprotobuf}" != "yes" && \ + AC_MSG_ERROR([libprotobuf required but not found. try installing protobuf]) + +test "${enable_backend_prometheus_remote_write}" = "yes" -a "${have_libsnappy}" != "yes" && \ + AC_MSG_ERROR([libsnappy required but not found. try installing snappy]) + +test "${enable_backend_prometheus_remote_write}" = "yes" -a "${have_protoc}" != "yes" && \ + AC_MSG_ERROR([protoc compiler required but not found. try installing protobuf]) + +test "${enable_backend_prometheus_remote_write}" = "yes" -a "${have_CXX_compiler}" != "yes" && \ + AC_MSG_ERROR([C++ compiler required but not found. 
try installing g++]) + +AC_MSG_CHECKING([if prometheus remote write backend should be enabled]) +if test "${enable_backend_prometheus_remote_write}" != "no" -a "${have_libprotobuf}" = "yes" -a "${have_libsnappy}" = "yes" \ + -a "${have_protoc}" = "yes" -a "${have_CXX_compiler}" = "yes"; then + enable_backend_prometheus_remote_write="yes" + AC_DEFINE([ENABLE_PROMETHEUS_REMOTE_WRITE], [1], [Prometheus remote write API usability]) + OPTIONAL_PROMETHEUS_REMOTE_WRITE_CFLAGS="${PROTOBUF_CFLAGS} ${SNAPPY_CFLAGS}" + CXX11FLAG="-std=c++11" + OPTIONAL_PROMETHEUS_REMOTE_WRITE_LIBS="${PROTOBUF_LIBS} ${SNAPPY_LIBS} " +else + enable_backend_prometheus_remote_write="no" +fi + +AC_MSG_RESULT([${enable_backend_prometheus_remote_write}]) +AM_CONDITIONAL([ENABLE_BACKEND_PROMETHEUS_REMOTE_WRITE], [test "${enable_backend_prometheus_remote_write}" = "yes"]) + + +# ----------------------------------------------------------------------------- # check for setns() - cgroup-network AC_CHECK_FUNC([setns]) @@ -801,7 +952,7 @@ if test "${enable_lto}" != "no"; then fi if test "${have_lto}" = "yes"; then oCFLAGS="${CFLAGS}" - CFLAGS="${CFLAGS} -flto ${OPTIONAL_MATH_CFLAGS} ${OPTIONAL_NFACCT_CFLAGS} ${OPTIONAL_ZLIB_CFLAGS} ${OPTIONAL_UUID_CFLAGS} ${OPTIONAL_LIBCAP_CFLAGS} ${OPTIONAL_IPMIMONITORING_CFLAGS} ${OPTIONAL_CUPS_CFLAGS} ${OPTIONAL_XENSTAT_FLAGS} ${OPTIONAL_KINESIS_CFLAGS}" + CFLAGS="${CFLAGS} -flto ${OPTIONAL_MATH_CFLAGS} ${OPTIONAL_NFACCT_CFLAGS} ${OPTIONAL_ZLIB_CFLAGS} ${OPTIONAL_UUID_CFLAGS} ${OPTIONAL_LIBCAP_CFLAGS} ${OPTIONAL_IPMIMONITORING_CFLAGS} ${OPTIONAL_CUPS_CFLAGS} ${OPTIONAL_XENSTAT_FLAGS} ${OPTIONAL_KINESIS_CFLAGS} ${OPTIONAL_PROMETHEUS_REMOTE_WRITE_CFLAGS}" ac_cv_c_lto_cross_compile="${enable_lto}" test "${ac_cv_c_lto_cross_compile}" != "yes" && ac_cv_c_lto_cross_compile="no" AC_C_LTO @@ -822,6 +973,8 @@ AC_MSG_RESULT([${enable_lto}]) # ----------------------------------------------------------------------------- +AM_CONDITIONAL([ENABLE_CXX_LINKER], [test 
"${enable_backend_kinesis}" = "yes" -o "${enable_backend_prometheus_remote_write}" = "yes"]) + AC_DEFINE_UNQUOTED([NETDATA_USER], ["${with_user}"], [use this user to drop privileged]) varlibdir="${localstatedir}/lib/netdata" @@ -864,12 +1017,12 @@ CPPFLAGS="\ AC_SUBST([OPTIONAL_MATH_CFLAGS]) AC_SUBST([OPTIONAL_MATH_LIBS]) -AC_SUBST([OPTIONAL_RT_CLFAGS]) AC_SUBST([OPTIONAL_UV_LIBS]) AC_SUBST([OPTIONAL_LZ4_LIBS]) AC_SUBST([OPTIONAL_JUDY_LIBS]) AC_SUBST([OPTIONAL_SSL_LIBS]) -AC_SUBST([OPTIONAL_NFACCT_CLFAGS]) +AC_SUBST([OPTIONAL_JSONC_LIBS]) +AC_SUBST([OPTIONAL_NFACCT_CFLAGS]) AC_SUBST([OPTIONAL_NFACCT_LIBS]) AC_SUBST([OPTIONAL_ZLIB_CFLAGS]) AC_SUBST([OPTIONAL_ZLIB_LIBS]) @@ -885,6 +1038,8 @@ AC_SUBST([OPTIONAL_XENSTAT_CFLAGS]) AC_SUBST([OPTIONAL_XENSTAT_LIBS]) AC_SUBST([OPTIONAL_KINESIS_CFLAGS]) AC_SUBST([OPTIONAL_KINESIS_LIBS]) +AC_SUBST([OPTIONAL_PROMETHEUS_REMOTE_WRITE_CFLAGS]) +AC_SUBST([OPTIONAL_PROMETHEUS_REMOTE_WRITE_LIBS]) AC_CONFIG_FILES([ @@ -895,6 +1050,7 @@ AC_CONFIG_FILES([ backends/Makefile backends/opentsdb/Makefile backends/prometheus/Makefile + backends/prometheus/remote_write/Makefile backends/aws_kinesis/Makefile collectors/Makefile collectors/apps.plugin/Makefile @@ -917,6 +1073,7 @@ AC_CONFIG_FILES([ collectors/statsd.plugin/Makefile collectors/tc.plugin/Makefile collectors/xenstat.plugin/Makefile + collectors/perf.plugin/Makefile daemon/Makefile database/Makefile database/engine/Makefile @@ -941,6 +1098,8 @@ AC_CONFIG_FILES([ libnetdata/storage_number/Makefile libnetdata/threads/Makefile libnetdata/url/Makefile + libnetdata/json/Makefile + libnetdata/health/Makefile registry/Makefile streaming/Makefile system/Makefile diff --git a/daemon/anonymous-statistics.sh.in b/daemon/anonymous-statistics.sh.in index 9f548ce9..7d73f6d6 100755 --- a/daemon/anonymous-statistics.sh.in +++ b/daemon/anonymous-statistics.sh.in @@ -22,6 +22,9 @@ if [ -f "@configdir_POST@/.opt-out-from-anonymous-statistics" ]; then exit 0 fi +# Shorten version for easier reporting 
+NETDATA_VERSION=$(echo "${NETDATA_VERSION}" | sed 's/-.*//g' | tr -d 'v') + echo "&av=${NETDATA_VERSION}\ &ec=${ACTION}\ &ea=${ACTION_RESULT}\ @@ -62,7 +65,7 @@ if [ -n "$(command -v curl 2>/dev/null)" ]; then --data-urlencode "cd2=${NETDATA_SYSTEM_OS_ID}" \ --data-urlencode "cd3=${NETDATA_SYSTEM_OS_ID_LIKE}" \ --data-urlencode "cd4=${NETDATA_SYSTEM_OS_VERSION}" \ - --data-urlencode "cd5=${NETDATA_SYSTEM_OS_DETECTION}" \ + --data-urlencode "cd5=${NETDATA_SYSTEM_OS_VERSION_ID}" \ --data-urlencode "cd6=${NETDATA_SYSTEM_OS_DETECTION}" \ --data-urlencode "cd7=${NETDATA_SYSTEM_KERNEL_NAME}" \ --data-urlencode "cd8=${NETDATA_SYSTEM_KERNEL_VERSION}" \ diff --git a/daemon/common.h b/daemon/common.h index 9a55fa3a..a15ddb87 100644 --- a/daemon/common.h +++ b/daemon/common.h @@ -14,6 +14,7 @@ #define config_get_float(section, name, value) appconfig_get_float(&netdata_config, section, name, value) #define config_get_boolean(section, name, value) appconfig_get_boolean(&netdata_config, section, name, value) #define config_get_boolean_ondemand(section, name, value) appconfig_get_boolean_ondemand(&netdata_config, section, name, value) +#define config_get_duration(section, name, value) appconfig_get_duration(&netdata_config, section, name, value) #define config_set(section, name, default_value) appconfig_set(&netdata_config, section, name, default_value) #define config_set_default(section, name, value) appconfig_set_default(&netdata_config, section, name, value) diff --git a/daemon/config/README.md b/daemon/config/README.md index 0508a19d..4778cad2 100644 --- a/daemon/config/README.md +++ b/daemon/config/README.md @@ -62,7 +62,7 @@ memory deduplication (ksm) | `yes` | When set to `yes`, netdata will offer its i TZ environment variable | `:/etc/localtime` | Where to find the timezone timezone | auto-detected | The timezone retrieved from the environment variable debug flags | `0x0000000000000000` | Bitmap of debug options to enable. 
For more information check [Tracing Options](../#debugging). -debug log | `/var/log/netdata/debug.log` | The filename to save debug information. This file will not be created is debugging is not enabled. You can also set it to `syslog` to send the debug messages to syslog, or `none` to disable this log. For more information check [Tracing Options](../#debugging). +debug log | `/var/log/netdata/debug.log` | The filename to save debug information. This file will not be created if debugging is not enabled. You can also set it to `syslog` to send the debug messages to syslog, or `none` to disable this log. For more information check [Tracing Options](../#debugging). error log | `/var/log/netdata/error.log` | The filename to save error messages for netdata daemon and all plugins (`stderr` is sent here for all netdata programs, including the plugins). You can also set it to `syslog` to send the errors to syslog, or `none` to disable this log. access log | `/var/log/netdata/access.log` | The filename to save the log of web clients accessing netdata charts. You can also set it to `syslog` to send the access log to syslog, or `none` to disable this log. errors flood protection period | `1200` | UNUSED - Length of period (in sec) during which the number of errors should not exceed the `errors to trigger flood protection`. @@ -105,12 +105,12 @@ setting | default | info :------:|:-------:|:---- enabled | `yes` | Set to `no` to disable all alarms and notifications in memory max health log entries | 1000 | Size of the alarm history held in RAM -script to execute on alarm | `/usr/libexec/netdata/plugins.d/alarm-notify.sh` | The script that sends alarm notifications. +script to execute on alarm | `/usr/libexec/netdata/plugins.d/alarm-notify.sh` | The script that sends alarm notifications. Note that in versions before 1.16, the plugins.d directory may be installed in a different location in certain OSs (e.g. under `/usr/lib/netdata`). 
stock health configuration directory | `/usr/lib/netdata/conf.d/health.d` | Contains the stock alarm configuration files for each collector health configuration directory | `/etc/netdata/health.d` | The directory containing the user alarm configuration files, to override the stock configurations run at least every seconds | `10` | Controls how often all alarm conditions should be evaluated. postpone alarms during hibernation for seconds | `60` | Prevents false alarms. May need to be increased if you get alarms during hibernation. -rotate log every lines | 2000 | Controls the number of alarm log entries stored in `<lib directory>/health-log.db`, where <lib directory> is the one configured in the [[global] section](#global-section-options) +rotate log every lines | 2000 | Controls the number of alarm log entries stored in `<lib directory>/health-log.db`, where `<lib directory>` is the one configured in the [[global] section](#global-section-options) ### [registry] section options @@ -128,6 +128,8 @@ The configuration options for plugins appear in sections following the pattern ` Most internal plugins will provide additional options. Check [Internal Plugins](../../collectors/) for more information. +Please note, that by default Netdata will enable monitoring metrics for disks, memory, and network only when they are not zero. If they are constantly zero they are ignored. Metrics that will start having values, after netdata is started, will be detected and charts will be automatically added to the dashboard (a refresh of the dashboard is needed for them to appear though). Use `yes` instead of `auto` in plugin configuration sections to enable these charts permanently. + #### External plugins External plugins will have only 2 options at `netdata.conf`: @@ -141,6 +143,6 @@ External plugins that need additional configuration may support a dedicated file ### Per chart configuration -In this section you will a separate subsection for each chart shown on the dashboard. 
You can control all aspects of a specific chart here. You can understand what each option does by reading [how charts are defined](../../collectors/plugins.d/#chart). If you don't know how to find the name of a chart, you can learn about it [here](../../docs/Charts.md). +In this section you will find a separate subsection for each chart shown on the dashboard. You can control all aspects of a specific chart here. You can understand what each option does by reading [how charts are defined](../../collectors/plugins.d/#chart). If you don't know how to find the name of a chart, you can learn about it [here](../../docs/Charts.md). [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fdaemon%2Fconfig%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() diff --git a/daemon/global_statistics.c b/daemon/global_statistics.c index 9cc05abb..53b7546f 100644 --- a/daemon/global_statistics.c +++ b/daemon/global_statistics.c @@ -535,10 +535,10 @@ void global_statistics_charts(void) { #ifdef ENABLE_DBENGINE if (localhost->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) { - unsigned long long stats_array[27]; + unsigned long long stats_array[RRDENG_NR_STATS]; /* get localhost's DB engine's statistics */ - rrdeng_get_27_statistics(localhost->rrdeng_ctx, stats_array); + rrdeng_get_33_statistics(localhost->rrdeng_ctx, stats_array); // ---------------------------------------------------------------- @@ -637,6 +637,7 @@ void global_statistics_charts(void) { { static RRDSET *st_pg_cache_pages = NULL; + static RRDDIM *rd_descriptors = NULL; static RRDDIM *rd_populated = NULL; static RRDDIM *rd_commited = NULL; static RRDDIM *rd_insertions = NULL; @@ -660,6 +661,7 @@ void global_statistics_charts(void) { , RRDSET_TYPE_LINE ); + rd_descriptors = rrddim_add(st_pg_cache_pages, "descriptors", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); rd_populated = 
rrddim_add(st_pg_cache_pages, "populated", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); rd_commited = rrddim_add(st_pg_cache_pages, "commited", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); rd_insertions = rrddim_add(st_pg_cache_pages, "insertions", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); @@ -670,6 +672,7 @@ void global_statistics_charts(void) { else rrdset_next(st_pg_cache_pages); + rrddim_set_by_pointer(st_pg_cache_pages, rd_descriptors, (collected_number)stats_array[27]); rrddim_set_by_pointer(st_pg_cache_pages, rd_populated, (collected_number)stats_array[3]); rrddim_set_by_pointer(st_pg_cache_pages, rd_commited, (collected_number)stats_array[4]); rrddim_set_by_pointer(st_pg_cache_pages, rd_insertions, (collected_number)stats_array[5]); @@ -746,6 +749,75 @@ void global_statistics_charts(void) { rrddim_set_by_pointer(st_io_stats, rd_writes, (collected_number)stats_array[16]); rrdset_done(st_io_stats); } + + // ---------------------------------------------------------------- + + { + static RRDSET *st_errors = NULL; + static RRDDIM *rd_fs_errors = NULL; + static RRDDIM *rd_io_errors = NULL; + + if (unlikely(!st_errors)) { + st_errors = rrdset_create_localhost( + "netdata" + , "dbengine_global_errors" + , NULL + , "dbengine" + , NULL + , "NetData DB engine errors" + , "errors/s" + , "netdata" + , "stats" + , 130507 + , localhost->rrd_update_every + , RRDSET_TYPE_LINE + ); + + rd_io_errors = rrddim_add(st_errors, "I/O errors", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); + rd_fs_errors = rrddim_add(st_errors, "FS errors", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); + } + else + rrdset_next(st_errors); + + rrddim_set_by_pointer(st_errors, rd_io_errors, (collected_number)stats_array[30]); + rrddim_set_by_pointer(st_errors, rd_fs_errors, (collected_number)stats_array[31]); + rrdset_done(st_errors); + } + + // ---------------------------------------------------------------- + + { + static RRDSET *st_fd = NULL; + static RRDDIM *rd_fd_current = NULL; + static RRDDIM *rd_fd_max = NULL; + + if 
(unlikely(!st_fd)) { + st_fd = rrdset_create_localhost( + "netdata" + , "dbengine_global_file_descriptors" + , NULL + , "dbengine" + , NULL + , "NetData DB engine File Descriptors" + , "descriptors" + , "netdata" + , "stats" + , 130508 + , localhost->rrd_update_every + , RRDSET_TYPE_LINE + ); + + rd_fd_current = rrddim_add(st_fd, "current", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + rd_fd_max = rrddim_add(st_fd, "max", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + } + else + rrdset_next(st_fd); + + rrddim_set_by_pointer(st_fd, rd_fd_current, (collected_number)stats_array[32]); + /* Careful here, modify this accordingly if the File-Descriptor budget ever changes */ + rrddim_set_by_pointer(st_fd, rd_fd_max, (collected_number)rlimit_nofile.rlim_cur / 4); + rrdset_done(st_fd); + } } #endif diff --git a/daemon/main.c b/daemon/main.c index a1577fb9..0ced9081 100644 --- a/daemon/main.c +++ b/daemon/main.c @@ -24,7 +24,7 @@ void netdata_cleanup_and_exit(int ret) { error_log_limit_unlimited(); info("EXIT: netdata prepares to exit with code %d...", ret); - send_statistics("EXIT", ret?"ERROR":"OK","-"); + send_statistics("EXIT", ret?"ERROR":"OK","-"); // cleanup/save the database and exit info("EXIT: cleaning up the database..."); @@ -49,6 +49,10 @@ void netdata_cleanup_and_exit(int ret) { error("EXIT: cannot unlink pidfile '%s'.", pidfile); } +#ifdef ENABLE_HTTPS + security_clean_openssl(); +#endif + info("EXIT: all done - netdata is now exiting - bye bye..."); exit(ret); } @@ -345,7 +349,20 @@ static const char *verify_required_directory(const char *dir) { return dir; } -void log_init(void) { +#ifdef ENABLE_HTTPS +static void security_init(){ + char filename[FILENAME_MAX + 1]; + snprintfz(filename, FILENAME_MAX, "%s/ssl/key.pem",netdata_configured_user_config_dir); + security_key = config_get(CONFIG_SECTION_WEB, "ssl key", filename); + + snprintfz(filename, FILENAME_MAX, "%s/ssl/cert.pem",netdata_configured_user_config_dir); + security_cert = config_get(CONFIG_SECTION_WEB, "ssl 
certificate", filename); + + security_openssl_library(); +} +#endif + +static void log_init(void) { char filename[FILENAME_MAX + 1]; snprintfz(filename, FILENAME_MAX, "%s/debug.log", netdata_configured_log_dir); stdout_filename = config_get(CONFIG_SECTION_GLOBAL, "debug log", filename); @@ -356,9 +373,9 @@ void log_init(void) { snprintfz(filename, FILENAME_MAX, "%s/access.log", netdata_configured_log_dir); stdaccess_filename = config_get(CONFIG_SECTION_GLOBAL, "access log", filename); - char deffacility[8]; - snprintfz(deffacility,7,"%s","daemon"); - facility_log = config_get(CONFIG_SECTION_GLOBAL, "facility log", deffacility); + char deffacility[8]; + snprintfz(deffacility,7,"%s","daemon"); + facility_log = config_get(CONFIG_SECTION_GLOBAL, "facility log", deffacility); error_log_throttle_period = config_get_number(CONFIG_SECTION_GLOBAL, "errors flood protection period", error_log_throttle_period); error_log_errors_per_period = (unsigned long)config_get_number(CONFIG_SECTION_GLOBAL, "errors to trigger flood protection", (long long int)error_log_errors_per_period); @@ -420,8 +437,9 @@ static void get_netdata_configured_variables() { // get the hostname char buf[HOSTNAME_MAX + 1]; - if(gethostname(buf, HOSTNAME_MAX) == -1) + if(gethostname(buf, HOSTNAME_MAX) == -1){ error("Cannot get machine hostname."); + } netdata_configured_hostname = config_get(CONFIG_SECTION_GLOBAL, "hostname", buf); debug(D_OPTIONS, "hostname set to '%s'", netdata_configured_hostname); @@ -724,20 +742,20 @@ void send_statistics( const char *action, const char *action_result, const char if (likely(access(optout_file, R_OK) != 0)) { as_script = mallocz(sizeof(char) * (strlen(netdata_configured_primary_plugins_dir) + strlen("anonymous-statistics.sh") + 2)); sprintf(as_script, "%s/%s", netdata_configured_primary_plugins_dir, "anonymous-statistics.sh"); - if (unlikely(access(as_script, R_OK) != 0)) { - netdata_anonymous_statistics_enabled=0; - info("Anonymous statistics script %s not 
found.",as_script); - freez(as_script); - } else { - netdata_anonymous_statistics_enabled=1; - } - } else { + if (unlikely(access(as_script, R_OK) != 0)) { + netdata_anonymous_statistics_enabled=0; + info("Anonymous statistics script %s not found.",as_script); + freez(as_script); + } else { + netdata_anonymous_statistics_enabled=1; + } + } else { netdata_anonymous_statistics_enabled = 0; as_script = NULL; } freez(optout_file); } - if(!netdata_anonymous_statistics_enabled) return; + if(!netdata_anonymous_statistics_enabled) return; if (!action) return; if (!action_result) action_result=""; if (!action_data) action_data=""; @@ -756,6 +774,12 @@ void send_statistics( const char *action, const char *action_result, const char freez(command_to_run); } +void set_silencers_filename() { + char filename[FILENAME_MAX + 1]; + snprintfz(filename, FILENAME_MAX, "%s/health.silencers.json", netdata_configured_varlib_dir); + silencers_filename = config_get(CONFIG_SECTION_HEALTH, "silencers file", filename); +} + int main(int argc, char **argv) { int i; int config_loaded = 0; @@ -881,12 +905,9 @@ int main(int argc, char **argv) { return 0; } else if(strncmp(optarg, createdataset_string, strlen(createdataset_string)) == 0) { - unsigned history_seconds; - optarg += strlen(createdataset_string); - history_seconds = (unsigned )strtoull(optarg, NULL, 0); - #ifdef ENABLE_DBENGINE + unsigned history_seconds = (unsigned )strtoull(optarg, NULL, 0); generate_dbengine_dataset(history_seconds); #endif return 0; @@ -1081,6 +1102,17 @@ int main(int argc, char **argv) { error_log_limit_unlimited(); // -------------------------------------------------------------------- + // get the certificate and start security +#ifdef ENABLE_HTTPS + security_init(); +#endif + + // -------------------------------------------------------------------- + // This is the safest place to start the SILENCERS structure + set_silencers_filename(); + health_initialize_global_silencers(); + + // 
-------------------------------------------------------------------- // setup process signals // block signals while initializing threads. @@ -1134,10 +1166,6 @@ int main(int argc, char **argv) { // initialize the log files open_all_log_files(); - netdata_anonymous_statistics_enabled=-1; - struct rrdhost_system_info *system_info = calloc(1, sizeof(struct rrdhost_system_info)); - if (get_system_info(system_info) == 0) send_statistics("START","-", "-"); - #ifdef NETDATA_INTERNAL_CHECKS if(debug_flags != 0) { struct rlimit rl = { RLIM_INFINITY, RLIM_INFINITY }; @@ -1171,8 +1199,11 @@ int main(int argc, char **argv) { // ------------------------------------------------------------------------ // initialize rrd, registry, health, rrdpush, etc. + netdata_anonymous_statistics_enabled=-1; + struct rrdhost_system_info *system_info = calloc(1, sizeof(struct rrdhost_system_info)); + get_system_info(system_info); + rrd_init(netdata_configured_hostname, system_info); - rrdhost_system_info_free(system_info); // ------------------------------------------------------------------------ // enable log flood protection @@ -1196,6 +1227,8 @@ int main(int argc, char **argv) { info("netdata initialization completed. Enjoy real-time performance monitoring!"); netdata_ready = 1; + + send_statistics("START", "-", "-"); // ------------------------------------------------------------------------ // unblock signals diff --git a/database/README.md b/database/README.md index dc40a3e4..de0aa9b5 100644 --- a/database/README.md +++ b/database/README.md @@ -87,7 +87,9 @@ server that will maintain the entire database for all nodes, and will also run h for all nodes. For this central netdata, memory size can be a problem. Fortunately, netdata supports several -memory modes. One interesting option for this setup is `memory mode = map`. +memory modes. **One interesting option** for this setup is `memory mode = map`. + +### map In this mode, the database of netdata is stored in memory mapped files. 
netdata continues to read and write the database in memory, but the kernel automatically loads and saves memory pages from/to @@ -162,8 +164,10 @@ vm.dirty_ratio = 90 vm.dirty_writeback_centisecs = 0 ``` -There is another memory mode to help overcome the memory size problem. What is most interesting -for this setup is `memory mode = dbengine`. +There is another memory mode to help overcome the memory size problem. What is **most interesting +for this setup** is `memory mode = dbengine`. + +### dbengine In this mode, the database of netdata is stored in database files. The [Database Engine](engine/) works like a traditional database. There is some amount of RAM dedicated to data caching and diff --git a/database/engine/README.md b/database/engine/README.md index 28a2528c..adc69ffd 100644 --- a/database/engine/README.md +++ b/database/engine/README.md @@ -96,9 +96,9 @@ There are explicit memory requirements **per** DB engine **instance**, meaning * - `page cache size` must be at least `#dimensions-being-collected x 4096 x 2` bytes. -- an additional `#pages-on-disk x 4096 x 0.06` bytes of RAM are allocated for metadata. +- an additional `#pages-on-disk x 4096 x 0.03` bytes of RAM are allocated for metadata. - - roughly speaking this is 6% of the uncompressed disk space taken by the DB files. + - roughly speaking this is 3% of the uncompressed disk space taken by the DB files. - for very highly compressible data (compression ratio > 90%) this RAM overhead is comparable to the disk space footprint. @@ -106,4 +106,40 @@ There are explicit memory requirements **per** DB engine **instance**, meaning * An important observation is that RAM usage depends on both the `page cache size` and the `dbengine disk space` options. +## File descriptor requirements + +The Database Engine may keep a **significant** amount of files open per instance (e.g. per streaming +slave or master server). 
When configuring your system you should make sure there are at least 50 +file descriptors available per `dbengine` instance. + +Netdata allocates 25% of the available file descriptors to its Database Engine instances. This means that only 25% +of the file descriptors that are available to the Netdata service are accessible by dbengine instances. +You should take that into account when configuring your service +or system-wide file descriptor limits. You can roughly estimate that the netdata service needs 2048 file +descriptors for every 10 streaming slave hosts when streaming is configured to use `memory mode = dbengine`. + +If for example one wants to allocate 65536 file descriptors to the netdata service on a systemd system +one needs to override the netdata service by running `sudo systemctl edit netdata` and creating a +file with contents: + +``` +[Service] +LimitNOFILE=65536 +``` + +For other types of services one can add the line: +``` +ulimit -n 65536 +``` +at the beginning of the service file. Alternatively you can change the system-wide limits of the kernel by changing `/etc/sysctl.conf`. For linux that would be: +``` +fs.file-max = 65536 +``` +In FreeBSD and OS X you change the lines like this: +``` +kern.maxfilesperproc=65536 +kern.maxfiles=65536 +``` +You can apply the settings by running `sysctl -p` or by rebooting. 
+ [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fdatabase%2Fengine%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() diff --git a/database/engine/datafile.c b/database/engine/datafile.c index 2d17d05e..8ef4ed59 100644 --- a/database/engine/datafile.c +++ b/database/engine/datafile.c @@ -49,44 +49,69 @@ static void datafile_init(struct rrdengine_datafile *datafile, struct rrdengine_ datafile->ctx = ctx; } -static void generate_datafilepath(struct rrdengine_datafile *datafile, char *str, size_t maxlen) +void generate_datafilepath(struct rrdengine_datafile *datafile, char *str, size_t maxlen) { (void) snprintf(str, maxlen, "%s/" DATAFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL DATAFILE_EXTENSION, datafile->ctx->dbfiles_path, datafile->tier, datafile->fileno); } +int close_data_file(struct rrdengine_datafile *datafile) +{ + struct rrdengine_instance *ctx = datafile->ctx; + uv_fs_t req; + int ret; + char path[RRDENG_PATH_MAX]; + + generate_datafilepath(datafile, path, sizeof(path)); + + ret = uv_fs_close(NULL, &req, datafile->file, NULL); + if (ret < 0) { + error("uv_fs_close(%s): %s", path, uv_strerror(ret)); + ++ctx->stats.fs_errors; + rrd_stat_atomic_add(&global_fs_errors, 1); + } + uv_fs_req_cleanup(&req); + + return ret; +} + + int destroy_data_file(struct rrdengine_datafile *datafile) { struct rrdengine_instance *ctx = datafile->ctx; uv_fs_t req; - int ret, fd; - char path[1024]; + int ret; + char path[RRDENG_PATH_MAX]; + + generate_datafilepath(datafile, path, sizeof(path)); ret = uv_fs_ftruncate(NULL, &req, datafile->file, 0, NULL); if (ret < 0) { - fatal("uv_fs_ftruncate: %s", uv_strerror(ret)); + error("uv_fs_ftruncate(%s): %s", path, uv_strerror(ret)); + ++ctx->stats.fs_errors; + rrd_stat_atomic_add(&global_fs_errors, 1); } - assert(0 == req.result); uv_fs_req_cleanup(&req); ret = uv_fs_close(NULL, 
&req, datafile->file, NULL); if (ret < 0) { - fatal("uv_fs_close: %s", uv_strerror(ret)); + error("uv_fs_close(%s): %s", path, uv_strerror(ret)); + ++ctx->stats.fs_errors; + rrd_stat_atomic_add(&global_fs_errors, 1); } - assert(0 == req.result); uv_fs_req_cleanup(&req); - generate_datafilepath(datafile, path, sizeof(path)); - fd = uv_fs_unlink(NULL, &req, path, NULL); - if (fd < 0) { - fatal("uv_fs_fsunlink: %s", uv_strerror(fd)); + ret = uv_fs_unlink(NULL, &req, path, NULL); + if (ret < 0) { + error("uv_fs_fsunlink(%s): %s", path, uv_strerror(ret)); + ++ctx->stats.fs_errors; + rrd_stat_atomic_add(&global_fs_errors, 1); } - assert(0 == req.result); uv_fs_req_cleanup(&req); ++ctx->stats.datafile_deletions; - return 0; + return ret; } int create_data_file(struct rrdengine_datafile *datafile) @@ -97,21 +122,17 @@ int create_data_file(struct rrdengine_datafile *datafile) int ret, fd; struct rrdeng_df_sb *superblock; uv_buf_t iov; - char path[1024]; + char path[RRDENG_PATH_MAX]; generate_datafilepath(datafile, path, sizeof(path)); - fd = uv_fs_open(NULL, &req, path, O_DIRECT | O_CREAT | O_RDWR | O_TRUNC, - S_IRUSR | S_IWUSR, NULL); + fd = open_file_direct_io(path, O_CREAT | O_RDWR | O_TRUNC, &file); if (fd < 0) { - fatal("uv_fs_fsopen: %s", uv_strerror(fd)); + ++ctx->stats.fs_errors; + rrd_stat_atomic_add(&global_fs_errors, 1); + return fd; } - assert(req.result >= 0); - file = req.result; - uv_fs_req_cleanup(&req); -#ifdef __APPLE__ - info("Disabling OS X caching for file \"%s\".", path); - fcntl(fd, F_NOCACHE, 1); -#endif + datafile->file = file; + ++ctx->stats.datafile_creations; ret = posix_memalign((void *)&superblock, RRDFILE_ALIGNMENT, sizeof(*superblock)); if (unlikely(ret)) { @@ -125,19 +146,21 @@ int create_data_file(struct rrdengine_datafile *datafile) ret = uv_fs_write(NULL, &req, file, &iov, 1, 0, NULL); if (ret < 0) { - fatal("uv_fs_write: %s", uv_strerror(ret)); - } - if (req.result < 0) { - fatal("uv_fs_write: %s", uv_strerror((int)req.result)); + 
assert(req.result < 0); + error("uv_fs_write: %s", uv_strerror(ret)); + ++ctx->stats.io_errors; + rrd_stat_atomic_add(&global_io_errors, 1); } uv_fs_req_cleanup(&req); free(superblock); + if (ret < 0) { + destroy_data_file(datafile); + return ret; + } - datafile->file = file; datafile->pos = sizeof(*superblock); ctx->stats.io_write_bytes += sizeof(*superblock); ++ctx->stats.io_write_requests; - ++ctx->stats.datafile_creations; return 0; } @@ -182,25 +205,17 @@ static int load_data_file(struct rrdengine_datafile *datafile) struct rrdengine_instance *ctx = datafile->ctx; uv_fs_t req; uv_file file; - int ret, fd; + int ret, fd, error; uint64_t file_size; - char path[1024]; + char path[RRDENG_PATH_MAX]; generate_datafilepath(datafile, path, sizeof(path)); - fd = uv_fs_open(NULL, &req, path, O_DIRECT | O_RDWR, S_IRUSR | S_IWUSR, NULL); + fd = open_file_direct_io(path, O_RDWR, &file); if (fd < 0) { - /* if (UV_ENOENT != fd) */ - error("uv_fs_fsopen: %s", uv_strerror(fd)); - uv_fs_req_cleanup(&req); + ++ctx->stats.fs_errors; + rrd_stat_atomic_add(&global_fs_errors, 1); return fd; } - assert(req.result >= 0); - file = req.result; - uv_fs_req_cleanup(&req); -#ifdef __APPLE__ - info("Disabling OS X caching for file \"%s\".", path); - fcntl(fd, F_NOCACHE, 1); -#endif info("Initializing data file \"%s\".", path); ret = check_file_properties(file, &file_size, sizeof(struct rrdeng_df_sb)); @@ -221,15 +236,21 @@ static int load_data_file(struct rrdengine_datafile *datafile) return 0; error: - (void) uv_fs_close(NULL, &req, file, NULL); + error = ret; + ret = uv_fs_close(NULL, &req, file, NULL); + if (ret < 0) { + error("uv_fs_close(%s): %s", path, uv_strerror(ret)); + ++ctx->stats.fs_errors; + rrd_stat_atomic_add(&global_fs_errors, 1); + } uv_fs_req_cleanup(&req); - return ret; + return error; } static int scan_data_files_cmp(const void *a, const void *b) { struct rrdengine_datafile *file1, *file2; - char path1[1024], path2[1024]; + char path1[RRDENG_PATH_MAX], 
path2[RRDENG_PATH_MAX]; file1 = *(struct rrdengine_datafile **)a; file2 = *(struct rrdengine_datafile **)b; @@ -238,7 +259,7 @@ static int scan_data_files_cmp(const void *a, const void *b) return strcmp(path1, path2); } -/* Returns number of datafiles that were loaded */ +/* Returns number of datafiles that were loaded or < 0 on error */ static int scan_data_files(struct rrdengine_instance *ctx) { int ret; @@ -249,16 +270,22 @@ static int scan_data_files(struct rrdengine_instance *ctx) struct rrdengine_journalfile *journalfile; ret = uv_fs_scandir(NULL, &req, ctx->dbfiles_path, 0, NULL); - assert(ret >= 0); - assert(req.result >= 0); + if (ret < 0) { + assert(req.result < 0); + uv_fs_req_cleanup(&req); + error("uv_fs_scandir(%s): %s", ctx->dbfiles_path, uv_strerror(ret)); + ++ctx->stats.fs_errors; + rrd_stat_atomic_add(&global_fs_errors, 1); + return ret; + } info("Found %d files in path %s", ret, ctx->dbfiles_path); datafiles = callocz(MIN(ret, MAX_DATAFILES), sizeof(*datafiles)); for (matched_files = 0 ; UV_EOF != uv_fs_scandir_next(&req, &dent) && matched_files < MAX_DATAFILES ; ) { - info("Scanning file \"%s\"", dent.name); + info("Scanning file \"%s/%s\"", ctx->dbfiles_path, dent.name); ret = sscanf(dent.name, DATAFILE_PREFIX RRDENG_FILE_NUMBER_SCAN_TMPL DATAFILE_EXTENSION, &tier, &no); if (2 == ret) { - info("Matched file \"%s\"", dent.name); + info("Matched file \"%s/%s\"", ctx->dbfiles_path, dent.name); datafile = mallocz(sizeof(*datafile)); datafile_init(datafile, ctx, tier, no); datafiles[matched_files++] = datafile; @@ -266,70 +293,133 @@ static int scan_data_files(struct rrdengine_instance *ctx) } uv_fs_req_cleanup(&req); + if (0 == matched_files) { + freez(datafiles); + return 0; + } if (matched_files == MAX_DATAFILES) { error("Warning: hit maximum database engine file limit of %d files", MAX_DATAFILES); } qsort(datafiles, matched_files, sizeof(*datafiles), scan_data_files_cmp); + /* TODO: change this when tiering is implemented */ + ctx->last_fileno = 
datafiles[matched_files - 1]->fileno; + for (failed_to_load = 0, i = 0 ; i < matched_files ; ++i) { datafile = datafiles[i]; ret = load_data_file(datafile); if (0 != ret) { - free(datafile); + freez(datafile); ++failed_to_load; - continue; + break; } journalfile = mallocz(sizeof(*journalfile)); datafile->journalfile = journalfile; journalfile_init(journalfile, datafile); ret = load_journal_file(ctx, journalfile, datafile); if (0 != ret) { - free(datafile); - free(journalfile); + close_data_file(datafile); + freez(datafile); + freez(journalfile); ++failed_to_load; - continue; + break; } datafile_list_insert(ctx, datafile); ctx->disk_space += datafile->pos + journalfile->pos; } + freez(datafiles); if (failed_to_load) { - error("%u files failed to load.", failed_to_load); + error("%u datafiles failed to load.", failed_to_load); + finalize_data_files(ctx); + return UV_EIO; } - free(datafiles); - return matched_files - failed_to_load; + return matched_files; } /* Creates a datafile and a journalfile pair */ -void create_new_datafile_pair(struct rrdengine_instance *ctx, unsigned tier, unsigned fileno) +int create_new_datafile_pair(struct rrdengine_instance *ctx, unsigned tier, unsigned fileno) { struct rrdengine_datafile *datafile; struct rrdengine_journalfile *journalfile; int ret; + char path[RRDENG_PATH_MAX]; - info("Creating new data and journal files."); + info("Creating new data and journal files in path %s", ctx->dbfiles_path); datafile = mallocz(sizeof(*datafile)); datafile_init(datafile, ctx, tier, fileno); ret = create_data_file(datafile); - assert(!ret); + if (!ret) { + generate_datafilepath(datafile, path, sizeof(path)); + info("Created data file \"%s\".", path); + } else { + goto error_after_datafile; + } journalfile = mallocz(sizeof(*journalfile)); datafile->journalfile = journalfile; journalfile_init(journalfile, datafile); ret = create_journal_file(journalfile, datafile); - assert(!ret); + if (!ret) { + generate_journalfilepath(datafile, path, 
sizeof(path)); + info("Created journal file \"%s\".", path); + } else { + goto error_after_journalfile; + } datafile_list_insert(ctx, datafile); ctx->disk_space += datafile->pos + journalfile->pos; + + return 0; + +error_after_journalfile: + destroy_data_file(datafile); + freez(journalfile); +error_after_datafile: + freez(datafile); + return ret; } -/* Page cache must already be initialized. */ +/* Page cache must already be initialized. + * Return 0 on success. + */ int init_data_files(struct rrdengine_instance *ctx) { int ret; ret = scan_data_files(ctx); - if (0 == ret) { - info("Data files not found, creating."); - create_new_datafile_pair(ctx, 1, 1); + if (ret < 0) { + error("Failed to scan path \"%s\".", ctx->dbfiles_path); + return ret; + } else if (0 == ret) { + info("Data files not found, creating in path \"%s\".", ctx->dbfiles_path); + ret = create_new_datafile_pair(ctx, 1, 1); + if (ret) { + error("Failed to create data and journal files in path \"%s\".", ctx->dbfiles_path); + return ret; + } + ctx->last_fileno = 1; } + return 0; +} + +void finalize_data_files(struct rrdengine_instance *ctx) +{ + struct rrdengine_datafile *datafile, *next_datafile; + struct rrdengine_journalfile *journalfile; + struct extent_info *extent, *next_extent; + + for (datafile = ctx->datafiles.first ; datafile != NULL ; datafile = next_datafile) { + journalfile = datafile->journalfile; + next_datafile = datafile->next; + + for (extent = datafile->extents.first ; extent != NULL ; extent = next_extent) { + next_extent = extent->next; + freez(extent); + } + close_journal_file(journalfile, datafile); + close_data_file(datafile); + freez(journalfile); + freez(datafile); + + } }
\ No newline at end of file diff --git a/database/engine/datafile.h b/database/engine/datafile.h index c5c8f31f..eeb11310 100644 --- a/database/engine/datafile.h +++ b/database/engine/datafile.h @@ -26,7 +26,7 @@ struct extent_info { uint8_t number_of_pages; struct rrdengine_datafile *datafile; struct extent_info *next; - struct rrdeng_page_cache_descr *pages[]; + struct rrdeng_page_descr *pages[]; }; struct rrdengine_df_extents { @@ -55,9 +55,12 @@ struct rrdengine_datafile_list { extern void df_extent_insert(struct extent_info *extent); extern void datafile_list_insert(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile); extern void datafile_list_delete(struct rrdengine_instance *ctx, struct rrdengine_datafile *datafile); +extern void generate_datafilepath(struct rrdengine_datafile *datafile, char *str, size_t maxlen); +extern int close_data_file(struct rrdengine_datafile *datafile); extern int destroy_data_file(struct rrdengine_datafile *datafile); extern int create_data_file(struct rrdengine_datafile *datafile); -extern void create_new_datafile_pair(struct rrdengine_instance *ctx, unsigned tier, unsigned fileno); +extern int create_new_datafile_pair(struct rrdengine_instance *ctx, unsigned tier, unsigned fileno); extern int init_data_files(struct rrdengine_instance *ctx); +extern void finalize_data_files(struct rrdengine_instance *ctx); #endif /* NETDATA_DATAFILE_H */
\ No newline at end of file diff --git a/database/engine/journalfile.c b/database/engine/journalfile.c index 44d8461d..30eaa0ec 100644 --- a/database/engine/journalfile.c +++ b/database/engine/journalfile.c @@ -13,7 +13,7 @@ static void flush_transaction_buffer_cb(uv_fs_t* req) uv_fs_req_cleanup(req); free(io_descr->buf); - free(io_descr); + freez(io_descr); } /* Careful to always call this before creating a new journal file */ @@ -87,7 +87,7 @@ void * wal_get_transaction_buffer(struct rrdengine_worker_config* wc, unsigned s return ctx->commit_log.buf + buf_pos; } -static void generate_journalfilepath(struct rrdengine_datafile *datafile, char *str, size_t maxlen) +void generate_journalfilepath(struct rrdengine_datafile *datafile, char *str, size_t maxlen) { (void) snprintf(str, maxlen, "%s/" WALFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL WALFILE_EXTENSION, datafile->ctx->dbfiles_path, datafile->tier, datafile->fileno); @@ -100,39 +100,62 @@ void journalfile_init(struct rrdengine_journalfile *journalfile, struct rrdengin journalfile->datafile = datafile; } +int close_journal_file(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile) +{ + struct rrdengine_instance *ctx = datafile->ctx; + uv_fs_t req; + int ret; + char path[RRDENG_PATH_MAX]; + + generate_journalfilepath(datafile, path, sizeof(path)); + + ret = uv_fs_close(NULL, &req, journalfile->file, NULL); + if (ret < 0) { + error("uv_fs_close(%s): %s", path, uv_strerror(ret)); + ++ctx->stats.fs_errors; + rrd_stat_atomic_add(&global_fs_errors, 1); + } + uv_fs_req_cleanup(&req); + + return ret; +} + int destroy_journal_file(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile) { struct rrdengine_instance *ctx = datafile->ctx; uv_fs_t req; - int ret, fd; - char path[1024]; + int ret; + char path[RRDENG_PATH_MAX]; + + generate_journalfilepath(datafile, path, sizeof(path)); ret = uv_fs_ftruncate(NULL, &req, journalfile->file, 0, NULL); if (ret < 0) { - 
fatal("uv_fs_ftruncate: %s", uv_strerror(ret)); + error("uv_fs_ftruncate(%s): %s", path, uv_strerror(ret)); + ++ctx->stats.fs_errors; + rrd_stat_atomic_add(&global_fs_errors, 1); } - assert(0 == req.result); uv_fs_req_cleanup(&req); ret = uv_fs_close(NULL, &req, journalfile->file, NULL); if (ret < 0) { - fatal("uv_fs_close: %s", uv_strerror(ret)); - exit(ret); + error("uv_fs_close(%s): %s", path, uv_strerror(ret)); + ++ctx->stats.fs_errors; + rrd_stat_atomic_add(&global_fs_errors, 1); } - assert(0 == req.result); uv_fs_req_cleanup(&req); - generate_journalfilepath(datafile, path, sizeof(path)); - fd = uv_fs_unlink(NULL, &req, path, NULL); - if (fd < 0) { - fatal("uv_fs_fsunlink: %s", uv_strerror(fd)); + ret = uv_fs_unlink(NULL, &req, path, NULL); + if (ret < 0) { + error("uv_fs_fsunlink(%s): %s", path, uv_strerror(ret)); + ++ctx->stats.fs_errors; + rrd_stat_atomic_add(&global_fs_errors, 1); } - assert(0 == req.result); uv_fs_req_cleanup(&req); ++ctx->stats.journalfile_deletions; - return 0; + return ret; } int create_journal_file(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile) @@ -143,21 +166,17 @@ int create_journal_file(struct rrdengine_journalfile *journalfile, struct rrdeng int ret, fd; struct rrdeng_jf_sb *superblock; uv_buf_t iov; - char path[1024]; + char path[RRDENG_PATH_MAX]; generate_journalfilepath(datafile, path, sizeof(path)); - fd = uv_fs_open(NULL, &req, path, O_DIRECT | O_CREAT | O_RDWR | O_TRUNC, - S_IRUSR | S_IWUSR, NULL); + fd = open_file_direct_io(path, O_CREAT | O_RDWR | O_TRUNC, &file); if (fd < 0) { - fatal("uv_fs_fsopen: %s", uv_strerror(fd)); + ++ctx->stats.fs_errors; + rrd_stat_atomic_add(&global_fs_errors, 1); + return fd; } - assert(req.result >= 0); - file = req.result; - uv_fs_req_cleanup(&req); -#ifdef __APPLE__ - info("Disabling OS X caching for file \"%s\".", path); - fcntl(fd, F_NOCACHE, 1); -#endif + journalfile->file = file; + ++ctx->stats.journalfile_creations; ret = posix_memalign((void 
*)&superblock, RRDFILE_ALIGNMENT, sizeof(*superblock)); if (unlikely(ret)) { @@ -170,19 +189,21 @@ int create_journal_file(struct rrdengine_journalfile *journalfile, struct rrdeng ret = uv_fs_write(NULL, &req, file, &iov, 1, 0, NULL); if (ret < 0) { - fatal("uv_fs_write: %s", uv_strerror(ret)); - } - if (req.result < 0) { - fatal("uv_fs_write: %s", uv_strerror((int)req.result)); + assert(req.result < 0); + error("uv_fs_write: %s", uv_strerror(ret)); + ++ctx->stats.io_errors; + rrd_stat_atomic_add(&global_io_errors, 1); } uv_fs_req_cleanup(&req); free(superblock); + if (ret < 0) { + destroy_journal_file(journalfile, datafile); + return ret; + } - journalfile->file = file; journalfile->pos = sizeof(*superblock); ctx->stats.io_write_bytes += sizeof(*superblock); ++ctx->stats.io_write_requests; - ++ctx->stats.journalfile_creations; return 0; } @@ -226,7 +247,7 @@ static void restore_extent_metadata(struct rrdengine_instance *ctx, struct rrden { struct page_cache *pg_cache = &ctx->pg_cache; unsigned i, count, payload_length, descr_size, valid_pages; - struct rrdeng_page_cache_descr *descr; + struct rrdeng_page_descr *descr; struct extent_info *extent; /* persistent structures */ struct rrdeng_jf_store_data *jf_metric_data; @@ -271,6 +292,8 @@ static void restore_extent_metadata(struct rrdengine_instance *ctx, struct rrden PValue = JudyHSIns(&pg_cache->metrics_index.JudyHS_array, temp_id, sizeof(uuid_t), PJE0); assert(NULL == *PValue); /* TODO: figure out concurrency model */ *PValue = page_index = create_page_index(temp_id); + page_index->prev = pg_cache->metrics_index.last_page_index; + pg_cache->metrics_index.last_page_index = page_index; uv_rwlock_wrunlock(&pg_cache->metrics_index.lock); } @@ -406,25 +429,17 @@ int load_journal_file(struct rrdengine_instance *ctx, struct rrdengine_journalfi { uv_fs_t req; uv_file file; - int ret, fd; + int ret, fd, error; uint64_t file_size, max_id; - char path[1024]; + char path[RRDENG_PATH_MAX]; generate_journalfilepath(datafile, 
path, sizeof(path)); - fd = uv_fs_open(NULL, &req, path, O_DIRECT | O_RDWR, S_IRUSR | S_IWUSR, NULL); + fd = open_file_direct_io(path, O_RDWR, &file); if (fd < 0) { - /* if (UV_ENOENT != fd) */ - error("uv_fs_fsopen: %s", uv_strerror(fd)); - uv_fs_req_cleanup(&req); + ++ctx->stats.fs_errors; + rrd_stat_atomic_add(&global_fs_errors, 1); return fd; } - assert(req.result >= 0); - file = req.result; - uv_fs_req_cleanup(&req); -#ifdef __APPLE__ - info("Disabling OS X caching for file \"%s\".", path); - fcntl(fd, F_NOCACHE, 1); -#endif info("Loading journal file \"%s\".", path); ret = check_file_properties(file, &file_size, sizeof(struct rrdeng_df_sb)); @@ -449,9 +464,15 @@ int load_journal_file(struct rrdengine_instance *ctx, struct rrdengine_journalfi return 0; error: - (void) uv_fs_close(NULL, &req, file, NULL); + error = ret; + ret = uv_fs_close(NULL, &req, file, NULL); + if (ret < 0) { + error("uv_fs_close(%s): %s", path, uv_strerror(ret)); + ++ctx->stats.fs_errors; + rrd_stat_atomic_add(&global_fs_errors, 1); + } uv_fs_req_cleanup(&req); - return ret; + return error; } void init_commit_log(struct rrdengine_instance *ctx) diff --git a/database/engine/journalfile.h b/database/engine/journalfile.h index 50489aee..0df66304 100644 --- a/database/engine/journalfile.h +++ b/database/engine/journalfile.h @@ -33,9 +33,11 @@ struct transaction_commit_log { unsigned buf_size; }; +extern void generate_journalfilepath(struct rrdengine_datafile *datafile, char *str, size_t maxlen); extern void journalfile_init(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile); extern void *wal_get_transaction_buffer(struct rrdengine_worker_config* wc, unsigned size); extern void wal_flush_transaction_buffer(struct rrdengine_worker_config* wc); +extern int close_journal_file(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile); extern int destroy_journal_file(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile); 
extern int create_journal_file(struct rrdengine_journalfile *journalfile, struct rrdengine_datafile *datafile); extern int load_journal_file(struct rrdengine_instance *ctx, struct rrdengine_journalfile *journalfile, diff --git a/database/engine/pagecache.c b/database/engine/pagecache.c index c90947a6..124f2448 100644 --- a/database/engine/pagecache.c +++ b/database/engine/pagecache.c @@ -8,28 +8,29 @@ static int pg_cache_try_evict_one_page_unsafe(struct rrdengine_instance *ctx); /* always inserts into tail */ static inline void pg_cache_replaceQ_insert_unsafe(struct rrdengine_instance *ctx, - struct rrdeng_page_cache_descr *descr) + struct rrdeng_page_descr *descr) { struct page_cache *pg_cache = &ctx->pg_cache; + struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; if (likely(NULL != pg_cache->replaceQ.tail)) { - descr->prev = pg_cache->replaceQ.tail; - pg_cache->replaceQ.tail->next = descr; + pg_cache_descr->prev = pg_cache->replaceQ.tail; + pg_cache->replaceQ.tail->next = pg_cache_descr; } if (unlikely(NULL == pg_cache->replaceQ.head)) { - pg_cache->replaceQ.head = descr; + pg_cache->replaceQ.head = pg_cache_descr; } - pg_cache->replaceQ.tail = descr; + pg_cache->replaceQ.tail = pg_cache_descr; } static inline void pg_cache_replaceQ_delete_unsafe(struct rrdengine_instance *ctx, - struct rrdeng_page_cache_descr *descr) + struct rrdeng_page_descr *descr) { struct page_cache *pg_cache = &ctx->pg_cache; - struct rrdeng_page_cache_descr *prev, *next; + struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr, *prev, *next; - prev = descr->prev; - next = descr->next; + prev = pg_cache_descr->prev; + next = pg_cache_descr->next; if (likely(NULL != prev)) { prev->next = next; @@ -37,17 +38,17 @@ static inline void pg_cache_replaceQ_delete_unsafe(struct rrdengine_instance *ct if (likely(NULL != next)) { next->prev = prev; } - if (unlikely(descr == pg_cache->replaceQ.head)) { + if (unlikely(pg_cache_descr == pg_cache->replaceQ.head)) { 
pg_cache->replaceQ.head = next; } - if (unlikely(descr == pg_cache->replaceQ.tail)) { + if (unlikely(pg_cache_descr == pg_cache->replaceQ.tail)) { pg_cache->replaceQ.tail = prev; } - descr->prev = descr->next = NULL; + pg_cache_descr->prev = pg_cache_descr->next = NULL; } void pg_cache_replaceQ_insert(struct rrdengine_instance *ctx, - struct rrdeng_page_cache_descr *descr) + struct rrdeng_page_descr *descr) { struct page_cache *pg_cache = &ctx->pg_cache; @@ -57,7 +58,7 @@ void pg_cache_replaceQ_insert(struct rrdengine_instance *ctx, } void pg_cache_replaceQ_delete(struct rrdengine_instance *ctx, - struct rrdeng_page_cache_descr *descr) + struct rrdeng_page_descr *descr) { struct page_cache *pg_cache = &ctx->pg_cache; @@ -66,7 +67,7 @@ void pg_cache_replaceQ_delete(struct rrdengine_instance *ctx, uv_rwlock_wrunlock(&pg_cache->replaceQ.lock); } void pg_cache_replaceQ_set_hot(struct rrdengine_instance *ctx, - struct rrdeng_page_cache_descr *descr) + struct rrdeng_page_descr *descr) { struct page_cache *pg_cache = &ctx->pg_cache; @@ -76,40 +77,28 @@ void pg_cache_replaceQ_set_hot(struct rrdengine_instance *ctx, uv_rwlock_wrunlock(&pg_cache->replaceQ.lock); } -struct rrdeng_page_cache_descr *pg_cache_create_descr(void) +struct rrdeng_page_descr *pg_cache_create_descr(void) { - struct rrdeng_page_cache_descr *descr; + struct rrdeng_page_descr *descr; descr = mallocz(sizeof(*descr)); - descr->page = NULL; descr->page_length = 0; descr->start_time = INVALID_TIME; descr->end_time = INVALID_TIME; descr->id = NULL; descr->extent = NULL; - descr->flags = 0; - descr->prev = descr->next = descr->private = NULL; - descr->refcnt = 0; - descr->waiters = 0; - descr->handle = NULL; - assert(0 == uv_cond_init(&descr->cond)); - assert(0 == uv_mutex_init(&descr->mutex)); + descr->pg_cache_descr_state = 0; + descr->pg_cache_descr = NULL; return descr; } -void pg_cache_destroy_descr(struct rrdeng_page_cache_descr *descr) -{ - uv_cond_destroy(&descr->cond); - 
uv_mutex_destroy(&descr->mutex); - free(descr); -} - /* The caller must hold page descriptor lock. */ -void pg_cache_wake_up_waiters_unsafe(struct rrdeng_page_cache_descr *descr) +void pg_cache_wake_up_waiters_unsafe(struct rrdeng_page_descr *descr) { - if (descr->waiters) - uv_cond_broadcast(&descr->cond); + struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; + if (pg_cache_descr->waiters) + uv_cond_broadcast(&pg_cache_descr->cond); } /* @@ -117,11 +106,13 @@ void pg_cache_wake_up_waiters_unsafe(struct rrdeng_page_cache_descr *descr) * The lock will be released and re-acquired. The descriptor is not guaranteed * to exist after this function returns. */ -void pg_cache_wait_event_unsafe(struct rrdeng_page_cache_descr *descr) +void pg_cache_wait_event_unsafe(struct rrdeng_page_descr *descr) { - ++descr->waiters; - uv_cond_wait(&descr->cond, &descr->mutex); - --descr->waiters; + struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; + + ++pg_cache_descr->waiters; + uv_cond_wait(&pg_cache_descr->cond, &pg_cache_descr->mutex); + --pg_cache_descr->waiters; } /* @@ -129,14 +120,15 @@ void pg_cache_wait_event_unsafe(struct rrdeng_page_cache_descr *descr) * The lock will be released and re-acquired. The descriptor is not guaranteed * to exist after this function returns. */ -unsigned long pg_cache_wait_event(struct rrdeng_page_cache_descr *descr) +unsigned long pg_cache_wait_event(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr) { + struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; unsigned long flags; - uv_mutex_lock(&descr->mutex); + rrdeng_page_descr_mutex_lock(ctx, descr); pg_cache_wait_event_unsafe(descr); - flags = descr->flags; - uv_mutex_unlock(&descr->mutex); + flags = pg_cache_descr->flags; + rrdeng_page_descr_mutex_unlock(ctx, descr); return flags; } @@ -146,15 +138,17 @@ unsigned long pg_cache_wait_event(struct rrdeng_page_cache_descr *descr) * Gets a reference to the page descriptor. 
* Returns 1 on success and 0 on failure. */ -int pg_cache_try_get_unsafe(struct rrdeng_page_cache_descr *descr, int exclusive_access) +int pg_cache_try_get_unsafe(struct rrdeng_page_descr *descr, int exclusive_access) { - if ((descr->flags & (RRD_PAGE_LOCKED | RRD_PAGE_READ_PENDING)) || - (exclusive_access && descr->refcnt)) { + struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; + + if ((pg_cache_descr->flags & (RRD_PAGE_LOCKED | RRD_PAGE_READ_PENDING)) || + (exclusive_access && pg_cache_descr->refcnt)) { return 0; } if (exclusive_access) - descr->flags |= RRD_PAGE_LOCKED; - ++descr->refcnt; + pg_cache_descr->flags |= RRD_PAGE_LOCKED; + ++pg_cache_descr->refcnt; return 1; } @@ -163,10 +157,12 @@ int pg_cache_try_get_unsafe(struct rrdeng_page_cache_descr *descr, int exclusive * The caller must hold page descriptor lock. * Same return values as pg_cache_try_get_unsafe() without doing anything. */ -int pg_cache_can_get_unsafe(struct rrdeng_page_cache_descr *descr, int exclusive_access) +int pg_cache_can_get_unsafe(struct rrdeng_page_descr *descr, int exclusive_access) { - if ((descr->flags & (RRD_PAGE_LOCKED | RRD_PAGE_READ_PENDING)) || - (exclusive_access && descr->refcnt)) { + struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; + + if ((pg_cache_descr->flags & (RRD_PAGE_LOCKED | RRD_PAGE_READ_PENDING)) || + (exclusive_access && pg_cache_descr->refcnt)) { return 0; } @@ -177,23 +173,24 @@ int pg_cache_can_get_unsafe(struct rrdeng_page_cache_descr *descr, int exclusive * The caller must hold the page descriptor lock. * This function may block doing cleanup. 
*/ -void pg_cache_put_unsafe(struct rrdeng_page_cache_descr *descr) +void pg_cache_put_unsafe(struct rrdeng_page_descr *descr) { - descr->flags &= ~RRD_PAGE_LOCKED; - if (0 == --descr->refcnt) { + struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; + + pg_cache_descr->flags &= ~RRD_PAGE_LOCKED; + if (0 == --pg_cache_descr->refcnt) { pg_cache_wake_up_waiters_unsafe(descr); } - /* TODO: perform cleanup */ } /* * This function may block doing cleanup. */ -void pg_cache_put(struct rrdeng_page_cache_descr *descr) +void pg_cache_put(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr) { - uv_mutex_lock(&descr->mutex); + rrdeng_page_descr_mutex_lock(ctx, descr); pg_cache_put_unsafe(descr); - uv_mutex_unlock(&descr->mutex); + rrdeng_page_descr_mutex_unlock(ctx, descr); } /* The caller must hold the page cache lock */ @@ -224,7 +221,7 @@ static void pg_cache_reserve_pages(struct rrdengine_instance *ctx, unsigned numb uv_rwlock_wrlock(&pg_cache->pg_cache_rwlock); if (pg_cache->populated_pages + number >= ctx->max_cache_pages + 1) - debug(D_RRDENGINE, "=================================\nPage cache full. Reserving %u pages.\n=================================", + debug(D_RRDENGINE, "==Page cache full. Reserving %u pages.==", number); while (pg_cache->populated_pages + number >= ctx->max_cache_pages + 1) { if (!pg_cache_try_evict_one_page_unsafe(ctx)) { @@ -266,7 +263,7 @@ static int pg_cache_try_reserve_pages(struct rrdengine_instance *ctx, unsigned n uv_rwlock_wrlock(&pg_cache->pg_cache_rwlock); if (pg_cache->populated_pages + number >= ctx->cache_pages_low_watermark + 1) { debug(D_RRDENGINE, - "=================================\nPage cache full. Trying to reserve %u pages.\n=================================", + "==Page cache full. 
Trying to reserve %u pages.==", number); do { if (!pg_cache_try_evict_one_page_unsafe(ctx)) @@ -286,18 +283,20 @@ static int pg_cache_try_reserve_pages(struct rrdengine_instance *ctx, unsigned n } /* The caller must hold the page cache and the page descriptor locks in that order */ -static void pg_cache_evict_unsafe(struct rrdengine_instance *ctx, struct rrdeng_page_cache_descr *descr) +static void pg_cache_evict_unsafe(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr) { - free(descr->page); - descr->page = NULL; - descr->flags &= ~RRD_PAGE_POPULATED; + struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; + + freez(pg_cache_descr->page); + pg_cache_descr->page = NULL; + pg_cache_descr->flags &= ~RRD_PAGE_POPULATED; pg_cache_release_pages_unsafe(ctx, 1); ++ctx->stats.pg_cache_evictions; } /* * The caller must hold the page cache lock. - * Lock order: page cache -> replaceQ -> descriptor + * Lock order: page cache -> replaceQ -> page descriptor * This function iterates all pages and tries to evict one. * If it fails it sets in_flight_descr to the oldest descriptor that has write-back in progress, * or it sets it to NULL if no write-back is in progress. 
@@ -308,36 +307,40 @@ static int pg_cache_try_evict_one_page_unsafe(struct rrdengine_instance *ctx) { struct page_cache *pg_cache = &ctx->pg_cache; unsigned long old_flags; - struct rrdeng_page_cache_descr *descr; + struct rrdeng_page_descr *descr; + struct page_cache_descr *pg_cache_descr = NULL; uv_rwlock_wrlock(&pg_cache->replaceQ.lock); - for (descr = pg_cache->replaceQ.head ; NULL != descr ; descr = descr->next) { - uv_mutex_lock(&descr->mutex); - old_flags = descr->flags; + for (pg_cache_descr = pg_cache->replaceQ.head ; NULL != pg_cache_descr ; pg_cache_descr = pg_cache_descr->next) { + descr = pg_cache_descr->descr; + + rrdeng_page_descr_mutex_lock(ctx, descr); + old_flags = pg_cache_descr->flags; if ((old_flags & RRD_PAGE_POPULATED) && !(old_flags & RRD_PAGE_DIRTY) && pg_cache_try_get_unsafe(descr, 1)) { /* must evict */ pg_cache_evict_unsafe(ctx, descr); pg_cache_put_unsafe(descr); - uv_mutex_unlock(&descr->mutex); pg_cache_replaceQ_delete_unsafe(ctx, descr); + + rrdeng_page_descr_mutex_unlock(ctx, descr); uv_rwlock_wrunlock(&pg_cache->replaceQ.lock); + rrdeng_try_deallocate_pg_cache_descr(ctx, descr); + return 1; } - uv_mutex_unlock(&descr->mutex); - }; + rrdeng_page_descr_mutex_unlock(ctx, descr); + } uv_rwlock_wrunlock(&pg_cache->replaceQ.lock); /* failed to evict */ return 0; } -/* - * TODO: last waiter frees descriptor ? 
- */ -void pg_cache_punch_hole(struct rrdengine_instance *ctx, struct rrdeng_page_cache_descr *descr) +void pg_cache_punch_hole(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr, uint8_t remove_dirty) { struct page_cache *pg_cache = &ctx->pg_cache; + struct page_cache_descr *pg_cache_descr = NULL; Pvoid_t *PValue; struct pg_cache_page_index *page_index; int ret; @@ -353,8 +356,9 @@ void pg_cache_punch_hole(struct rrdengine_instance *ctx, struct rrdeng_page_cach uv_rwlock_wrunlock(&page_index->lock); if (unlikely(0 == ret)) { error("Page under deletion was not in index."); - if (unlikely(debug_flags & D_RRDENGINE)) - print_page_cache_descr(descr); + if (unlikely(debug_flags & D_RRDENGINE)) { + print_page_descr(descr); + } goto destroy; } assert(1 == ret); @@ -364,23 +368,26 @@ void pg_cache_punch_hole(struct rrdengine_instance *ctx, struct rrdeng_page_cach --pg_cache->page_descriptors; uv_rwlock_wrunlock(&pg_cache->pg_cache_rwlock); - uv_mutex_lock(&descr->mutex); + rrdeng_page_descr_mutex_lock(ctx, descr); + pg_cache_descr = descr->pg_cache_descr; while (!pg_cache_try_get_unsafe(descr, 1)) { debug(D_RRDENGINE, "%s: Waiting for locked page:", __func__); - if(unlikely(debug_flags & D_RRDENGINE)) - print_page_cache_descr(descr); - pg_cache_wait_event_unsafe(descr); - } - /* even a locked page could be dirty */ - while (unlikely(descr->flags & RRD_PAGE_DIRTY)) { - debug(D_RRDENGINE, "%s: Found dirty page, waiting for it to be flushed:", __func__); if (unlikely(debug_flags & D_RRDENGINE)) print_page_cache_descr(descr); pg_cache_wait_event_unsafe(descr); } - uv_mutex_unlock(&descr->mutex); + if (!remove_dirty) { + /* even a locked page could be dirty */ + while (unlikely(pg_cache_descr->flags & RRD_PAGE_DIRTY)) { + debug(D_RRDENGINE, "%s: Found dirty page, waiting for it to be flushed:", __func__); + if (unlikely(debug_flags & D_RRDENGINE)) + print_page_cache_descr(descr); + pg_cache_wait_event_unsafe(descr); + } + } + rrdeng_page_descr_mutex_unlock(ctx, 
descr); - if (descr->flags & RRD_PAGE_POPULATED) { + if (pg_cache_descr->flags & RRD_PAGE_POPULATED) { /* only after locking can it be safely deleted from LRU */ pg_cache_replaceQ_delete(ctx, descr); @@ -388,13 +395,15 @@ void pg_cache_punch_hole(struct rrdengine_instance *ctx, struct rrdeng_page_cach pg_cache_evict_unsafe(ctx, descr); uv_rwlock_wrunlock(&pg_cache->pg_cache_rwlock); } - pg_cache_put(descr); + pg_cache_put(ctx, descr); + + rrdeng_destroy_pg_cache_descr(ctx, pg_cache_descr); destroy: - pg_cache_destroy_descr(descr); + freez(descr); pg_cache_update_metric_times(page_index); } -static inline int is_page_in_time_range(struct rrdeng_page_cache_descr *descr, usec_t start_time, usec_t end_time) +static inline int is_page_in_time_range(struct rrdeng_page_descr *descr, usec_t start_time, usec_t end_time) { usec_t pg_start, pg_end; @@ -405,13 +414,13 @@ static inline int is_page_in_time_range(struct rrdeng_page_cache_descr *descr, u (pg_start >= start_time && pg_start <= end_time); } -static inline int is_point_in_time_in_page(struct rrdeng_page_cache_descr *descr, usec_t point_in_time) +static inline int is_point_in_time_in_page(struct rrdeng_page_descr *descr, usec_t point_in_time) { return (point_in_time >= descr->start_time && point_in_time <= descr->end_time); } /* Update metric oldest and latest timestamps efficiently when adding new values */ -void pg_cache_add_new_metric_time(struct pg_cache_page_index *page_index, struct rrdeng_page_cache_descr *descr) +void pg_cache_add_new_metric_time(struct pg_cache_page_index *page_index, struct rrdeng_page_descr *descr) { usec_t oldest_time = page_index->oldest_time; usec_t latest_time = page_index->latest_time; @@ -429,7 +438,7 @@ void pg_cache_update_metric_times(struct pg_cache_page_index *page_index) { Pvoid_t *firstPValue, *lastPValue; Word_t firstIndex, lastIndex; - struct rrdeng_page_cache_descr *descr; + struct rrdeng_page_descr *descr; usec_t oldest_time = INVALID_TIME; usec_t latest_time = 
INVALID_TIME; @@ -460,16 +469,23 @@ void pg_cache_update_metric_times(struct pg_cache_page_index *page_index) /* If index is NULL lookup by UUID (descr->id) */ void pg_cache_insert(struct rrdengine_instance *ctx, struct pg_cache_page_index *index, - struct rrdeng_page_cache_descr *descr) + struct rrdeng_page_descr *descr) { struct page_cache *pg_cache = &ctx->pg_cache; Pvoid_t *PValue; struct pg_cache_page_index *page_index; + unsigned long pg_cache_descr_state = descr->pg_cache_descr_state; + + if (0 != pg_cache_descr_state) { + /* there is page cache descriptor pre-allocated state */ + struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; - if (descr->flags & RRD_PAGE_POPULATED) { - pg_cache_reserve_pages(ctx, 1); - if (!(descr->flags & RRD_PAGE_DIRTY)) - pg_cache_replaceQ_insert(ctx, descr); + assert(pg_cache_descr_state & PG_CACHE_DESCR_ALLOCATED); + if (pg_cache_descr->flags & RRD_PAGE_POPULATED) { + pg_cache_reserve_pages(ctx, 1); + if (!(pg_cache_descr->flags & RRD_PAGE_DIRTY)) + pg_cache_replaceQ_insert(ctx, descr); + } } if (unlikely(NULL == index)) { @@ -503,7 +519,8 @@ struct pg_cache_page_index * pg_cache_preload(struct rrdengine_instance *ctx, uuid_t *id, usec_t start_time, usec_t end_time) { struct page_cache *pg_cache = &ctx->pg_cache; - struct rrdeng_page_cache_descr *descr = NULL, *preload_array[PAGE_CACHE_MAX_PRELOAD_PAGES]; + struct rrdeng_page_descr *descr = NULL, *preload_array[PAGE_CACHE_MAX_PRELOAD_PAGES]; + struct page_cache_descr *pg_cache_descr = NULL; int i, j, k, count, found; unsigned long flags; Pvoid_t *PValue; @@ -557,12 +574,13 @@ struct pg_cache_page_index * if (unlikely(0 == descr->page_length)) continue; - uv_mutex_lock(&descr->mutex); - flags = descr->flags; + rrdeng_page_descr_mutex_lock(ctx, descr); + pg_cache_descr = descr->pg_cache_descr; + flags = pg_cache_descr->flags; if (pg_cache_can_get_unsafe(descr, 0)) { if (flags & RRD_PAGE_POPULATED) { /* success */ - uv_mutex_unlock(&descr->mutex); + 
rrdeng_page_descr_mutex_unlock(ctx, descr); debug(D_RRDENGINE, "%s: Page was found in memory.", __func__); continue; } @@ -570,19 +588,19 @@ struct pg_cache_page_index * if (!(flags & RRD_PAGE_POPULATED) && pg_cache_try_get_unsafe(descr, 1)) { preload_array[count++] = descr; if (PAGE_CACHE_MAX_PRELOAD_PAGES == count) { - uv_mutex_unlock(&descr->mutex); + rrdeng_page_descr_mutex_unlock(ctx, descr); break; } } - uv_mutex_unlock(&descr->mutex); + rrdeng_page_descr_mutex_unlock(ctx, descr); - }; + } uv_rwlock_rdunlock(&page_index->lock); failed_to_reserve = 0; for (i = 0 ; i < count && !failed_to_reserve ; ++i) { struct rrdeng_cmd cmd; - struct rrdeng_page_cache_descr *next; + struct rrdeng_page_descr *next; descr = preload_array[i]; if (NULL == descr) { @@ -622,7 +640,7 @@ struct pg_cache_page_index * if (NULL == descr) { continue; } - pg_cache_put(descr); + pg_cache_put(ctx, descr); } } if (!count) { @@ -637,12 +655,13 @@ struct pg_cache_page_index * * When point_in_time is INVALID_TIME get any page. * If index is NULL lookup by UUID (id). 
*/ -struct rrdeng_page_cache_descr * +struct rrdeng_page_descr * pg_cache_lookup(struct rrdengine_instance *ctx, struct pg_cache_page_index *index, uuid_t *id, usec_t point_in_time) { struct page_cache *pg_cache = &ctx->pg_cache; - struct rrdeng_page_cache_descr *descr = NULL; + struct rrdeng_page_descr *descr = NULL; + struct page_cache_descr *pg_cache_descr = NULL; unsigned long flags; Pvoid_t *PValue; struct pg_cache_page_index *page_index; @@ -682,11 +701,12 @@ struct rrdeng_page_cache_descr * pg_cache_release_pages(ctx, 1); return NULL; } - uv_mutex_lock(&descr->mutex); - flags = descr->flags; + rrdeng_page_descr_mutex_lock(ctx, descr); + pg_cache_descr = descr->pg_cache_descr; + flags = pg_cache_descr->flags; if ((flags & RRD_PAGE_POPULATED) && pg_cache_try_get_unsafe(descr, 0)) { /* success */ - uv_mutex_unlock(&descr->mutex); + rrdeng_page_descr_mutex_unlock(ctx, descr); debug(D_RRDENGINE, "%s: Page was found in memory.", __func__); break; } @@ -702,14 +722,14 @@ struct rrdeng_page_cache_descr * debug(D_RRDENGINE, "%s: Waiting for page to be asynchronously read from disk:", __func__); if(unlikely(debug_flags & D_RRDENGINE)) print_page_cache_descr(descr); - while (!(descr->flags & RRD_PAGE_POPULATED)) { + while (!(pg_cache_descr->flags & RRD_PAGE_POPULATED)) { pg_cache_wait_event_unsafe(descr); } /* success */ /* Downgrade exclusive reference to allow other readers */ - descr->flags &= ~RRD_PAGE_LOCKED; + pg_cache_descr->flags &= ~RRD_PAGE_LOCKED; pg_cache_wake_up_waiters_unsafe(descr); - uv_mutex_unlock(&descr->mutex); + rrdeng_page_descr_mutex_unlock(ctx, descr); rrd_stat_atomic_add(&ctx->stats.pg_cache_misses, 1); return descr; } @@ -720,7 +740,7 @@ struct rrdeng_page_cache_descr * if (!(flags & RRD_PAGE_POPULATED)) page_not_in_cache = 1; pg_cache_wait_event_unsafe(descr); - uv_mutex_unlock(&descr->mutex); + rrdeng_page_descr_mutex_unlock(ctx, descr); /* reset scan to find again */ uv_rwlock_rdlock(&page_index->lock); @@ -747,6 +767,7 @@ struct 
pg_cache_page_index *create_page_index(uuid_t *id) assert(0 == uv_rwlock_init(&page_index->lock)); page_index->oldest_time = INVALID_TIME; page_index->latest_time = INVALID_TIME; + page_index->prev = NULL; return page_index; } @@ -756,6 +777,7 @@ static void init_metrics_index(struct rrdengine_instance *ctx) struct page_cache *pg_cache = &ctx->pg_cache; pg_cache->metrics_index.JudyHS_array = (Pvoid_t) NULL; + pg_cache->metrics_index.last_page_index = NULL; assert(0 == uv_rwlock_init(&pg_cache->metrics_index.lock)); } @@ -789,4 +811,65 @@ void init_page_cache(struct rrdengine_instance *ctx) init_metrics_index(ctx); init_replaceQ(ctx); init_commited_page_index(ctx); +} + +void free_page_cache(struct rrdengine_instance *ctx) +{ + struct page_cache *pg_cache = &ctx->pg_cache; + Word_t ret_Judy, bytes_freed = 0; + Pvoid_t *PValue; + struct pg_cache_page_index *page_index, *prev_page_index; + Word_t Index; + struct rrdeng_page_descr *descr; + struct page_cache_descr *pg_cache_descr; + + /* Free commited page index */ + ret_Judy = JudyLFreeArray(&pg_cache->commited_page_index.JudyL_array, PJE0); + assert(NULL == pg_cache->commited_page_index.JudyL_array); + bytes_freed += ret_Judy; + + for (page_index = pg_cache->metrics_index.last_page_index ; + page_index != NULL ; + page_index = prev_page_index) { + prev_page_index = page_index->prev; + + /* Find first page in range */ + Index = (Word_t) 0; + PValue = JudyLFirst(page_index->JudyL_array, &Index, PJE0); + if (likely(NULL != PValue)) { + descr = *PValue; + } + while (descr != NULL) { + /* Iterate all page descriptors of this metric */ + + if (descr->pg_cache_descr_state & PG_CACHE_DESCR_ALLOCATED) { + /* Check rrdenglocking.c */ + pg_cache_descr = descr->pg_cache_descr; + if (pg_cache_descr->flags & RRD_PAGE_POPULATED) { + freez(pg_cache_descr->page); + bytes_freed += RRDENG_BLOCK_SIZE; + } + rrdeng_destroy_pg_cache_descr(ctx, pg_cache_descr); + bytes_freed += sizeof(*pg_cache_descr); + } + freez(descr); + bytes_freed += 
sizeof(*descr); + + PValue = JudyLNext(page_index->JudyL_array, &Index, PJE0); + descr = unlikely(NULL == PValue) ? NULL : *PValue; + } + + /* Free page index */ + ret_Judy = JudyLFreeArray(&page_index->JudyL_array, PJE0); + assert(NULL == page_index->JudyL_array); + bytes_freed += ret_Judy; + freez(page_index); + bytes_freed += sizeof(*page_index); + } + /* Free metrics index */ + ret_Judy = JudyHSFreeArray(&pg_cache->metrics_index.JudyHS_array, PJE0); + assert(NULL == pg_cache->metrics_index.JudyHS_array); + bytes_freed += ret_Judy; + + info("Freed %lu bytes of memory from page cache.", bytes_freed); }
\ No newline at end of file diff --git a/database/engine/pagecache.h b/database/engine/pagecache.h index d1e29aaa..b5670f82 100644 --- a/database/engine/pagecache.h +++ b/database/engine/pagecache.h @@ -5,9 +5,10 @@ #include "rrdengine.h" -/* Forward declerations */ +/* Forward declarations */ struct rrdengine_instance; struct extent_info; +struct rrdeng_page_descr; #define INVALID_TIME (0) @@ -18,24 +19,46 @@ struct extent_info; #define RRD_PAGE_WRITE_PENDING (1LU << 3) #define RRD_PAGE_POPULATED (1LU << 4) -struct rrdeng_page_cache_descr { +struct page_cache_descr { + struct rrdeng_page_descr *descr; /* parent descriptor */ void *page; - uint32_t page_length; - usec_t start_time; - usec_t end_time; - uuid_t *id; /* never changes */ - struct extent_info *extent; unsigned long flags; - void *private; - struct rrdeng_page_cache_descr *prev; - struct rrdeng_page_cache_descr *next; + struct page_cache_descr *prev; /* LRU */ + struct page_cache_descr *next; /* LRU */ - /* TODO: move waiter logic to concurrency table */ unsigned refcnt; uv_mutex_t mutex; /* always take it after the page cache lock or after the commit lock */ uv_cond_t cond; unsigned waiters; - struct rrdeng_collect_handle *handle; /* API user */ +}; + +/* Page cache descriptor flags, state = 0 means no descriptor */ +#define PG_CACHE_DESCR_ALLOCATED (1LU << 0) +#define PG_CACHE_DESCR_DESTROY (1LU << 1) +#define PG_CACHE_DESCR_LOCKED (1LU << 2) +#define PG_CACHE_DESCR_SHIFT (3) +#define PG_CACHE_DESCR_USERS_MASK (((unsigned long)-1) << PG_CACHE_DESCR_SHIFT) +#define PG_CACHE_DESCR_FLAGS_MASK (((unsigned long)-1) >> (BITS_PER_ULONG - PG_CACHE_DESCR_SHIFT)) + +/* + * Page cache descriptor state bits (works for both 32-bit and 64-bit architectures): + * + * 63 ... 31 ... 
3 | 2 | 1 | 0| + * -----------------------------+------------+------------+-----------| + * number of descriptor users | DESTROY | LOCKED | ALLOCATED | + */ +struct rrdeng_page_descr { + uint32_t page_length; + usec_t start_time; + usec_t end_time; + uuid_t *id; /* never changes */ + struct extent_info *extent; + + /* points to ephemeral page cache descriptor if the page resides in the cache */ + struct page_cache_descr *pg_cache_descr; + + /* Compare-And-Swap target for page cache descriptor allocation algorithm */ + volatile unsigned long pg_cache_descr_state; }; #define PAGE_CACHE_MAX_PRELOAD_PAGES (256) @@ -61,12 +84,15 @@ struct pg_cache_page_index { * It's also written by the data deletion workqueue when data collection is disabled for this metric. */ usec_t latest_time; + + struct pg_cache_page_index *prev; }; /* maps UUIDs to page indices */ struct pg_cache_metrics_index { uv_rwlock_t lock; Pvoid_t JudyHS_array; + struct pg_cache_page_index *last_page_index; }; /* gathers dirty pages to be written on disk */ @@ -85,12 +111,15 @@ struct pg_cache_commited_page_index { unsigned nr_commited_pages; }; -/* gathers populated pages to be evicted */ +/* + * Gathers populated pages to be evicted. + * Relies on page cache descriptors being there as it uses their memory. 
+ */ struct pg_cache_replaceQ { uv_rwlock_t lock; /* LRU lock */ - struct rrdeng_page_cache_descr *head; /* LRU */ - struct rrdeng_page_cache_descr *tail; /* MRU */ + struct page_cache_descr *head; /* LRU */ + struct page_cache_descr *tail; /* MRU */ }; struct page_cache { /* TODO: add statistics */ @@ -104,29 +133,31 @@ struct page_cache { /* TODO: add statistics */ unsigned populated_pages; }; -extern void pg_cache_wake_up_waiters_unsafe(struct rrdeng_page_cache_descr *descr); -extern void pg_cache_wait_event_unsafe(struct rrdeng_page_cache_descr *descr); -extern unsigned long pg_cache_wait_event(struct rrdeng_page_cache_descr *descr); +extern void pg_cache_wake_up_waiters_unsafe(struct rrdeng_page_descr *descr); +extern void pg_cache_wait_event_unsafe(struct rrdeng_page_descr *descr); +extern unsigned long pg_cache_wait_event(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr); extern void pg_cache_replaceQ_insert(struct rrdengine_instance *ctx, - struct rrdeng_page_cache_descr *descr); + struct rrdeng_page_descr *descr); extern void pg_cache_replaceQ_delete(struct rrdengine_instance *ctx, - struct rrdeng_page_cache_descr *descr); + struct rrdeng_page_descr *descr); extern void pg_cache_replaceQ_set_hot(struct rrdengine_instance *ctx, - struct rrdeng_page_cache_descr *descr); -extern struct rrdeng_page_cache_descr *pg_cache_create_descr(void); -extern void pg_cache_put_unsafe(struct rrdeng_page_cache_descr *descr); -extern void pg_cache_put(struct rrdeng_page_cache_descr *descr); + struct rrdeng_page_descr *descr); +extern struct rrdeng_page_descr *pg_cache_create_descr(void); +extern int pg_cache_try_get_unsafe(struct rrdeng_page_descr *descr, int exclusive_access); +extern void pg_cache_put_unsafe(struct rrdeng_page_descr *descr); +extern void pg_cache_put(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr); extern void pg_cache_insert(struct rrdengine_instance *ctx, struct pg_cache_page_index *index, - struct rrdeng_page_cache_descr 
*descr); -extern void pg_cache_punch_hole(struct rrdengine_instance *ctx, struct rrdeng_page_cache_descr *descr); + struct rrdeng_page_descr *descr); +extern void pg_cache_punch_hole(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr, uint8_t remove_dirty); extern struct pg_cache_page_index * pg_cache_preload(struct rrdengine_instance *ctx, uuid_t *id, usec_t start_time, usec_t end_time); -extern struct rrdeng_page_cache_descr * +extern struct rrdeng_page_descr * pg_cache_lookup(struct rrdengine_instance *ctx, struct pg_cache_page_index *index, uuid_t *id, usec_t point_in_time); extern struct pg_cache_page_index *create_page_index(uuid_t *id); extern void init_page_cache(struct rrdengine_instance *ctx); -extern void pg_cache_add_new_metric_time(struct pg_cache_page_index *page_index, struct rrdeng_page_cache_descr *descr); +extern void free_page_cache(struct rrdengine_instance *ctx); +extern void pg_cache_add_new_metric_time(struct pg_cache_page_index *page_index, struct rrdeng_page_descr *descr); extern void pg_cache_update_metric_times(struct pg_cache_page_index *page_index); -#endif /* NETDATA_PAGECACHE_H */
\ No newline at end of file +#endif /* NETDATA_PAGECACHE_H */ diff --git a/database/engine/rrdengine.c b/database/engine/rrdengine.c index b8e4eba0..0f2dceaa 100644 --- a/database/engine/rrdengine.c +++ b/database/engine/rrdengine.c @@ -3,6 +3,10 @@ #include "rrdengine.h" +rrdeng_stats_t global_io_errors = 0; +rrdeng_stats_t global_fs_errors = 0; +rrdeng_stats_t rrdeng_reserved_file_descriptors = 0; + void sanity_check(void) { /* Magic numbers must fit in the super-blocks */ @@ -27,12 +31,12 @@ void read_extent_cb(uv_fs_t* req) struct rrdengine_worker_config* wc = req->loop->data; struct rrdengine_instance *ctx = wc->ctx; struct extent_io_descriptor *xt_io_descr; - struct rrdeng_page_cache_descr *descr; + struct rrdeng_page_descr *descr; + struct page_cache_descr *pg_cache_descr; int ret; unsigned i, j, count; void *page, *uncompressed_buf = NULL; uint32_t payload_length, payload_offset, page_offset, uncompressed_payload_length; - struct rrdengine_datafile *datafile; /* persistent structures */ struct rrdeng_df_extent_header *header; struct rrdeng_df_extent_trailer *trailer; @@ -54,9 +58,13 @@ void read_extent_cb(uv_fs_t* req) crc = crc32(0L, Z_NULL, 0); crc = crc32(crc, xt_io_descr->buf, xt_io_descr->bytes - sizeof(*trailer)); ret = crc32cmp(trailer->checksum, crc); - datafile = xt_io_descr->descr_array[0]->extent->datafile; - debug(D_RRDENGINE, "%s: Extent at offset %"PRIu64"(%u) was read from datafile %u-%u. CRC32 check: %s", __func__, - xt_io_descr->pos, xt_io_descr->bytes, datafile->tier, datafile->fileno, ret ? "FAILED" : "SUCCEEDED"); +#ifdef NETDATA_INTERNAL_CHECKS + { + struct rrdengine_datafile *datafile = xt_io_descr->descr_array[0]->extent->datafile; + debug(D_RRDENGINE, "%s: Extent at offset %"PRIu64"(%u) was read from datafile %u-%u. CRC32 check: %s", __func__, + xt_io_descr->pos, xt_io_descr->bytes, datafile->tier, datafile->fileno, ret ? 
"FAILED" : "SUCCEEDED"); + } +#endif if (unlikely(ret)) { /* TODO: handle errors */ exit(UV_EIO); @@ -97,36 +105,38 @@ void read_extent_cb(uv_fs_t* req) (void) memcpy(page, uncompressed_buf + page_offset, descr->page_length); } pg_cache_replaceQ_insert(ctx, descr); - uv_mutex_lock(&descr->mutex); - descr->page = page; - descr->flags |= RRD_PAGE_POPULATED; - descr->flags &= ~RRD_PAGE_READ_PENDING; + rrdeng_page_descr_mutex_lock(ctx, descr); + pg_cache_descr = descr->pg_cache_descr; + pg_cache_descr->page = page; + pg_cache_descr->flags |= RRD_PAGE_POPULATED; + pg_cache_descr->flags &= ~RRD_PAGE_READ_PENDING; debug(D_RRDENGINE, "%s: Waking up waiters.", __func__); if (xt_io_descr->release_descr) { pg_cache_put_unsafe(descr); } else { pg_cache_wake_up_waiters_unsafe(descr); } - uv_mutex_unlock(&descr->mutex); + rrdeng_page_descr_mutex_unlock(ctx, descr); } if (RRD_NO_COMPRESSION != header->compression_algorithm) { - free(uncompressed_buf); + freez(uncompressed_buf); } if (xt_io_descr->completion) complete(xt_io_descr->completion); cleanup: uv_fs_req_cleanup(req); free(xt_io_descr->buf); - free(xt_io_descr); + freez(xt_io_descr); } static void do_read_extent(struct rrdengine_worker_config* wc, - struct rrdeng_page_cache_descr **descr, + struct rrdeng_page_descr **descr, unsigned count, uint8_t release_descr) { struct rrdengine_instance *ctx = wc->ctx; + struct page_cache_descr *pg_cache_descr; int ret; unsigned i, size_bytes, pos, real_io_size; // uint32_t payload_length; @@ -141,14 +151,15 @@ static void do_read_extent(struct rrdengine_worker_config* wc, ret = posix_memalign((void *)&xt_io_descr->buf, RRDFILE_ALIGNMENT, ALIGN_BYTES_CEILING(size_bytes)); if (unlikely(ret)) { fatal("posix_memalign:%s", strerror(ret)); - /* free(xt_io_descr); + /* freez(xt_io_descr); return;*/ } for (i = 0 ; i < count; ++i) { - uv_mutex_lock(&descr[i]->mutex); - descr[i]->flags |= RRD_PAGE_READ_PENDING; + rrdeng_page_descr_mutex_lock(ctx, descr[i]); + pg_cache_descr = 
descr[i]->pg_cache_descr; + pg_cache_descr->flags |= RRD_PAGE_READ_PENDING; // payload_length = descr[i]->page_length; - uv_mutex_unlock(&descr[i]->mutex); + rrdeng_page_descr_mutex_unlock(ctx, descr[i]); xt_io_descr->descr_array[i] = descr[i]; } @@ -227,8 +238,8 @@ void flush_pages_cb(uv_fs_t* req) struct rrdengine_instance *ctx = wc->ctx; struct page_cache *pg_cache = &ctx->pg_cache; struct extent_io_descriptor *xt_io_descr; - struct rrdeng_page_cache_descr *descr; - struct rrdengine_datafile *datafile; + struct rrdeng_page_descr *descr; + struct page_cache_descr *pg_cache_descr; int ret; unsigned i, count; Word_t commit_id; @@ -238,10 +249,13 @@ void flush_pages_cb(uv_fs_t* req) error("%s: uv_fs_write: %s", __func__, uv_strerror((int)req->result)); goto cleanup; } - datafile = xt_io_descr->descr_array[0]->extent->datafile; - debug(D_RRDENGINE, "%s: Extent at offset %"PRIu64"(%u) was written to datafile %u-%u. Waking up waiters.", - __func__, xt_io_descr->pos, xt_io_descr->bytes, datafile->tier, datafile->fileno); - +#ifdef NETDATA_INTERNAL_CHECKS + { + struct rrdengine_datafile *datafile = xt_io_descr->descr_array[0]->extent->datafile; + debug(D_RRDENGINE, "%s: Extent at offset %"PRIu64"(%u) was written to datafile %u-%u. 
Waking up waiters.", + __func__, xt_io_descr->pos, xt_io_descr->bytes, datafile->tier, datafile->fileno); + } +#endif count = xt_io_descr->descr_count; for (i = 0 ; i < count ; ++i) { /* care, we don't hold the descriptor mutex */ @@ -256,18 +270,19 @@ void flush_pages_cb(uv_fs_t* req) pg_cache_replaceQ_insert(ctx, descr); - uv_mutex_lock(&descr->mutex); - descr->flags &= ~(RRD_PAGE_DIRTY | RRD_PAGE_WRITE_PENDING); + rrdeng_page_descr_mutex_lock(ctx, descr); + pg_cache_descr = descr->pg_cache_descr; + pg_cache_descr->flags &= ~(RRD_PAGE_DIRTY | RRD_PAGE_WRITE_PENDING); /* wake up waiters, care no reference being held */ pg_cache_wake_up_waiters_unsafe(descr); - uv_mutex_unlock(&descr->mutex); + rrdeng_page_descr_mutex_unlock(ctx, descr); } if (xt_io_descr->completion) complete(xt_io_descr->completion); cleanup: uv_fs_req_cleanup(req); free(xt_io_descr->buf); - free(xt_io_descr); + freez(xt_io_descr); } /* @@ -283,7 +298,8 @@ static int do_flush_pages(struct rrdengine_worker_config* wc, int force, struct int compressed_size, max_compressed_size = 0; unsigned i, count, size_bytes, pos, real_io_size; uint32_t uncompressed_payload_length, payload_offset; - struct rrdeng_page_cache_descr *descr, *eligible_pages[MAX_PAGES_PER_EXTENT]; + struct rrdeng_page_descr *descr, *eligible_pages[MAX_PAGES_PER_EXTENT]; + struct page_cache_descr *pg_cache_descr; struct extent_io_descriptor *xt_io_descr; void *compressed_buf = NULL; Word_t descr_commit_idx_array[MAX_PAGES_PER_EXTENT]; @@ -311,15 +327,16 @@ static int do_flush_pages(struct rrdengine_worker_config* wc, int force, struct descr = unlikely(NULL == PValue) ? 
NULL : *PValue) { assert(0 != descr->page_length); - uv_mutex_lock(&descr->mutex); - if (!(descr->flags & RRD_PAGE_WRITE_PENDING)) { + rrdeng_page_descr_mutex_lock(ctx, descr); + pg_cache_descr = descr->pg_cache_descr; + if (!(pg_cache_descr->flags & RRD_PAGE_WRITE_PENDING)) { /* care, no reference being held */ - descr->flags |= RRD_PAGE_WRITE_PENDING; + pg_cache_descr->flags |= RRD_PAGE_WRITE_PENDING; uncompressed_payload_length += descr->page_length; descr_commit_idx_array[count] = Index; eligible_pages[count++] = descr; } - uv_mutex_unlock(&descr->mutex); + rrdeng_page_descr_mutex_unlock(ctx, descr); } uv_rwlock_rdunlock(&pg_cache->commited_page_index.lock); @@ -345,9 +362,9 @@ static int do_flush_pages(struct rrdengine_worker_config* wc, int force, struct ret = posix_memalign((void *)&xt_io_descr->buf, RRDFILE_ALIGNMENT, ALIGN_BYTES_CEILING(size_bytes)); if (unlikely(ret)) { fatal("posix_memalign:%s", strerror(ret)); - /* free(xt_io_descr);*/ + /* freez(xt_io_descr);*/ } - (void) memcpy(xt_io_descr->descr_array, eligible_pages, sizeof(struct rrdeng_page_cache_descr *) * count); + (void) memcpy(xt_io_descr->descr_array, eligible_pages, sizeof(struct rrdeng_page_descr *) * count); xt_io_descr->descr_count = count; pos = 0; @@ -378,7 +395,7 @@ static int do_flush_pages(struct rrdengine_worker_config* wc, int force, struct for (i = 0 ; i < count ; ++i) { descr = xt_io_descr->descr_array[i]; /* care, we don't hold the descriptor mutex */ - (void) memcpy(xt_io_descr->buf + pos, descr->page, descr->page_length); + (void) memcpy(xt_io_descr->buf + pos, descr->pg_cache_descr->page, descr->page_length); descr->extent = extent; extent->pages[i] = descr; @@ -397,7 +414,7 @@ static int do_flush_pages(struct rrdengine_worker_config* wc, int force, struct ctx->stats.after_compress_bytes += compressed_size; debug(D_RRDENGINE, "LZ4 compressed %"PRIu32" bytes to %d bytes.", uncompressed_payload_length, compressed_size); (void) memcpy(xt_io_descr->buf + payload_offset, 
compressed_buf, compressed_size); - free(compressed_buf); + freez(compressed_buf); size_bytes = payload_offset + compressed_size + sizeof(*trailer); header->payload_length = compressed_size; break; @@ -435,23 +452,36 @@ static void after_delete_old_data(uv_work_t *req, int status) struct rrdengine_worker_config* wc = &ctx->worker_config; struct rrdengine_datafile *datafile; struct rrdengine_journalfile *journalfile; - unsigned bytes; + unsigned deleted_bytes, journalfile_bytes, datafile_bytes; + int ret; + char path[RRDENG_PATH_MAX]; (void)status; datafile = ctx->datafiles.first; journalfile = datafile->journalfile; - bytes = datafile->pos + journalfile->pos; + datafile_bytes = datafile->pos; + journalfile_bytes = journalfile->pos; + deleted_bytes = 0; + info("Deleting data and journal file pair."); datafile_list_delete(ctx, datafile); - destroy_journal_file(journalfile, datafile); - destroy_data_file(datafile); - info("Deleted data file \""DATAFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL DATAFILE_EXTENSION"\".", - datafile->tier, datafile->fileno); - free(journalfile); - free(datafile); + ret = destroy_journal_file(journalfile, datafile); + if (!ret) { + generate_journalfilepath(datafile, path, sizeof(path)); + info("Deleted journal file \"%s\".", path); + deleted_bytes += journalfile_bytes; + } + ret = destroy_data_file(datafile); + if (!ret) { + generate_datafilepath(datafile, path, sizeof(path)); + info("Deleted data file \"%s\".", path); + deleted_bytes += datafile_bytes; + } + freez(journalfile); + freez(datafile); - ctx->disk_space -= bytes; - info("Reclaimed %u bytes of disk space.", bytes); + ctx->disk_space -= deleted_bytes; + info("Reclaimed %u bytes of disk space.", deleted_bytes); /* unfreeze command processing */ wc->now_deleting.data = NULL; @@ -464,7 +494,7 @@ static void delete_old_data(uv_work_t *req) struct rrdengine_instance *ctx = req->data; struct rrdengine_datafile *datafile; struct extent_info *extent, *next; - struct rrdeng_page_cache_descr 
*descr; + struct rrdeng_page_descr *descr; unsigned count, i; /* Safe to use since it will be deleted after we are done */ @@ -474,10 +504,10 @@ static void delete_old_data(uv_work_t *req) count = extent->number_of_pages; for (i = 0 ; i < count ; ++i) { descr = extent->pages[i]; - pg_cache_punch_hole(ctx, descr); + pg_cache_punch_hole(ctx, descr, 0); } next = extent->next; - free(extent); + freez(extent); } } @@ -487,6 +517,7 @@ void rrdeng_test_quota(struct rrdengine_worker_config* wc) struct rrdengine_datafile *datafile; unsigned current_size, target_size; uint8_t out_of_space, only_one_datafile; + int ret; out_of_space = 0; if (unlikely(ctx->disk_space > ctx->max_disk_space)) { @@ -501,7 +532,10 @@ void rrdeng_test_quota(struct rrdengine_worker_config* wc) if (unlikely(current_size >= target_size || (out_of_space && only_one_datafile))) { /* Finalize data and journal file and create a new pair */ wal_flush_transaction_buffer(wc); - create_new_datafile_pair(ctx, 1, datafile->fileno + 1); + ret = create_new_datafile_pair(ctx, 1, ctx->last_fileno + 1); + if (likely(!ret)) { + ++ctx->last_fileno; + } } if (unlikely(out_of_space)) { /* delete old data */ @@ -509,18 +543,30 @@ void rrdeng_test_quota(struct rrdengine_worker_config* wc) /* already deleting data */ return; } - info("Deleting data file \""DATAFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL DATAFILE_EXTENSION"\".", - ctx->datafiles.first->tier, ctx->datafiles.first->fileno); + if (NULL == ctx->datafiles.first->next) { + error("Cannot delete data file \"%s/"DATAFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL DATAFILE_EXTENSION"\"" + " to reclaim space, there are no other file pairs left.", + ctx->dbfiles_path, ctx->datafiles.first->tier, ctx->datafiles.first->fileno); + return; + } + info("Deleting data file \"%s/"DATAFILE_PREFIX RRDENG_FILE_NUMBER_PRINT_TMPL DATAFILE_EXTENSION"\".", + ctx->dbfiles_path, ctx->datafiles.first->tier, ctx->datafiles.first->fileno); wc->now_deleting.data = ctx; - uv_queue_work(wc->loop, 
&wc->now_deleting, delete_old_data, after_delete_old_data); + assert(0 == uv_queue_work(wc->loop, &wc->now_deleting, delete_old_data, after_delete_old_data)); } } +/* return 0 on success */ int init_rrd_files(struct rrdengine_instance *ctx) { return init_data_files(ctx); } +void finalize_rrd_files(struct rrdengine_instance *ctx) +{ + return finalize_data_files(ctx); +} + void rrdeng_init_cmd_queue(struct rrdengine_worker_config* wc) { wc->cmd_queue.head = wc->cmd_queue.tail = 0; @@ -588,7 +634,6 @@ void async_cb(uv_async_t *handle) void timer_cb(uv_timer_t* handle) { struct rrdengine_worker_config* wc = handle->data; - struct rrdengine_instance *ctx = wc->ctx; uv_stop(handle->loop); uv_update_time(handle->loop); @@ -608,7 +653,7 @@ void timer_cb(uv_timer_t* handle) #ifdef NETDATA_INTERNAL_CHECKS { char buf[4096]; - debug(D_RRDENGINE, "%s", get_rrdeng_statistics(ctx, buf, sizeof(buf))); + debug(D_RRDENGINE, "%s", get_rrdeng_statistics(wc->ctx, buf, sizeof(buf))); } #endif } @@ -623,7 +668,7 @@ void rrdeng_worker(void* arg) struct rrdengine_worker_config* wc = arg; struct rrdengine_instance *ctx = wc->ctx; uv_loop_t* loop; - int shutdown; + int shutdown, ret; enum rrdeng_opcode opcode; uv_timer_t timer_req; struct rrdeng_cmd cmd; @@ -631,22 +676,35 @@ void rrdeng_worker(void* arg) rrdeng_init_cmd_queue(wc); loop = wc->loop = mallocz(sizeof(uv_loop_t)); - uv_loop_init(loop); + ret = uv_loop_init(loop); + if (ret) { + error("uv_loop_init(): %s", uv_strerror(ret)); + goto error_after_loop_init; + } loop->data = wc; - uv_async_init(wc->loop, &wc->async, async_cb); + ret = uv_async_init(wc->loop, &wc->async, async_cb); + if (ret) { + error("uv_async_init(): %s", uv_strerror(ret)); + goto error_after_async_init; + } wc->async.data = wc; wc->now_deleting.data = NULL; /* dirty page flushing timer */ - uv_timer_init(loop, &timer_req); + ret = uv_timer_init(loop, &timer_req); + if (ret) { + error("uv_timer_init(): %s", uv_strerror(ret)); + goto error_after_timer_init; + } 
timer_req.data = wc; + wc->error = 0; /* wake up initialization thread */ complete(&ctx->rrdengine_completion); - uv_timer_start(&timer_req, timer_cb, TIMER_PERIOD_MS, TIMER_PERIOD_MS); + assert(0 == uv_timer_start(&timer_req, timer_cb, TIMER_PERIOD_MS, TIMER_PERIOD_MS)); shutdown = 0; while (shutdown == 0 || uv_loop_alive(loop)) { uv_run(loop, UV_RUN_DEFAULT); @@ -661,12 +719,6 @@ void rrdeng_worker(void* arg) break; case RRDENG_SHUTDOWN: shutdown = 1; - if (unlikely(wc->now_deleting.data)) { - /* postpone shutdown until after deletion */ - info("Postponing shutting RRD engine event loop down until after datafile deletion is finished."); - rrdeng_enq_cmd(wc, &cmd); - break; - } /* * uv_async_send after uv_close does not seem to crash in linux at the moment, * it is however undocumented behaviour and we need to be aware if this becomes @@ -675,10 +727,6 @@ void rrdeng_worker(void* arg) uv_close((uv_handle_t *)&wc->async, NULL); assert(0 == uv_timer_stop(&timer_req)); uv_close((uv_handle_t *)&timer_req, NULL); - info("Shutting down RRD engine event loop."); - while (do_flush_pages(wc, 1, NULL)) { - ; /* Force flushing of all commited pages. */ - } break; case RRDENG_READ_PAGE: do_read_extent(wc, &cmd.read_page.page_cache_descr, 1, 0); @@ -690,14 +738,14 @@ void rrdeng_worker(void* arg) do_commit_transaction(wc, STORE_DATA, NULL); break; case RRDENG_FLUSH_PAGES: { - unsigned total_bytes, bytes_written; + unsigned bytes_written; /* First I/O should be enough to call completion */ bytes_written = do_flush_pages(wc, 1, cmd.completion); - for (total_bytes = bytes_written ; - bytes_written && (total_bytes < DATAFILE_IDEAL_IO_SIZE) ; - total_bytes += bytes_written) { - bytes_written = do_flush_pages(wc, 1, NULL); + if (bytes_written) { + while (do_flush_pages(wc, 1, NULL)) { + ; /* Force flushing of all commited pages. 
*/ + } } break; } @@ -708,6 +756,13 @@ void rrdeng_worker(void* arg) } while (opcode != RRDENG_NOOP); } /* cleanup operations of the event loop */ + if (unlikely(wc->now_deleting.data)) { + info("Postponing shutting RRD engine event loop down until after datafile deletion is finished."); + } + info("Shutting down RRD engine event loop."); + while (do_flush_pages(wc, 1, NULL)) { + ; /* Force flushing of all commited pages. */ + } wal_flush_transaction_buffer(wc); uv_run(loop, UV_RUN_DEFAULT); @@ -716,7 +771,20 @@ void rrdeng_worker(void* arg) uv_cond_destroy(&wc->cmd_cond); /* uv_mutex_destroy(&wc->cmd_mutex); */ assert(0 == uv_loop_close(loop)); - free(loop); + freez(loop); + + return; + +error_after_timer_init: + uv_close((uv_handle_t *)&wc->async, NULL); +error_after_async_init: + assert(0 == uv_loop_close(loop)); +error_after_loop_init: + freez(loop); + + wc->error = UV_EAGAIN; + /* wake up initialization thread */ + complete(&ctx->rrdengine_completion); } @@ -726,19 +794,19 @@ static void basic_functional_test(struct rrdengine_instance *ctx) int i, j, failed_validations; uuid_t uuid[NR_PAGES]; void *buf; - struct rrdeng_page_cache_descr *handle[NR_PAGES]; - char uuid_str[37]; - char backup[NR_PAGES][37 * 100]; /* backup storage for page data verification */ + struct rrdeng_page_descr *handle[NR_PAGES]; + char uuid_str[UUID_STR_LEN]; + char backup[NR_PAGES][UUID_STR_LEN * 100]; /* backup storage for page data verification */ for (i = 0 ; i < NR_PAGES ; ++i) { uuid_generate(uuid[i]); uuid_unparse_lower(uuid[i], uuid_str); // fprintf(stderr, "Generated uuid[%d]=%s\n", i, uuid_str); - buf = rrdeng_create_page(&uuid[i], &handle[i]); + buf = rrdeng_create_page(ctx, &uuid[i], &handle[i]); /* Each page contains 10 times its own UUID stringified */ for (j = 0 ; j < 100 ; ++j) { - strcpy(buf + 37 * j, uuid_str); - strcpy(backup[i] + 37 * j, uuid_str); + strcpy(buf + UUID_STR_LEN * j, uuid_str); + strcpy(backup[i] + UUID_STR_LEN * j, uuid_str); } rrdeng_commit_page(ctx, 
handle[i], (Word_t)i); } @@ -750,7 +818,7 @@ static void basic_functional_test(struct rrdengine_instance *ctx) ++failed_validations; fprintf(stderr, "Page %d was LOST.\n", i); } - if (memcmp(backup[i], buf, 37 * 100)) { + if (memcmp(backup[i], buf, UUID_STR_LEN * 100)) { ++failed_validations; fprintf(stderr, "Page %d data comparison with backup FAILED validation.\n", i); } diff --git a/database/engine/rrdengine.h b/database/engine/rrdengine.h index 141bb9c6..6f6a6f8f 100644 --- a/database/engine/rrdengine.h +++ b/database/engine/rrdengine.h @@ -22,6 +22,7 @@ #include "journalfile.h" #include "rrdengineapi.h" #include "pagecache.h" +#include "rrdenglocking.h" #ifdef NETDATA_RRD_INTERNALS @@ -59,10 +60,10 @@ struct rrdeng_cmd { enum rrdeng_opcode opcode; union { struct rrdeng_read_page { - struct rrdeng_page_cache_descr *page_cache_descr; + struct rrdeng_page_descr *page_cache_descr; } read_page; struct rrdeng_read_extent { - struct rrdeng_page_cache_descr *page_cache_descr[MAX_PAGES_PER_EXTENT]; + struct rrdeng_page_descr *page_cache_descr[MAX_PAGES_PER_EXTENT]; int page_count; } read_extent; struct completion *completion; @@ -85,7 +86,7 @@ struct extent_io_descriptor { struct completion *completion; unsigned descr_count; int release_descr; - struct rrdeng_page_cache_descr *descr_array[MAX_PAGES_PER_EXTENT]; + struct rrdeng_page_descr *descr_array[MAX_PAGES_PER_EXTENT]; Word_t descr_commit_idx_array[MAX_PAGES_PER_EXTENT]; }; @@ -111,6 +112,8 @@ struct rrdengine_worker_config { uv_cond_t cmd_cond; volatile unsigned queue_size; struct rrdeng_cmdqueue cmd_queue; + + int error; }; /* @@ -142,10 +145,19 @@ struct rrdengine_statistics { rrdeng_stats_t datafile_deletions; rrdeng_stats_t journalfile_creations; rrdeng_stats_t journalfile_deletions; + rrdeng_stats_t page_cache_descriptors; + rrdeng_stats_t io_errors; + rrdeng_stats_t fs_errors; }; +/* I/O errors global counter */ +extern rrdeng_stats_t global_io_errors; +/* File-System errors global counter */ +extern 
rrdeng_stats_t global_fs_errors; +/* number of File-Descriptors that have been reserved by dbengine */ +extern rrdeng_stats_t rrdeng_reserved_file_descriptors; + struct rrdengine_instance { - rrdengine_state_t rrdengine_state; struct rrdengine_worker_config worker_config; struct completion rrdengine_completion; struct page_cache pg_cache; @@ -155,6 +167,7 @@ struct rrdengine_instance { char dbfiles_path[FILENAME_MAX+1]; uint64_t disk_space; uint64_t max_disk_space; + unsigned last_fileno; /* newest index of datafile and journalfile */ unsigned long max_cache_pages; unsigned long cache_pages_low_watermark; @@ -163,6 +176,7 @@ struct rrdengine_instance { extern void sanity_check(void); extern int init_rrd_files(struct rrdengine_instance *ctx); +extern void finalize_rrd_files(struct rrdengine_instance *ctx); extern void rrdeng_test_quota(struct rrdengine_worker_config* wc); extern void rrdeng_worker(void* arg); extern void rrdeng_enq_cmd(struct rrdengine_worker_config* wc, struct rrdeng_cmd *cmd); diff --git a/database/engine/rrdengineapi.c b/database/engine/rrdengineapi.c index a4e71155..a87ce6d6 100644 --- a/database/engine/rrdengineapi.c +++ b/database/engine/rrdengineapi.c @@ -41,6 +41,7 @@ void rrdeng_store_metric_init(RRDDIM *rd) handle->descr = NULL; handle->prev_descr = NULL; + handle->unaligned_page = 0; uv_rwlock_rdlock(&pg_cache->metrics_index.lock); PValue = JudyHSGet(pg_cache->metrics_index.JudyHS_array, &temp_id, sizeof(uuid_t)); @@ -54,59 +55,140 @@ void rrdeng_store_metric_init(RRDDIM *rd) PValue = JudyHSIns(&pg_cache->metrics_index.JudyHS_array, &temp_id, sizeof(uuid_t), PJE0); assert(NULL == *PValue); /* TODO: figure out concurrency model */ *PValue = page_index = create_page_index(&temp_id); + page_index->prev = pg_cache->metrics_index.last_page_index; + pg_cache->metrics_index.last_page_index = page_index; uv_rwlock_wrunlock(&pg_cache->metrics_index.lock); } rd->state->rrdeng_uuid = &page_index->id; handle->page_index = page_index; } +/* The page 
must be populated and referenced */ +static int page_has_only_empty_metrics(struct rrdeng_page_descr *descr) +{ + unsigned i; + uint8_t has_only_empty_metrics = 1; + storage_number *page; + + page = descr->pg_cache_descr->page; + for (i = 0 ; i < descr->page_length / sizeof(storage_number); ++i) { + if (SN_EMPTY_SLOT != page[i]) { + has_only_empty_metrics = 0; + break; + } + } + return has_only_empty_metrics; +} + +void rrdeng_store_metric_flush_current_page(RRDDIM *rd) +{ + struct rrdeng_collect_handle *handle; + struct rrdengine_instance *ctx; + struct rrdeng_page_descr *descr; + + handle = &rd->state->handle.rrdeng; + ctx = handle->ctx; + descr = handle->descr; + if (unlikely(NULL == descr)) { + return; + } + if (likely(descr->page_length)) { + int ret, page_is_empty; + +#ifdef NETDATA_INTERNAL_CHECKS + rrd_stat_atomic_add(&ctx->stats.metric_API_producers, -1); +#endif + if (handle->prev_descr) { + /* unpin old second page */ + pg_cache_put(ctx, handle->prev_descr); + } + page_is_empty = page_has_only_empty_metrics(descr); + if (page_is_empty) { + debug(D_RRDENGINE, "Page has empty metrics only, deleting:"); + if (unlikely(debug_flags & D_RRDENGINE)) + print_page_cache_descr(descr); + pg_cache_put(ctx, descr); + pg_cache_punch_hole(ctx, descr, 1); + handle->prev_descr = NULL; + } else { + /* added 1 extra reference to keep 2 dirty pages pinned per metric, expected refcnt = 2 */ + rrdeng_page_descr_mutex_lock(ctx, descr); + ret = pg_cache_try_get_unsafe(descr, 0); + rrdeng_page_descr_mutex_unlock(ctx, descr); + assert (1 == ret); + + rrdeng_commit_page(ctx, descr, handle->page_correlation_id); + handle->prev_descr = descr; + } + } else { + freez(descr->pg_cache_descr->page); + rrdeng_destroy_pg_cache_descr(ctx, descr->pg_cache_descr); + freez(descr); + } + handle->descr = NULL; +} + void rrdeng_store_metric_next(RRDDIM *rd, usec_t point_in_time, storage_number number) { struct rrdeng_collect_handle *handle; struct rrdengine_instance *ctx; struct page_cache 
*pg_cache; - struct rrdeng_page_cache_descr *descr; + struct rrdeng_page_descr *descr; storage_number *page; + uint8_t must_flush_unaligned_page = 0, perfect_page_alignment = 0; handle = &rd->state->handle.rrdeng; ctx = handle->ctx; pg_cache = &ctx->pg_cache; descr = handle->descr; - if (unlikely(NULL == descr || descr->page_length + sizeof(number) > RRDENG_BLOCK_SIZE)) { - if (descr) { - descr->handle = NULL; - if (descr->page_length) { -#ifdef NETDATA_INTERNAL_CHECKS - rrd_stat_atomic_add(&ctx->stats.metric_API_producers, -1); -#endif - /* added 1 extra reference to keep 2 dirty pages pinned per metric, expected refcnt = 2 */ - ++descr->refcnt; - rrdeng_commit_page(ctx, descr, handle->page_correlation_id); - if (handle->prev_descr) { - /* unpin old second page */ - pg_cache_put(handle->prev_descr); - } - handle->prev_descr = descr; - } else { - free(descr->page); - free(descr); - handle->descr = NULL; - } + + if (descr) { + /* Make alignment decisions */ + + if (descr->page_length == rd->rrdset->rrddim_page_alignment) { + /* this is the leading dimension that defines chart alignment */ + perfect_page_alignment = 1; + } + /* is the metric far enough out of alignment with the others? */ + if (unlikely(descr->page_length + sizeof(number) < rd->rrdset->rrddim_page_alignment)) { + handle->unaligned_page = 1; + debug(D_RRDENGINE, "Metric page is not aligned with chart:"); + if (unlikely(debug_flags & D_RRDENGINE)) + print_page_cache_descr(descr); } - page = rrdeng_create_page(&handle->page_index->id, &descr); + if (unlikely(handle->unaligned_page && + /* did the other metrics change page? 
*/ + rd->rrdset->rrddim_page_alignment <= sizeof(number))) { + debug(D_RRDENGINE, "Flushing unaligned metric page."); + must_flush_unaligned_page = 1; + handle->unaligned_page = 0; + } + } + if (unlikely(NULL == descr || + descr->page_length + sizeof(number) > RRDENG_BLOCK_SIZE || + must_flush_unaligned_page)) { + rrdeng_store_metric_flush_current_page(rd); + + page = rrdeng_create_page(ctx, &handle->page_index->id, &descr); assert(page); - handle->prev_descr = handle->descr; + handle->descr = descr; - descr->handle = handle; + uv_rwlock_wrlock(&pg_cache->commited_page_index.lock); handle->page_correlation_id = pg_cache->commited_page_index.latest_corr_id++; uv_rwlock_wrunlock(&pg_cache->commited_page_index.lock); - } - page = descr->page; + if (0 == rd->rrdset->rrddim_page_alignment) { + /* this is the leading dimension that defines chart alignment */ + perfect_page_alignment = 1; + } + } + page = descr->pg_cache_descr->page; page[descr->page_length / sizeof(number)] = number; descr->end_time = point_in_time; descr->page_length += sizeof(number); + if (perfect_page_alignment) + rd->rrdset->rrddim_page_alignment = descr->page_length; if (unlikely(INVALID_TIME == descr->start_time)) { descr->start_time = point_in_time; @@ -126,26 +208,13 @@ void rrdeng_store_metric_finalize(RRDDIM *rd) { struct rrdeng_collect_handle *handle; struct rrdengine_instance *ctx; - struct rrdeng_page_cache_descr *descr; handle = &rd->state->handle.rrdeng; ctx = handle->ctx; - descr = handle->descr; - if (descr) { - descr->handle = NULL; - if (descr->page_length) { -#ifdef NETDATA_INTERNAL_CHECKS - rrd_stat_atomic_add(&ctx->stats.metric_API_producers, -1); -#endif - rrdeng_commit_page(ctx, descr, handle->page_correlation_id); - if (handle->prev_descr) { - /* unpin old second page */ - pg_cache_put(handle->prev_descr); - } - } else { - free(descr->page); - free(descr); - } + rrdeng_store_metric_flush_current_page(rd); + if (handle->prev_descr) { + /* unpin old second page */ + 
pg_cache_put(ctx, handle->prev_descr); } } @@ -174,7 +243,7 @@ storage_number rrdeng_load_metric_next(struct rrddim_query_handle *rrdimm_handle { struct rrdeng_query_handle *handle; struct rrdengine_instance *ctx; - struct rrdeng_page_cache_descr *descr; + struct rrdeng_page_descr *descr; storage_number *page, ret; unsigned position; usec_t point_in_time; @@ -198,7 +267,7 @@ storage_number rrdeng_load_metric_next(struct rrddim_query_handle *rrdimm_handle #ifdef NETDATA_INTERNAL_CHECKS rrd_stat_atomic_add(&ctx->stats.metric_API_consumers, -1); #endif - pg_cache_put(descr); + pg_cache_put(ctx, descr); handle->descr = NULL; } descr = pg_cache_lookup(ctx, handle->page_index, &handle->page_index->id, point_in_time); @@ -216,7 +285,7 @@ storage_number rrdeng_load_metric_next(struct rrddim_query_handle *rrdimm_handle ret = SN_EMPTY_SLOT; goto out; } - page = descr->page; + page = descr->pg_cache_descr->page; if (unlikely(descr->start_time == descr->end_time)) { ret = page[0]; goto out; @@ -248,7 +317,7 @@ void rrdeng_load_metric_finalize(struct rrddim_query_handle *rrdimm_handle) { struct rrdeng_query_handle *handle; struct rrdengine_instance *ctx; - struct rrdeng_page_cache_descr *descr; + struct rrdeng_page_descr *descr; handle = &rrdimm_handle->rrdeng; ctx = handle->ctx; @@ -257,7 +326,7 @@ void rrdeng_load_metric_finalize(struct rrddim_query_handle *rrdimm_handle) #ifdef NETDATA_INTERNAL_CHECKS rrd_stat_atomic_add(&ctx->stats.metric_API_consumers, -1); #endif - pg_cache_put(descr); + pg_cache_put(ctx, descr); } } @@ -283,30 +352,32 @@ time_t rrdeng_metric_oldest_time(RRDDIM *rd) } /* Also gets a reference for the page */ -void *rrdeng_create_page(uuid_t *id, struct rrdeng_page_cache_descr **ret_descr) +void *rrdeng_create_page(struct rrdengine_instance *ctx, uuid_t *id, struct rrdeng_page_descr **ret_descr) { - struct rrdeng_page_cache_descr *descr; + struct rrdeng_page_descr *descr; + struct page_cache_descr *pg_cache_descr; void *page; - int ret; - /* TODO: check 
maximum number of pages in page cache limit */ - page = mallocz(RRDENG_BLOCK_SIZE); /*TODO: add page size */ descr = pg_cache_create_descr(); - descr->page = page; descr->id = id; /* TODO: add page type: metric, log, something? */ - descr->flags = RRD_PAGE_DIRTY /*| RRD_PAGE_LOCKED */ | RRD_PAGE_POPULATED /* | BEING_COLLECTED */; - descr->refcnt = 1; - - debug(D_RRDENGINE, "-----------------\nCreated new page:\n-----------------"); - if(unlikely(debug_flags & D_RRDENGINE)) + page = mallocz(RRDENG_BLOCK_SIZE); /*TODO: add page size */ + rrdeng_page_descr_mutex_lock(ctx, descr); + pg_cache_descr = descr->pg_cache_descr; + pg_cache_descr->page = page; + pg_cache_descr->flags = RRD_PAGE_DIRTY /*| RRD_PAGE_LOCKED */ | RRD_PAGE_POPULATED /* | BEING_COLLECTED */; + pg_cache_descr->refcnt = 1; + + debug(D_RRDENGINE, "Created new page:"); + if (unlikely(debug_flags & D_RRDENGINE)) print_page_cache_descr(descr); + rrdeng_page_descr_mutex_unlock(ctx, descr); *ret_descr = descr; return page; } /* The page must not be empty */ -void rrdeng_commit_page(struct rrdengine_instance *ctx, struct rrdeng_page_cache_descr *descr, +void rrdeng_commit_page(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr, Word_t page_correlation_id) { struct page_cache *pg_cache = &ctx->pg_cache; @@ -324,15 +395,16 @@ void rrdeng_commit_page(struct rrdengine_instance *ctx, struct rrdeng_page_cache ++pg_cache->commited_page_index.nr_commited_pages; uv_rwlock_wrunlock(&pg_cache->commited_page_index.lock); - pg_cache_put(descr); + pg_cache_put(ctx, descr); } /* Gets a reference for the page */ void *rrdeng_get_latest_page(struct rrdengine_instance *ctx, uuid_t *id, void **handle) { - struct rrdeng_page_cache_descr *descr; + struct rrdeng_page_descr *descr; + struct page_cache_descr *pg_cache_descr; - debug(D_RRDENGINE, "----------------------\nReading existing page:\n----------------------"); + debug(D_RRDENGINE, "Reading existing page:"); descr = pg_cache_lookup(ctx, NULL, id, INVALID_TIME); 
if (NULL == descr) { *handle = NULL; @@ -340,16 +412,18 @@ void *rrdeng_get_latest_page(struct rrdengine_instance *ctx, uuid_t *id, void ** return NULL; } *handle = descr; + pg_cache_descr = descr->pg_cache_descr; - return descr->page; + return pg_cache_descr->page; } /* Gets a reference for the page */ void *rrdeng_get_page(struct rrdengine_instance *ctx, uuid_t *id, usec_t point_in_time, void **handle) { - struct rrdeng_page_cache_descr *descr; + struct rrdeng_page_descr *descr; + struct page_cache_descr *pg_cache_descr; - debug(D_RRDENGINE, "----------------------\nReading existing page:\n----------------------"); + debug(D_RRDENGINE, "Reading existing page:"); descr = pg_cache_lookup(ctx, NULL, id, point_in_time); if (NULL == descr) { *handle = NULL; @@ -357,11 +431,18 @@ void *rrdeng_get_page(struct rrdengine_instance *ctx, uuid_t *id, usec_t point_i return NULL; } *handle = descr; + pg_cache_descr = descr->pg_cache_descr; - return descr->page; + return pg_cache_descr->page; } -void rrdeng_get_27_statistics(struct rrdengine_instance *ctx, unsigned long long *array) +/* + * Gathers Database Engine statistics. + * Careful when modifying this function. + * You must not change the indices of the statistics or user code will break. + * You must not exceed RRDENG_NR_STATS or it will crash. 
+ */ +void rrdeng_get_33_statistics(struct rrdengine_instance *ctx, unsigned long long *array) { struct page_cache *pg_cache = &ctx->pg_cache; @@ -392,24 +473,46 @@ void rrdeng_get_27_statistics(struct rrdengine_instance *ctx, unsigned long long array[24] = (uint64_t)ctx->stats.datafile_deletions; array[25] = (uint64_t)ctx->stats.journalfile_creations; array[26] = (uint64_t)ctx->stats.journalfile_deletions; + array[27] = (uint64_t)ctx->stats.page_cache_descriptors; + array[28] = (uint64_t)ctx->stats.io_errors; + array[29] = (uint64_t)ctx->stats.fs_errors; + array[30] = (uint64_t)global_io_errors; + array[31] = (uint64_t)global_fs_errors; + array[32] = (uint64_t)rrdeng_reserved_file_descriptors; + assert(RRDENG_NR_STATS == 33); } /* Releases reference to page */ void rrdeng_put_page(struct rrdengine_instance *ctx, void *handle) { (void)ctx; - pg_cache_put((struct rrdeng_page_cache_descr *)handle); + pg_cache_put(ctx, (struct rrdeng_page_descr *)handle); } /* - * Returns 0 on success, 1 on error + * Returns 0 on success, negative on error */ int rrdeng_init(struct rrdengine_instance **ctxp, char *dbfiles_path, unsigned page_cache_mb, unsigned disk_space_mb) { struct rrdengine_instance *ctx; int error; + uint32_t max_open_files; sanity_check(); + + max_open_files = rlimit_nofile.rlim_cur / 4; + + /* reserve RRDENG_FD_BUDGET_PER_INSTANCE file descriptors for this instance */ + rrd_stat_atomic_add(&rrdeng_reserved_file_descriptors, RRDENG_FD_BUDGET_PER_INSTANCE); + if (rrdeng_reserved_file_descriptors > max_open_files) { + error("Exceeded the budget of available file descriptors (%u/%u), cannot create new dbengine instance.", + (unsigned)rrdeng_reserved_file_descriptors, (unsigned)max_open_files); + + rrd_stat_atomic_add(&global_fs_errors, 1); + rrd_stat_atomic_add(&rrdeng_reserved_file_descriptors, -RRDENG_FD_BUDGET_PER_INSTANCE); + return UV_EMFILE; + } + if (NULL == ctxp) { /* for testing */ ctx = &default_global_ctx; @@ -417,10 +520,6 @@ int rrdeng_init(struct 
rrdengine_instance **ctxp, char *dbfiles_path, unsigned p } else { *ctxp = ctx = callocz(1, sizeof(*ctx)); } - if (ctx->rrdengine_state != RRDENGINE_STATUS_UNINITIALIZED) { - return 1; - } - ctx->rrdengine_state = RRDENGINE_STATUS_INITIALIZING; ctx->global_compress_alg = RRD_LZ4; if (page_cache_mb < RRDENG_MIN_PAGE_CACHE_SIZE_MB) page_cache_mb = RRDENG_MIN_PAGE_CACHE_SIZE_MB; @@ -439,11 +538,7 @@ int rrdeng_init(struct rrdengine_instance **ctxp, char *dbfiles_path, unsigned p init_commit_log(ctx); error = init_rrd_files(ctx); if (error) { - ctx->rrdengine_state = RRDENGINE_STATUS_UNINITIALIZED; - if (ctx != &default_global_ctx) { - freez(ctx); - } - return 1; + goto error_after_init_rrd_files; } init_completion(&ctx->rrdengine_completion); @@ -451,9 +546,21 @@ int rrdeng_init(struct rrdengine_instance **ctxp, char *dbfiles_path, unsigned p /* wait for worker thread to initialize */ wait_for_completion(&ctx->rrdengine_completion); destroy_completion(&ctx->rrdengine_completion); - - ctx->rrdengine_state = RRDENGINE_STATUS_INITIALIZED; + if (ctx->worker_config.error) { + goto error_after_rrdeng_worker; + } return 0; + +error_after_rrdeng_worker: + finalize_rrd_files(ctx); +error_after_init_rrd_files: + free_page_cache(ctx); + if (ctx != &default_global_ctx) { + freez(ctx); + *ctxp = NULL; + } + rrd_stat_atomic_add(&rrdeng_reserved_file_descriptors, -RRDENG_FD_BUDGET_PER_INSTANCE); + return UV_EIO; } /* @@ -464,10 +571,6 @@ int rrdeng_exit(struct rrdengine_instance *ctx) struct rrdeng_cmd cmd; if (NULL == ctx) { - /* TODO: move to per host basis */ - ctx = &default_global_ctx; - } - if (ctx->rrdengine_state != RRDENGINE_STATUS_INITIALIZED) { return 1; } @@ -477,8 +580,12 @@ int rrdeng_exit(struct rrdengine_instance *ctx) assert(0 == uv_thread_join(&ctx->worker_config.thread)); + finalize_rrd_files(ctx); + free_page_cache(ctx); + if (ctx != &default_global_ctx) { freez(ctx); } + rrd_stat_atomic_add(&rrdeng_reserved_file_descriptors, -RRDENG_FD_BUDGET_PER_INSTANCE); 
return 0; }
\ No newline at end of file diff --git a/database/engine/rrdengineapi.h b/database/engine/rrdengineapi.h index e76629a4..e52aabcb 100644 --- a/database/engine/rrdengineapi.h +++ b/database/engine/rrdengineapi.h @@ -7,16 +7,22 @@ #define RRDENG_MIN_PAGE_CACHE_SIZE_MB (32) #define RRDENG_MIN_DISK_SPACE_MB (256) + +#define RRDENG_NR_STATS (33) + +#define RRDENG_FD_BUDGET_PER_INSTANCE (50) + extern int default_rrdeng_page_cache_mb; extern int default_rrdeng_disk_quota_mb; -extern void *rrdeng_create_page(uuid_t *id, struct rrdeng_page_cache_descr **ret_descr); -extern void rrdeng_commit_page(struct rrdengine_instance *ctx, struct rrdeng_page_cache_descr *descr, +extern void *rrdeng_create_page(struct rrdengine_instance *ctx, uuid_t *id, struct rrdeng_page_descr **ret_descr); +extern void rrdeng_commit_page(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr, Word_t page_correlation_id); extern void *rrdeng_get_latest_page(struct rrdengine_instance *ctx, uuid_t *id, void **handle); extern void *rrdeng_get_page(struct rrdengine_instance *ctx, uuid_t *id, usec_t point_in_time, void **handle); extern void rrdeng_put_page(struct rrdengine_instance *ctx, void *handle); extern void rrdeng_store_metric_init(RRDDIM *rd); +extern void rrdeng_store_metric_flush_current_page(RRDDIM *rd); extern void rrdeng_store_metric_next(RRDDIM *rd, usec_t point_in_time, storage_number number); extern void rrdeng_store_metric_finalize(RRDDIM *rd); extern void rrdeng_load_metric_init(RRDDIM *rd, struct rrddim_query_handle *rrdimm_handle, @@ -26,7 +32,7 @@ extern int rrdeng_load_metric_is_finished(struct rrddim_query_handle *rrdimm_han extern void rrdeng_load_metric_finalize(struct rrddim_query_handle *rrdimm_handle); extern time_t rrdeng_metric_latest_time(RRDDIM *rd); extern time_t rrdeng_metric_oldest_time(RRDDIM *rd); -extern void rrdeng_get_27_statistics(struct rrdengine_instance *ctx, unsigned long long *array); +extern void rrdeng_get_33_statistics(struct rrdengine_instance 
*ctx, unsigned long long *array); /* must call once before using anything */ extern int rrdeng_init(struct rrdengine_instance **ctxp, char *dbfiles_path, unsigned page_cache_mb, diff --git a/database/engine/rrdenginelib.c b/database/engine/rrdenginelib.c index 25f57ba1..96504b27 100644 --- a/database/engine/rrdenginelib.c +++ b/database/engine/rrdenginelib.c @@ -1,25 +1,52 @@ // SPDX-License-Identifier: GPL-3.0-or-later #include "rrdengine.h" -void print_page_cache_descr(struct rrdeng_page_cache_descr *page_cache_descr) +#define BUFSIZE (512) + +/* Caller must hold descriptor lock */ +void print_page_cache_descr(struct rrdeng_page_descr *descr) { - char uuid_str[37]; - char str[512]; + struct page_cache_descr *pg_cache_descr = descr->pg_cache_descr; + char uuid_str[UUID_STR_LEN]; + char str[BUFSIZE]; int pos = 0; - uuid_unparse_lower(*page_cache_descr->id, uuid_str); - pos += snprintfz(str, 512 - pos, "page(%p) id=%s\n" + uuid_unparse_lower(*descr->id, uuid_str); + pos += snprintfz(str, BUFSIZE - pos, "page(%p) id=%s\n" "--->len:%"PRIu32" time:%"PRIu64"->%"PRIu64" xt_offset:", - page_cache_descr->page, uuid_str, - page_cache_descr->page_length, - (uint64_t)page_cache_descr->start_time, - (uint64_t)page_cache_descr->end_time); - if (!page_cache_descr->extent) { - pos += snprintfz(str + pos, 512 - pos, "N/A"); + pg_cache_descr->page, uuid_str, + descr->page_length, + (uint64_t)descr->start_time, + (uint64_t)descr->end_time); + if (!descr->extent) { + pos += snprintfz(str + pos, BUFSIZE - pos, "N/A"); + } else { + pos += snprintfz(str + pos, BUFSIZE - pos, "%"PRIu64, descr->extent->offset); + } + + snprintfz(str + pos, BUFSIZE - pos, " flags:0x%2.2lX refcnt:%u\n\n", pg_cache_descr->flags, pg_cache_descr->refcnt); + debug(D_RRDENGINE, "%s", str); +} + +void print_page_descr(struct rrdeng_page_descr *descr) +{ + char uuid_str[UUID_STR_LEN]; + char str[BUFSIZE]; + int pos = 0; + + uuid_unparse_lower(*descr->id, uuid_str); + pos += snprintfz(str, BUFSIZE - pos, "id=%s\n" 
+ "--->len:%"PRIu32" time:%"PRIu64"->%"PRIu64" xt_offset:", + uuid_str, + descr->page_length, + (uint64_t)descr->start_time, + (uint64_t)descr->end_time); + if (!descr->extent) { + pos += snprintfz(str + pos, BUFSIZE - pos, "N/A"); } else { - pos += snprintfz(str + pos, 512 - pos, "%"PRIu64, page_cache_descr->extent->offset); + pos += snprintfz(str + pos, BUFSIZE - pos, "%"PRIu64, descr->extent->offset); } - snprintfz(str + pos, 512 - pos, " flags:0x%2.2lX refcnt:%u\n\n", page_cache_descr->flags, page_cache_descr->refcnt); + snprintfz(str + pos, BUFSIZE - pos, "\n\n"); fputs(str, stderr); } @@ -51,6 +78,48 @@ int check_file_properties(uv_file file, uint64_t *file_size, size_t min_size) return 0; } +/* + * Tries to open a file in direct I/O mode, falls back to buffered mode if not possible. + * Returns UV error number that is < 0 on failure. + * On success sets (*file) to be the uv_file that was opened. + */ +int open_file_direct_io(char *path, int flags, uv_file *file) +{ + uv_fs_t req; + int fd, current_flags, direct; + + for (direct = 1 ; direct >= 0 ; --direct) { +#ifdef __APPLE__ + /* Apple OS does not support O_DIRECT */ + direct = 0; +#endif + current_flags = flags; + if (direct) { + current_flags |= O_DIRECT; + } + fd = uv_fs_open(NULL, &req, path, current_flags, S_IRUSR | S_IWUSR, NULL); + if (fd < 0) { + if ((direct) && (UV_EINVAL == fd)) { + error("File \"%s\" does not support direct I/O, falling back to buffered I/O.", path); + } else { + error("Failed to open file \"%s\".", path); + --direct; /* break the loop */ + } + } else { + assert(req.result >= 0); + *file = req.result; +#ifdef __APPLE__ + info("Disabling OS X caching for file \"%s\".", path); + fcntl(fd, F_NOCACHE, 1); +#endif + --direct; /* break the loop */ + } + uv_fs_req_cleanup(&req); + } + + return fd; +} + char *get_rrdeng_statistics(struct rrdengine_instance *ctx, char *str, size_t size) { struct page_cache *pg_cache; @@ -60,6 +129,7 @@ char *get_rrdeng_statistics(struct 
rrdengine_instance *ctx, char *str, size_t si "metric_API_producers: %ld\n" "metric_API_consumers: %ld\n" "page_cache_total_pages: %ld\n" + "page_cache_descriptors: %ld\n" "page_cache_populated_pages: %ld\n" "page_cache_commited_pages: %ld\n" "page_cache_insertions: %ld\n" @@ -87,6 +157,7 @@ char *get_rrdeng_statistics(struct rrdengine_instance *ctx, char *str, size_t si (long)ctx->stats.metric_API_producers, (long)ctx->stats.metric_API_consumers, (long)pg_cache->page_descriptors, + (long)ctx->stats.page_cache_descriptors, (long)pg_cache->populated_pages, (long)pg_cache->commited_page_index.nr_commited_pages, (long)ctx->stats.pg_cache_insertions, diff --git a/database/engine/rrdenginelib.h b/database/engine/rrdenginelib.h index bb6f072b..36d414e8 100644 --- a/database/engine/rrdenginelib.h +++ b/database/engine/rrdenginelib.h @@ -6,11 +6,17 @@ #include "rrdengine.h" /* Forward declarations */ -struct rrdeng_page_cache_descr; +struct rrdeng_page_descr; #define STR_HELPER(x) #x #define STR(x) STR_HELPER(x) +#define BITS_PER_ULONG (sizeof(unsigned long) * 8) + +#ifndef UUID_STR_LEN +#define UUID_STR_LEN (37) +#endif + /* Taken from linux kernel */ #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) @@ -25,6 +31,15 @@ typedef uintptr_t rrdeng_stats_t; #define rrd_stat_atomic_add(p, n) do {(void) __sync_fetch_and_add(p, n);} while(0) #endif +#define RRDENG_PATH_MAX (4096) + +/* returns old *ptr value */ +static inline unsigned long ulong_compare_and_swap(volatile unsigned long *ptr, + unsigned long oldval, unsigned long newval) +{ + return __sync_val_compare_and_swap(ptr, oldval, newval); +} + #ifndef O_DIRECT /* Workaround for OS X */ #define O_DIRECT (0) @@ -77,8 +92,10 @@ static inline void crc32set(void *crcp, uLong crc) *(uint32_t *)crcp = crc; } -extern void print_page_cache_descr(struct rrdeng_page_cache_descr *page_cache_descr); +extern void print_page_cache_descr(struct rrdeng_page_descr *page_cache_descr); +extern void 
print_page_descr(struct rrdeng_page_descr *descr); extern int check_file_properties(uv_file file, uint64_t *file_size, size_t min_size); +extern int open_file_direct_io(char *path, int flags, uv_file *file); extern char *get_rrdeng_statistics(struct rrdengine_instance *ctx, char *str, size_t size); #endif /* NETDATA_RRDENGINELIB_H */
\ No newline at end of file diff --git a/database/engine/rrdenglocking.c b/database/engine/rrdenglocking.c new file mode 100644 index 00000000..0eb9019b --- /dev/null +++ b/database/engine/rrdenglocking.c @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: GPL-3.0-or-later +#include "rrdengine.h" + +struct page_cache_descr *rrdeng_create_pg_cache_descr(struct rrdengine_instance *ctx) +{ + struct page_cache_descr *pg_cache_descr; + + pg_cache_descr = mallocz(sizeof(*pg_cache_descr)); + rrd_stat_atomic_add(&ctx->stats.page_cache_descriptors, 1); + pg_cache_descr->page = NULL; + pg_cache_descr->flags = 0; + pg_cache_descr->prev = pg_cache_descr->next = NULL; + pg_cache_descr->refcnt = 0; + pg_cache_descr->waiters = 0; + assert(0 == uv_cond_init(&pg_cache_descr->cond)); + assert(0 == uv_mutex_init(&pg_cache_descr->mutex)); + + return pg_cache_descr; +} + +void rrdeng_destroy_pg_cache_descr(struct rrdengine_instance *ctx, struct page_cache_descr *pg_cache_descr) +{ + uv_cond_destroy(&pg_cache_descr->cond); + uv_mutex_destroy(&pg_cache_descr->mutex); + freez(pg_cache_descr); + rrd_stat_atomic_add(&ctx->stats.page_cache_descriptors, -1); +} + +/* also allocates page cache descriptor if missing */ +void rrdeng_page_descr_mutex_lock(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr) +{ + unsigned long old_state, old_users, new_state, ret_state; + struct page_cache_descr *pg_cache_descr = NULL; + uint8_t we_locked; + + we_locked = 0; + while (1) { /* spin */ + old_state = descr->pg_cache_descr_state; + old_users = old_state >> PG_CACHE_DESCR_SHIFT; + + if (unlikely(we_locked)) { + assert(old_state & PG_CACHE_DESCR_LOCKED); + new_state = (1 << PG_CACHE_DESCR_SHIFT) | PG_CACHE_DESCR_ALLOCATED; + ret_state = ulong_compare_and_swap(&descr->pg_cache_descr_state, old_state, new_state); + if (old_state == ret_state) { + /* success */ + break; + } + continue; /* spin */ + } + if (old_state & PG_CACHE_DESCR_LOCKED) { + assert(0 == old_users); + continue; /* spin */ + } + if 
(0 == old_state) { + /* no page cache descriptor has been allocated */ + + if (NULL == pg_cache_descr) { + pg_cache_descr = rrdeng_create_pg_cache_descr(ctx); + } + new_state = PG_CACHE_DESCR_LOCKED; + ret_state = ulong_compare_and_swap(&descr->pg_cache_descr_state, 0, new_state); + if (0 == ret_state) { + we_locked = 1; + descr->pg_cache_descr = pg_cache_descr; + pg_cache_descr->descr = descr; + pg_cache_descr = NULL; /* make sure we don't free pg_cache_descr */ + /* retry */ + continue; + } + continue; /* spin */ + } + /* page cache descriptor is already allocated */ + assert(old_state & PG_CACHE_DESCR_ALLOCATED); + + new_state = (old_users + 1) << PG_CACHE_DESCR_SHIFT; + new_state |= old_state & PG_CACHE_DESCR_FLAGS_MASK; + + ret_state = ulong_compare_and_swap(&descr->pg_cache_descr_state, old_state, new_state); + if (old_state == ret_state) { + /* success */ + break; + } + /* spin */ + } + + if (pg_cache_descr) { + rrdeng_destroy_pg_cache_descr(ctx, pg_cache_descr); + } + pg_cache_descr = descr->pg_cache_descr; + uv_mutex_lock(&pg_cache_descr->mutex); +} + +void rrdeng_page_descr_mutex_unlock(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr) +{ + unsigned long old_state, new_state, ret_state, old_users; + struct page_cache_descr *pg_cache_descr; + uint8_t we_locked; + + uv_mutex_unlock(&descr->pg_cache_descr->mutex); + + we_locked = 0; + while (1) { /* spin */ + old_state = descr->pg_cache_descr_state; + old_users = old_state >> PG_CACHE_DESCR_SHIFT; + + if (unlikely(we_locked)) { + assert(0 == old_users); + + ret_state = ulong_compare_and_swap(&descr->pg_cache_descr_state, old_state, 0); + if (old_state == ret_state) { + /* success */ + break; + } + continue; /* spin */ + } + if (old_state & PG_CACHE_DESCR_LOCKED) { + assert(0 == old_users); + continue; /* spin */ + } + assert(old_state & PG_CACHE_DESCR_ALLOCATED); + pg_cache_descr = descr->pg_cache_descr; + /* caller is the only page cache descriptor user and there are no pending references on 
the page */ + if ((old_state & PG_CACHE_DESCR_DESTROY) && (1 == old_users) && + !pg_cache_descr->flags && !pg_cache_descr->refcnt) { + new_state = PG_CACHE_DESCR_LOCKED; + ret_state = ulong_compare_and_swap(&descr->pg_cache_descr_state, old_state, new_state); + if (old_state == ret_state) { + we_locked = 1; + rrdeng_destroy_pg_cache_descr(ctx, pg_cache_descr); + /* retry */ + continue; + } + continue; /* spin */ + } + assert(old_users > 0); + new_state = (old_users - 1) << PG_CACHE_DESCR_SHIFT; + new_state |= old_state & PG_CACHE_DESCR_FLAGS_MASK; + + ret_state = ulong_compare_and_swap(&descr->pg_cache_descr_state, old_state, new_state); + if (old_state == ret_state) { + /* success */ + break; + } + /* spin */ + } + +} + +/* + * Tries to deallocate page cache descriptor. If it fails, it postpones deallocation by setting the + * PG_CACHE_DESCR_DESTROY flag which will be eventually cleared by a different context after doing + * the deallocation. + */ +void rrdeng_try_deallocate_pg_cache_descr(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr) +{ + unsigned long old_state, new_state, ret_state, old_users; + struct page_cache_descr *pg_cache_descr; + uint8_t just_locked, we_freed, must_unlock; + + just_locked = 0; + we_freed = 0; + must_unlock = 0; + while (1) { /* spin */ + old_state = descr->pg_cache_descr_state; + old_users = old_state >> PG_CACHE_DESCR_SHIFT; + + if (unlikely(just_locked)) { + assert(0 == old_users); + + must_unlock = 1; + just_locked = 0; + /* Try deallocate if there are no pending references on the page */ + if (!pg_cache_descr->flags && !pg_cache_descr->refcnt) { + rrdeng_destroy_pg_cache_descr(ctx, pg_cache_descr); + we_freed = 1; + /* success */ + continue; + } + continue; /* spin */ + } + if (unlikely(must_unlock)) { + assert(0 == old_users); + + if (we_freed) { + /* success */ + new_state = 0; + } else { + new_state = old_state | PG_CACHE_DESCR_DESTROY; + new_state &= ~PG_CACHE_DESCR_LOCKED; + } + ret_state = 
ulong_compare_and_swap(&descr->pg_cache_descr_state, old_state, new_state); + if (old_state == ret_state) { + /* unlocked */ + return; + } + continue; /* spin */ + } + if (!(old_state & PG_CACHE_DESCR_ALLOCATED)) { + /* don't do anything */ + return; + } + if (old_state & PG_CACHE_DESCR_LOCKED) { + assert(0 == old_users); + continue; /* spin */ + } + pg_cache_descr = descr->pg_cache_descr; + /* caller is the only page cache descriptor user */ + if (0 == old_users) { + new_state = old_state | PG_CACHE_DESCR_LOCKED; + ret_state = ulong_compare_and_swap(&descr->pg_cache_descr_state, old_state, new_state); + if (old_state == ret_state) { + just_locked = 1; + /* retry */ + continue; + } + continue; /* spin */ + } + if (old_state & PG_CACHE_DESCR_DESTROY) { + /* don't do anything */ + return; + } + /* plant PG_CACHE_DESCR_DESTROY so that other contexts eventually free the page cache descriptor */ + new_state = old_state | PG_CACHE_DESCR_DESTROY; + + ret_state = ulong_compare_and_swap(&descr->pg_cache_descr_state, old_state, new_state); + if (old_state == ret_state) { + /* success */ + return; + } + /* spin */ + } +}
\ No newline at end of file diff --git a/database/engine/rrdenglocking.h b/database/engine/rrdenglocking.h new file mode 100644 index 00000000..127ddc90 --- /dev/null +++ b/database/engine/rrdenglocking.h @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef NETDATA_RRDENGLOCKING_H +#define NETDATA_RRDENGLOCKING_H + +#include "rrdengine.h" + +/* Forward declarations */ +struct page_cache_descr; + +extern struct page_cache_descr *rrdeng_create_pg_cache_descr(struct rrdengine_instance *ctx); +extern void rrdeng_destroy_pg_cache_descr(struct rrdengine_instance *ctx, struct page_cache_descr *pg_cache_descr); +extern void rrdeng_page_descr_mutex_lock(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr); +extern void rrdeng_page_descr_mutex_unlock(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr); +extern void rrdeng_try_deallocate_pg_cache_descr(struct rrdengine_instance *ctx, struct rrdeng_page_descr *descr); + +#endif /* NETDATA_RRDENGLOCKING_H */
\ No newline at end of file diff --git a/database/rrd.c b/database/rrd.c index 2457cac0..31ad3f07 100644 --- a/database/rrd.c +++ b/database/rrd.c @@ -132,7 +132,6 @@ const char *rrdset_type_name(RRDSET_TYPE chart_type) { } } - // ---------------------------------------------------------------------------- // RRD - cache directory @@ -146,8 +145,7 @@ char *rrdset_cache_dir(RRDHOST *host, const char *id, const char *config_section snprintfz(n, FILENAME_MAX, "%s/%s", host->cache_dir, b); ret = config_get(config_section, "cache directory", n); - if(host->rrd_memory_mode == RRD_MEMORY_MODE_MAP || host->rrd_memory_mode == RRD_MEMORY_MODE_SAVE || - host->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) { + if(host->rrd_memory_mode == RRD_MEMORY_MODE_MAP || host->rrd_memory_mode == RRD_MEMORY_MODE_SAVE) { int r = mkdir(ret, 0775); if(r != 0 && errno != EEXIST) error("Cannot create directory '%s'", ret); @@ -155,3 +153,4 @@ char *rrdset_cache_dir(RRDHOST *host, const char *id, const char *config_section return ret; } + diff --git a/database/rrd.h b/database/rrd.h index 3f57b903..5b09c2dd 100644 --- a/database/rrd.h +++ b/database/rrd.h @@ -17,7 +17,7 @@ typedef struct alarm_entry ALARM_ENTRY; // forward declarations struct rrddim_volatile; #ifdef ENABLE_DBENGINE -struct rrdeng_page_cache_descr; +struct rrdeng_page_descr; struct rrdengine_instance; struct pg_cache_page_index; #endif @@ -246,10 +246,12 @@ union rrddim_collect_handle { } slotted; // state the legacy code uses #ifdef ENABLE_DBENGINE struct rrdeng_collect_handle { - struct rrdeng_page_cache_descr *descr, *prev_descr; + struct rrdeng_page_descr *descr, *prev_descr; unsigned long page_correlation_id; struct rrdengine_instance *ctx; struct pg_cache_page_index *page_index; + // set to 1 when this dimension is not page aligned with the other dimensions in the chart + uint8_t unaligned_page; } rrdeng; // state the database engine uses #endif }; @@ -268,7 +270,7 @@ struct rrddim_query_handle { } slotted; // state the 
legacy code uses #ifdef ENABLE_DBENGINE struct rrdeng_query_handle { - struct rrdeng_page_cache_descr *descr; + struct rrdeng_page_descr *descr; struct rrdengine_instance *ctx; struct pg_cache_page_index *page_index; time_t now; //TODO: remove now to implement next point iteration @@ -351,7 +353,7 @@ typedef enum rrdset_flags { RRDSET_FLAG_UPSTREAM_EXPOSED = 1 << 8, // if set, we have sent this chart definition to netdata master (streaming) RRDSET_FLAG_STORE_FIRST = 1 << 9, // if set, do not eliminate the first collection during interpolation RRDSET_FLAG_HETEROGENEOUS = 1 << 10, // if set, the chart is not homogeneous (dimensions in it have multiple algorithms, multipliers or dividers) - RRDSET_FLAG_HOMEGENEOUS_CHECK = 1 << 11, // if set, the chart should be checked to determine if the dimensions as homogeneous + RRDSET_FLAG_HOMOGENEOUS_CHECK = 1 << 11, // if set, the chart should be checked to determine if the dimensions are homogeneous RRDSET_FLAG_HIDDEN = 1 << 12, // if set, do not show this chart on the dashboard, but use it for backends RRDSET_FLAG_SYNC_CLOCK = 1 << 13, // if set, microseconds on next data collection will be ignored (the chart will be synced to now) RRDSET_FLAG_OBSOLETE_DIMENSIONS = 1 << 14 // this is marked by the collector/module when a chart has obsolete dimensions @@ -431,7 +433,9 @@ struct rrdset { char *plugin_name; // the name of the plugin that generated this char *module_name; // the name of the plugin module that generated this - size_t unused[6]; + size_t unused[5]; + + size_t rrddim_page_alignment; // keeps metric pages in alignment when using dbengine uint32_t hash; // a simple hash on the id, to speed up searching // we first compare hashes, and only if the hashes are equal we do string comparisons @@ -568,6 +572,8 @@ struct alarm_entry { uint32_t updated_by_id; uint32_t updates_id; + time_t last_repeat; + struct alarm_entry *next; }; @@ -682,11 +688,16 @@ struct rrdhost { char *health_log_filename; // the alarms event log 
filename size_t health_log_entries_written; // the number of alarm events writtern to the alarms event log FILE *health_log_fp; // the FILE pointer to the open alarms event log file + uint32_t health_default_warn_repeat_every; // the default value for the interval between repeating warning notifications + uint32_t health_default_crit_repeat_every; // the default value for the interval between repeating critical notifications + // all RRDCALCs are primarily allocated and linked here // RRDCALCs may be linked to charts at any point // (charts may or may not exist when these are loaded) RRDCALC *alarms; + avl_tree_lock alarms_idx_health_log; + avl_tree_lock alarms_idx_name; ALARM_LOG health_log; // alarms historical events (event log) uint32_t health_last_processed_id; // the last processed health id from the log @@ -723,6 +734,10 @@ struct rrdhost { struct rrdengine_instance *rrdeng_ctx; // DB engine instance for this host #endif +#ifdef ENABLE_HTTPS + struct netdata_ssl ssl; //Structure used to encrypt the connection +#endif + struct rrdhost *next; }; extern RRDHOST *localhost; @@ -781,7 +796,6 @@ extern RRDHOST *rrdhost_find_or_create( ); extern int rrdhost_set_system_info_variable(struct rrdhost_system_info *system_info, char *name, char *value); -extern struct rrdhost_system_info *rrdhost_system_info_dup(struct rrdhost_system_info *system_info); #if defined(NETDATA_INTERNAL_CHECKS) && defined(NETDATA_VERIFY_LOCKS) extern void __rrdhost_check_wrlock(RRDHOST *host, const char *file, const char *function, const unsigned long line); @@ -1015,6 +1029,12 @@ extern collected_number rrddim_set(RRDSET *st, const char *id, collected_number extern long align_entries_to_pagesize(RRD_MEMORY_MODE mode, long entries); // ---------------------------------------------------------------------------- +// Miscellaneous functions + +extern int alarm_compare_id(void *a, void *b); +extern int alarm_compare_name(void *a, void *b); + +// 
---------------------------------------------------------------------------- // RRD internal functions #ifdef NETDATA_RRD_INTERNALS diff --git a/database/rrdcalc.c b/database/rrdcalc.c index 7f6a896b..908fc2eb 100644 --- a/database/rrdcalc.c +++ b/database/rrdcalc.c @@ -81,9 +81,9 @@ static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) { if(!rc->units) rc->units = strdupz(st->units); - { + if(!rrdcalc_isrepeating(rc)) { time_t now = now_realtime_sec(); - health_alarm_log( + ALARM_ENTRY *ae = health_create_alarm_entry( host, rc->id, rc->next_event_id++, @@ -104,6 +104,7 @@ static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) { 0, 0 ); + health_alarm_log(host, ae); } } @@ -142,9 +143,9 @@ inline void rrdsetcalc_unlink(RRDCALC *rc) { RRDHOST *host = st->rrdhost; - { + if(!rrdcalc_isrepeating(rc)) { time_t now = now_realtime_sec(); - health_alarm_log( + ALARM_ENTRY *ae = health_create_alarm_entry( host, rc->id, rc->next_event_id++, @@ -165,6 +166,7 @@ inline void rrdsetcalc_unlink(RRDCALC *rc) { 0, 0 ); + health_alarm_log(host, ae); } debug(D_HEALTH, "Health unlinking alarm '%s.%s' from chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, host->hostname); @@ -253,7 +255,7 @@ inline uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const ch return host->health_log.next_alarm_id++; } -inline void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc) { +inline void rrdcalc_add_to_host(RRDHOST *host, RRDCALC *rc) { rrdhost_check_rdlock(host); if(rc->calculation) { @@ -301,8 +303,7 @@ inline void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc) { } } -inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart) { - +inline RRDCALC *rrdcalc_create_from_template(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart) { debug(D_HEALTH, "Health creating dynamic alarm (from template) '%s.%s'", chart, rt->name); if(rrdcalc_exists(host, chart, rt->name, 0, 0)) @@ -328,6 +329,10 @@ inline RRDCALC *rrdcalc_create(RRDHOST *host, 
RRDCALCTEMPLATE *rt, const char *c rc->delay_max_duration = rt->delay_max_duration; rc->delay_multiplier = rt->delay_multiplier; + rc->last_repeat = 0; + rc->warn_repeat_every = rt->warn_repeat_every; + rc->crit_repeat_every = rt->crit_repeat_every; + rc->group = rt->group; rc->after = rt->after; rc->before = rt->before; @@ -356,7 +361,7 @@ inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *c error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", chart, rt->name, rt->critical->source); } - debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f", + debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u", (rc->chart)?rc->chart:"NOCHART", rc->name, (rc->exec)?rc->exec:"DEFAULT", @@ -376,16 +381,24 @@ inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *c rc->delay_up_duration, rc->delay_down_duration, rc->delay_max_duration, - rc->delay_multiplier + rc->delay_multiplier, + rc->warn_repeat_every, + rc->crit_repeat_every ); - rrdcalc_create_part2(host, rc); + rrdcalc_add_to_host(host, rc); + RRDCALC *rdcmp = (RRDCALC *) avl_insert_lock(&(host)->alarms_idx_health_log,(avl *)rc); + if (rdcmp != rc) { + error("Cannot insert the alarm index ID %s",rc->name); + } + return rc; } void rrdcalc_free(RRDCALC *rc) { if(unlikely(!rc)) return; + 
expression_free(rc->calculation); expression_free(rc->warning); expression_free(rc->critical); @@ -413,7 +426,6 @@ void rrdcalc_unlink_and_free(RRDHOST *host, RRDCALC *rc) { // unlink it from RRDHOST if(unlikely(rc == host->alarms)) host->alarms = rc->next; - else { RRDCALC *t; for(t = host->alarms; t && t->next != rc; t = t->next) ; @@ -425,5 +437,79 @@ void rrdcalc_unlink_and_free(RRDHOST *host, RRDCALC *rc) { error("Cannot unlink alarm '%s.%s' from host '%s': not found", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname); } + if (rc) { + RRDCALC *rdcmp = (RRDCALC *) avl_search_lock(&(host)->alarms_idx_health_log, (avl *)rc); + if (rdcmp) { + rdcmp = (RRDCALC *) avl_remove_lock(&(host)->alarms_idx_health_log, (avl *)rc); + if (!rdcmp) { + error("Cannot remove the health alarm index from health_log"); + } + } + + rdcmp = (RRDCALC *) avl_search_lock(&(host)->alarms_idx_name, (avl *)rc); + if (rdcmp) { + rdcmp = (RRDCALC *) avl_remove_lock(&(host)->alarms_idx_name, (avl *)rc); + if (!rdcmp) { + error("Cannot remove the health alarm index from idx_name"); + } + } + } + rrdcalc_free(rc); } + +// ---------------------------------------------------------------------------- +// Alarm + + +/** + * Alarm is repeating + * + * Is this alarm repeating ? 
+ * + * @param host The structure that has the binary tree + * @param alarm_id the id of the alarm to search + * + * @return It returns 1 case it is repeating and 0 otherwise + */ +int alarm_isrepeating(RRDHOST *host, uint32_t alarm_id) { + RRDCALC findme; + findme.id = alarm_id; + RRDCALC *rc = (RRDCALC *)avl_search_lock(&host->alarms_idx_health_log, (avl *)&findme); + if (!rc) { + return 0; + } + return rrdcalc_isrepeating(rc); +} + +/** + * Entry is repeating + * + * Check whether the id of alarm entry is yet present in the host structure + * + * @param host The structure that has the binary tree + * @param ae the alarm entry + * + * @return It returns 1 case it is repeating and 0 otherwise + */ +int alarm_entry_isrepeating(RRDHOST *host, ALARM_ENTRY *ae) { + return alarm_isrepeating(host, ae->alarm_id); +} + +/** + * Max last repeat + * + * Check the maximum last_repeat for the alarms associated a host + * + * @param host The structure that has the binary tree + * + * @return It returns 1 case it is repeating and 0 otherwise + */ +RRDCALC *alarm_max_last_repeat(RRDHOST *host, char *alarm_name,uint32_t hash) { + RRDCALC findme; + findme.name = alarm_name; + findme.hash = hash; + RRDCALC *rc = (RRDCALC *)avl_search_lock(&host->alarms_idx_name, (avl *)&findme); + + return rc; +} diff --git a/database/rrdcalc.h b/database/rrdcalc.h index 4df4381a..3400f711 100644 --- a/database/rrdcalc.h +++ b/database/rrdcalc.h @@ -29,7 +29,9 @@ #define RRDCALC_FLAG_SILENCED 0x00000100 #define RRDCALC_FLAG_NO_CLEAR_NOTIFICATION 0x80000000 + struct rrdcalc { + avl avl; // the index, with key the id - this has to be first! 
uint32_t id; // the unique id of this alarm uint32_t next_event_id; // the next event id that will be used for this alarm @@ -78,8 +80,15 @@ struct rrdcalc { // while now < delay_up_to // ------------------------------------------------------------------------ + // notification repeat settings + + uint32_t warn_repeat_every; // interval between repeating warning notifications + uint32_t crit_repeat_every; // interval between repeating critical notifications + + // ------------------------------------------------------------------------ // runtime information + RRDCALC_STATUS old_status; // the old status of the alarm RRDCALC_STATUS status; // the current status of the alarm calculated_number value; // the current value of the alarm @@ -90,6 +99,7 @@ struct rrdcalc { time_t last_updated; // the last update timestamp of the alarm time_t next_update; // the next update timestamp of the alarm time_t last_status_change; // the timestamp of the last time this alarm changed status + time_t last_repeat; // the last time the alarm got repeated time_t db_after; // the first timestamp evaluated by the db lookup time_t db_before; // the last timestamp evaluated by the db lookup @@ -119,6 +129,10 @@ struct rrdcalc { struct rrdcalc *next; }; +extern int alarm_isrepeating(RRDHOST *host, uint32_t alarm_id); +extern int alarm_entry_isrepeating(RRDHOST *host, ALARM_ENTRY *ae); +extern RRDCALC *alarm_max_last_repeat(RRDHOST *host, char *alarm_name, uint32_t hash); + #define RRDCALC_HAS_DB_LOOKUP(rc) ((rc)->after) extern void rrdsetcalc_link_matching(RRDSET *st); @@ -132,7 +146,14 @@ extern void rrdcalc_unlink_and_free(RRDHOST *host, RRDCALC *rc); extern int rrdcalc_exists(RRDHOST *host, const char *chart, const char *name, uint32_t hash_chart, uint32_t hash_name); extern uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const char *name, uint32_t *next_event_id); -extern RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart); -extern void 
rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc); +extern RRDCALC *rrdcalc_create_from_template(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart); +extern void rrdcalc_add_to_host(RRDHOST *host, RRDCALC *rc); + +static inline int rrdcalc_isrepeating(RRDCALC *rc) { + if (unlikely(rc->warn_repeat_every > 0 || rc->crit_repeat_every > 0)) { + return 1; + } + return 0; +} #endif //NETDATA_RRDCALC_H diff --git a/database/rrdcalctemplate.c b/database/rrdcalctemplate.c index ba7e7ec9..f2b9767c 100644 --- a/database/rrdcalctemplate.c +++ b/database/rrdcalctemplate.c @@ -13,7 +13,7 @@ void rrdcalctemplate_link_matching(RRDSET *st) { for(rt = host->templates; rt ; rt = rt->next) { if(rt->hash_context == st->hash_context && !strcmp(rt->context, st->context) && (!rt->family_pattern || simple_pattern_matches(rt->family_pattern, st->family))) { - RRDCALC *rc = rrdcalc_create(host, rt, st->id); + RRDCALC *rc = rrdcalc_create_from_template(host, rt, st->id); if(unlikely(!rc)) info("Health tried to create alarm from template '%s' on chart '%s' of host '%s', but it failed", rt->name, st->id, host->hostname); diff --git a/database/rrdcalctemplate.h b/database/rrdcalctemplate.h index b8996bc1..92bb4138 100644 --- a/database/rrdcalctemplate.h +++ b/database/rrdcalctemplate.h @@ -49,6 +49,12 @@ struct rrdcalctemplate { float delay_multiplier; // multiplier for all delays when alarms switch status // ------------------------------------------------------------------------ + // notification repeat settings + + uint32_t warn_repeat_every; // interval between repeating warning notifications + uint32_t crit_repeat_every; // interval between repeating critical notifications + + // ------------------------------------------------------------------------ // expressions related to the alarm EVAL_EXPRESSION *calculation; diff --git a/database/rrddim.c b/database/rrddim.c index 0cf6734a..088c80d0 100644 --- a/database/rrddim.c +++ b/database/rrddim.c @@ -60,7 +60,7 @@ inline int 
rrddim_set_algorithm(RRDSET *st, RRDDIM *rd, RRD_ALGORITHM algorithm) debug(D_RRD_CALLS, "Updating algorithm of dimension '%s/%s' from %s to %s", st->id, rd->name, rrd_algorithm_name(rd->algorithm), rrd_algorithm_name(algorithm)); rd->algorithm = algorithm; rd->exposed = 0; - rrdset_flag_set(st, RRDSET_FLAG_HOMEGENEOUS_CHECK); + rrdset_flag_set(st, RRDSET_FLAG_HOMOGENEOUS_CHECK); rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED); return 1; } @@ -72,7 +72,7 @@ inline int rrddim_set_multiplier(RRDSET *st, RRDDIM *rd, collected_number multip debug(D_RRD_CALLS, "Updating multiplier of dimension '%s/%s' from " COLLECTED_NUMBER_FORMAT " to " COLLECTED_NUMBER_FORMAT, st->id, rd->name, rd->multiplier, multiplier); rd->multiplier = multiplier; rd->exposed = 0; - rrdset_flag_set(st, RRDSET_FLAG_HOMEGENEOUS_CHECK); + rrdset_flag_set(st, RRDSET_FLAG_HOMOGENEOUS_CHECK); rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED); return 1; } @@ -84,7 +84,7 @@ inline int rrddim_set_divisor(RRDSET *st, RRDDIM *rd, collected_number divisor) debug(D_RRD_CALLS, "Updating divisor of dimension '%s/%s' from " COLLECTED_NUMBER_FORMAT " to " COLLECTED_NUMBER_FORMAT, st->id, rd->name, rd->divisor, divisor); rd->divisor = divisor; rd->exposed = 0; - rrdset_flag_set(st, RRDSET_FLAG_HOMEGENEOUS_CHECK); + rrdset_flag_set(st, RRDSET_FLAG_HOMOGENEOUS_CHECK); rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED); return 1; } diff --git a/database/rrdhost.c b/database/rrdhost.c index c552c6c3..d6252d20 100644 --- a/database/rrdhost.c +++ b/database/rrdhost.c @@ -147,6 +147,10 @@ RRDHOST *rrdhost_create(const char *hostname, host->rrdpush_sender_pipe[0] = -1; host->rrdpush_sender_pipe[1] = -1; host->rrdpush_sender_socket = -1; +#ifdef ENABLE_HTTPS + host->ssl.conn = NULL; + host->ssl.flags = NETDATA_SSL_START; +#endif netdata_mutex_init(&host->rrdpush_sender_buffer_mutex); netdata_rwlock_init(&host->rrdhost_rwlock); @@ -162,7 +166,7 @@ RRDHOST *rrdhost_create(const char *hostname, host->program_version = 
strdupz((program_version && *program_version)?program_version:"unknown"); host->registry_hostname = strdupz((registry_hostname && *registry_hostname)?registry_hostname:hostname); - host->system_info = rrdhost_system_info_dup(system_info); + host->system_info = system_info; avl_init_lock(&(host->rrdset_root_index), rrdset_compare); avl_init_lock(&(host->rrdset_root_index_name), rrdset_compare_name); @@ -175,6 +179,10 @@ RRDHOST *rrdhost_create(const char *hostname, if(config_get_boolean(CONFIG_SECTION_GLOBAL, "delete orphan hosts files", 1) && !is_localhost) rrdhost_flag_set(host, RRDHOST_FLAG_DELETE_ORPHAN_HOST); + host->health_default_warn_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat warning", "never"); + host->health_default_crit_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat critical", "never"); + avl_init_lock(&(host->alarms_idx_health_log), alarm_compare_id); + avl_init_lock(&(host->alarms_idx_name), alarm_compare_name); // ------------------------------------------------------------------------ // initialize health variables @@ -270,12 +278,12 @@ RRDHOST *rrdhost_create(const char *hostname, // load health configuration if(host->health_enabled) { - health_alarm_log_load(host); - health_alarm_log_open(host); - rrdhost_wrlock(host); health_readdir(host, health_user_config_dir(), health_stock_config_dir(), NULL); rrdhost_unlock(host); + + health_alarm_log_load(host); + health_alarm_log_open(host); } @@ -812,81 +820,103 @@ restart_after_removal: // RRDHOST - set system info from environment variables int rrdhost_set_system_info_variable(struct rrdhost_system_info *system_info, char *name, char *value) { + int res = 0; + if(!strcmp(name, "NETDATA_SYSTEM_OS_NAME")){ + freez(system_info->os_name); system_info->os_name = strdupz(value); } else if(!strcmp(name, "NETDATA_SYSTEM_OS_ID")){ + freez(system_info->os_id); system_info->os_id = strdupz(value); } else if(!strcmp(name, "NETDATA_SYSTEM_OS_ID_LIKE")){ + 
freez(system_info->os_id_like); system_info->os_id_like = strdupz(value); } else if(!strcmp(name, "NETDATA_SYSTEM_OS_VERSION")){ + freez(system_info->os_version); system_info->os_version = strdupz(value); } else if(!strcmp(name, "NETDATA_SYSTEM_OS_VERSION_ID")){ + freez(system_info->os_version_id); system_info->os_version_id = strdupz(value); } else if(!strcmp(name, "NETDATA_SYSTEM_OS_DETECTION")){ + freez(system_info->os_detection); system_info->os_detection = strdupz(value); } else if(!strcmp(name, "NETDATA_SYSTEM_KERNEL_NAME")){ + freez(system_info->kernel_name); system_info->kernel_name = strdupz(value); } else if(!strcmp(name, "NETDATA_SYSTEM_KERNEL_VERSION")){ + freez(system_info->kernel_version); system_info->kernel_version = strdupz(value); } else if(!strcmp(name, "NETDATA_SYSTEM_ARCHITECTURE")){ + freez(system_info->architecture); system_info->architecture = strdupz(value); } else if(!strcmp(name, "NETDATA_SYSTEM_VIRTUALIZATION")){ + freez(system_info->virtualization); system_info->virtualization = strdupz(value); } else if(!strcmp(name, "NETDATA_SYSTEM_VIRT_DETECTION")){ + freez(system_info->virt_detection); system_info->virt_detection = strdupz(value); } else if(!strcmp(name, "NETDATA_SYSTEM_CONTAINER")){ + freez(system_info->container); system_info->container = strdupz(value); } else if(!strcmp(name, "NETDATA_SYSTEM_CONTAINER_DETECTION")){ + freez(system_info->container_detection); system_info->container_detection = strdupz(value); } - else return 1; + else { + res = 1; + } - return 0; + return res; } -struct rrdhost_system_info *rrdhost_system_info_dup(struct rrdhost_system_info *system_info) { - struct rrdhost_system_info *ret = callocz(1, sizeof(struct rrdhost_system_info)); +/** + * Alarm Compare ID + * + * Callback function used with the binary trees to compare the id of RRDCALC + * + * @param a a pointer to the RRDCAL item to insert,compare or update the binary tree + * @param b the pointer to the binary tree. 
+ * + * @return It returns 0 when the values are equal, 1 when a is bigger than b and -1 when a is smaller than b. + */ +int alarm_compare_id(void *a, void *b) { + register uint32_t hash1 = ((RRDCALC *)a)->id; + register uint32_t hash2 = ((RRDCALC *)b)->id; + + if(hash1 < hash2) return -1; + else if(hash1 > hash2) return 1; - if(likely(system_info)) { - if(system_info->os_name) - ret->os_name = strdupz(system_info->os_name); - if(system_info->os_id) - ret->os_id = strdupz(system_info->os_id); - if(system_info->os_id_like) - ret->os_id_like = strdupz(system_info->os_id_like); - if(system_info->os_version) - ret->os_version = strdupz(system_info->os_version); - if(system_info->os_version_id) - ret->os_version_id = strdupz(system_info->os_version_id); - if(system_info->os_detection) - ret->os_detection = strdupz(system_info->os_detection); - if(system_info->kernel_name) - ret->kernel_name = strdupz(system_info->kernel_name); - if(system_info->kernel_version) - ret->kernel_version = strdupz(system_info->kernel_version); - if(system_info->architecture) - ret->architecture = strdupz(system_info->architecture); - if(system_info->virtualization) - ret->virtualization = strdupz(system_info->virtualization); - if(system_info->virt_detection) - ret->virt_detection = strdupz(system_info->virt_detection); - if(system_info->container) - ret->container = strdupz(system_info->container); - if(system_info->container_detection) - ret->container_detection = strdupz(system_info->container_detection); - } - - return ret; + return 0; +} + +/** + * Alarm Compare NAME + * + * Callback function used with the binary trees to compare the name of RRDCALC + * + * @param a a pointer to the RRDCALC item to insert, compare or update the binary tree + * @param b a pointer to the RRDCALC item of the binary tree to compare against. + * + * @return It returns 0 when the values are equal, a positive value when a is bigger than b and a negative value when a is smaller than b.
+ */ +int alarm_compare_name(void *a, void *b) { + RRDCALC *in1 = (RRDCALC *)a; + RRDCALC *in2 = (RRDCALC *)b; + + if(in1->hash < in2->hash) return -1; + else if(in1->hash > in2->hash) return 1; + + return strcmp(in1->name,in2->name); } diff --git a/database/rrdset.c b/database/rrdset.c index 68959146..f8962b2f 100644 --- a/database/rrdset.c +++ b/database/rrdset.c @@ -210,7 +210,7 @@ inline void rrdset_update_heterogeneous_flag(RRDSET *st) { RRDDIM *rd; - rrdset_flag_clear(st, RRDSET_FLAG_HOMEGENEOUS_CHECK); + rrdset_flag_clear(st, RRDSET_FLAG_HOMOGENEOUS_CHECK); RRD_ALGORITHM algorithm = st->dimensions->algorithm; collected_number multiplier = abs(st->dimensions->multiplier); @@ -251,6 +251,7 @@ void rrdset_reset(RRDSET *st) { st->current_entry = 0; st->counter = 0; st->counter_done = 0; + st->rrddim_page_alignment = 0; RRDDIM *rd; rrddim_foreach_read(rd, st) { @@ -258,6 +259,11 @@ void rrdset_reset(RRDSET *st) { rd->last_collected_time.tv_usec = 0; rd->collections_counter = 0; // memset(rd->values, 0, rd->entries * sizeof(storage_number)); +#ifdef ENABLE_DBENGINE + if (RRD_MEMORY_MODE_DBENGINE == st->rrd_memory_mode) { + rrdeng_store_metric_flush_current_page(rd); + } +#endif } } @@ -505,6 +511,12 @@ RRDSET *rrdset_create_custom( if(st) { rrdset_flag_set(st, RRDSET_FLAG_SYNC_CLOCK); rrdset_flag_clear(st, RRDSET_FLAG_UPSTREAM_EXPOSED); + + if(unlikely(name)) + rrdset_set_name(st, name); + else + rrdset_set_name(st, id); + return st; } @@ -613,7 +625,7 @@ RRDSET *rrdset_create_custom( memset(st, 0, size); } else if((now - st->last_updated.tv_sec) > update_every * entries) { - error("File %s is too old. Clearing it.", fullfilename); + info("File %s is too old. 
Clearing it.", fullfilename); memset(st, 0, size); } else if(st->last_updated.tv_sec > now + update_every) { @@ -702,6 +714,7 @@ RRDSET *rrdset_create_custom( st->last_collected_time.tv_sec = 0; st->last_collected_time.tv_usec = 0; st->counter_done = 0; + st->rrddim_page_alignment = 0; st->gap_when_lost_iterations_above = (int) (gap_when_lost_iterations_above + 2); @@ -1273,6 +1286,22 @@ void rrdset_done(RRDSET *st) { first_entry = 1; } +#ifdef ENABLE_DBENGINE + // check if we will re-write the entire page + if(unlikely(st->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE && + dt_usec(&st->last_collected_time, &st->last_updated) > (RRDENG_BLOCK_SIZE / sizeof(storage_number)) * update_every_ut)) { + info("%s: too old data (last updated at %ld.%ld, last collected at %ld.%ld). Resetting it. Will not store the next entry.", st->name, st->last_updated.tv_sec, st->last_updated.tv_usec, st->last_collected_time.tv_sec, st->last_collected_time.tv_usec); + rrdset_reset(st); + rrdset_init_last_updated_time(st); + + st->usec_since_last_update = update_every_ut; + + // the first entry should not be stored + store_this_entry = 0; + first_entry = 1; + } +#endif + // these are the 3 variables that will help us in interpolation // last_stored_ut = the last time we added a value to the storage // now_collect_ut = the time the current value has been collected diff --git a/docs/Add-more-charts-to-netdata.md b/docs/Add-more-charts-to-netdata.md index 382cd8d3..285713b0 100644 --- a/docs/Add-more-charts-to-netdata.md +++ b/docs/Add-more-charts-to-netdata.md @@ -1,8 +1,8 @@ -# Add more charts to netdata +# Add more charts to Netdata -netdata collects system metrics by itself. It has many [internal plugins](../collectors) for collecting most of the metrics presented by default when it starts, collecting data from `/proc`, `/sys` and other Linux kernel sources. +Netdata collects system metrics by itself. 
It has many [internal plugins](../collectors) for collecting most of the metrics presented by default when it starts, collecting data from `/proc`, `/sys` and other Linux kernel sources. -To collect non-system metrics, netdata supports a plugin architecture. The following are the currently available external plugins: +To collect non-system metrics, Netdata supports a plugin architecture. The following are the currently available external plugins: - **[Web Servers](#web-servers)**, such as apache, nginx, nginx_plus, tomcat, litespeed - **[Web Logs](#web-log-parsers)**, such as apache, nginx, lighttpd, gunicorn, squid access logs, apache cache.log @@ -39,16 +39,16 @@ Check also [Third Party Plugins](Third-Party-Plugins.md) for a list of plugins d ## configuring plugins -netdata comes with **internal** and **external** plugins: +Netdata comes with **internal** and **external** plugins: -1. The **internal** ones are written in `C` and run as threads within the netdata daemon. -2. The **external** ones can be written in any computer language. The netdata daemon spawns these as processes (shown with `ps fax`) and reads their metrics using pipes (so the `stdout` of external plugins is connected to netdata for metrics collection and the `stderr` of external plugins is connected to `/var/log/netdata/error.log`). +1. The **internal** ones are written in `C` and run as threads within the Netdata daemon. +2. The **external** ones can be written in any computer language. The Netdata daemon spawns these as processes (shown with `ps fax`) and reads their metrics using pipes (so the `stdout` of external plugins is connected to Netdata for metrics collection and the `stderr` of external plugins is connected to `/var/log/netdata/error.log`). -To make it easier to develop plugins, and minimize the number of threads and processes running, netdata supports **plugin orchestrators**, each of them supporting one or more data collection **modules**. 
Currently we ship plugin orchestrators for 4 languages: `C`, `python`, `node.js` and `bash` and 2 more are under development (`go` and `java`). +To make it easier to develop plugins, and minimize the number of threads and processes running, Netdata supports **plugin orchestrators**, each of them supporting one or more data collection **modules**. Currently we ship plugin orchestrators for 4 languages: `C`, `python`, `node.js` and `bash` and 2 more are under development (`go` and `java`). #### enabling and disabling plugins -To control which plugins netdata run, edit `netdata.conf` and check the `[plugins]` section. It looks like this: +To control which plugins Netdata runs, edit `netdata.conf` and check the `[plugins]` section. It looks like this: ``` [plugins] @@ -69,6 +69,7 @@ To control which plugins netdata run, edit `netdata.conf` and check the `[plugin # charts.d = yes # apps = yes # xenstat = yes + # perf = no ``` The default for all plugins is the option `enable running new plugins`. So, setting this to `no` will disable all the plugins, except the ones specifically enabled. @@ -81,9 +82,9 @@ Each of the **plugins** may support one or more data collection **modules**. To Most **modules** come with **auto-detection**, configured to work out-of-the-box on popular operating systems with the default settings. -However, there are cases that auto-detection fails. Usually the reason is that the applications to be monitored do not allow netdata to connect. In most of the cases, allowing the user `netdata` from `localhost` to connect and collect metrics, will automatically enable data collection for the application in question (it will require a netdata restart). +However, there are cases where auto-detection fails. Usually the reason is that the applications to be monitored do not allow Netdata to connect.
In most of the cases, allowing the user `netdata` from `localhost` to connect and collect metrics, will automatically enable data collection for the application in question (it will require a Netdata restart). -You can verify netdata **external plugins and their modules** are able to collect metrics, following this procedure: +You can verify Netdata **external plugins and their modules** are able to collect metrics, following this procedure: ```sh # become user netdata @@ -95,9 +96,9 @@ sudo su -s /bin/bash netdata ``` Similarly, you can use `charts.d.plugin` for BASH plugins and `node.d.plugin` for node.js plugins. -Other plugins (like `apps.plugin`, `freeipmi.plugin`, `fping.plugin`, `ioping.plugin`) use the native netdata plugin API and can be run directly. +Other plugins (like `apps.plugin`, `freeipmi.plugin`, `fping.plugin`, `ioping.plugin`, `nfacct.plugin`, `xenstat.plugin`, `perf.plugin`) use the native Netdata plugin API and can be run directly. -If you need to configure a netdata plugin or module, all user supplied configuration is kept at `/etc/netdata` while the stock versions of all files is at `/usr/lib/netdata/conf.d`. +If you need to configure a Netdata plugin or module, all user supplied configuration is kept at `/etc/netdata` while the stock versions of all files is at `/usr/lib/netdata/conf.d`. To copy a stock file and edit it, run `/etc/netdata/edit-config`. Running this command without an argument, will list the available stock files. Each file should provide plenty of examples and documentation about each module and plugin. @@ -116,6 +117,9 @@ plugin | language | plugin<br/>configuration | modules<br/>configuration | `fping.plugin`<br/>(external plugin for collecting network latencies)|`C`|`fping.conf`|This plugin is a wrapper for the `fping` command. `ioping.plugin`<br/>(external plugin for collecting disk latencies)|`C`|`ioping.conf`|This plugin is a wrapper for the `ioping` command. 
`freeipmi.plugin`<br/>(external plugin for collecting IPMI h/w sensors)|`C`|`netdata.conf` section `[plugin:freeipmi]` +`nfacct.plugin`<br/>(external plugin for monitoring netfilter firewall and connection tracker)|`C`|`netdata.conf` section `[plugin:nfacct]`|N/A +`xenstat.plugin`<br/>(external plugin for monitoring XCP-ng and XenServer)|`C`|`netdata.conf` section `[plugin:xenstat]`|N/A +`perf.plugin`<br/>(external plugin for monitoring CPU performance on Linux)|`C`|`netdata.conf` section `[plugin:perf]`|N/A `idlejitter.plugin`<br/>(internal plugin for monitoring CPU jitter)|`C`|N/A|N/A `macos.plugin`<br/>(internal plugin for monitoring MacOS system resources)|`C`|`netdata.conf` section `[plugin:macos]`|one section for each module `[plugin:macos:MODULE]`. Each module may provide additional sections in the form of `[plugin:macos:MODULE:SUBSECTION]`. `node.d.plugin`<br/>(external plugin orchestrator of node.js modules)|`node.js`|`node.d.conf`|a file for each module in `/etc/netdata/node.d/`. @@ -139,17 +143,17 @@ These are all the data collection plugins currently available. application|language|notes| :---------:|:------:|:----| -apache|python<br/>v2 or v3|Connects to multiple apache servers (local or remote) to collect real-time performance metrics.<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [apache.chart.py](../collectors/python.d.plugin/apache)<br/>configuration file: [python.d/apache.conf](../collectors/python.d.plugin/apache)| -apache|BASH<br/>Shell Script|Connects to an apache server (local or remote) to collect real-time performance metrics.<br/><br/>DEPRECATED IN FAVOR OF THE PYTHON ONE. 
It is still supplied only as an example module to shell scripting plugins.<br/> <br/>netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [apache.chart.sh](../collectors/charts.d.plugin/apache)<br/>configuration file: [charts.d/apache.conf](../collectors/charts.d.plugin/apache)| -ipfs|python<br/>v2 or v3|Connects to multiple ipfs servers (local or remote) to collect real-time performance metrics.<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [ipfs.chart.py](../collectors/python.d.plugin/ipfs)<br/>configuration file: [python.d/ipfs.conf](../collectors/python.d.plugin/ipfs)| -litespeed|python<br/>v2 or v3|reads the litespeed `rtreport` files to collect metrics.<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [litespeed.chart.py](../collectors/python.d.plugin/litespeed)<br/>configuration file: [python.d/litespeed.conf](../collectors/python.d.plugin/litespeed) -nginx|python<br/>v2 or v3|Connects to multiple nginx servers (local or remote) to collect real-time performance metrics.<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [nginx.chart.py](../collectors/python.d.plugin/nginx)<br/>configuration file: [python.d/nginx.conf](../collectors/python.d.plugin/nginx)| -nginx_plus|python<br/>v2 or v3|Connects to multiple nginx_plus servers (local or remote) to collect real-time performance metrics.<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [nginx_plus.chart.py](../collectors/python.d.plugin/nginx_plus)<br/>configuration file: [python.d/nginx_plus.conf](../collectors/python.d.plugin/nginx_plus)| -nginx|BASH<br/>Shell Script|Connects to an nginx server (local or remote) to collect real-time performance metrics.<br/><br/>DEPRECATED IN FAVOR OF THE PYTHON ONE. 
It is still supplied only as an example module to shell scripting plugins.<br/> <br/>netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [nginx.chart.sh](../collectors/charts.d.plugin/nginx)<br/>configuration file: [charts.d/nginx.conf](../collectors/charts.d.plugin/nginx)| -phpfpm|python<br/>v2 or v3|Connects to multiple phpfpm servers (local or remote) to collect real-time performance metrics.<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [phpfpm.chart.py](../collectors/python.d.plugin/phpfpm)<br/>configuration file: [python.d/phpfpm.conf](../collectors/python.d.plugin/phpfpm)| -phpfpm|BASH<br/>Shell Script|Connects to one or more phpfpm servers (local or remote) to collect real-time performance metrics.<br/><br/>DEPRECATED IN FAVOR OF THE PYTHON ONE. It is still supplied only as an example module to shell scripting plugins.<br/> <br/>netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [phpfpm.chart.sh](../collectors/charts.d.plugin/phpfpm)<br/>configuration file: [charts.d/phpfpm.conf](../collectors/charts.d.plugin/phpfpm)| -tomcat|python<br/>v2 or v3|Connects to multiple tomcat servers (local or remote) to collect real-time performance metrics.<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [tomcat.chart.py](../collectors/python.d.plugin/tomcat)<br/>configuration file: [python.d/tomcat.conf](../collectors/python.d.plugin/tomcat)| -tomcat|BASH<br/>Shell Script|Connects to a tomcat server (local or remote) to collect real-time performance metrics.<br/><br/>DEPRECATED IN FAVOR OF THE PYTHON ONE. 
It is still supplied only as an example module to shell scripting plugins.<br/> <br/>netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [tomcat.chart.sh](../collectors/charts.d.plugin/tomcat)<br/>configuration file: [charts.d/tomcat.conf](../collectors/charts.d.plugin/tomcat)| +apache|python<br/>v2 or v3|Connects to multiple apache servers (local or remote) to collect real-time performance metrics.<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [apache.chart.py](../collectors/python.d.plugin/apache)<br/>configuration file: [python.d/apache.conf](../collectors/python.d.plugin/apache)| +apache|BASH<br/>Shell Script|Connects to an apache server (local or remote) to collect real-time performance metrics.<br/><br/>DEPRECATED IN FAVOR OF THE PYTHON ONE. It is still supplied only as an example module to shell scripting plugins.<br/> <br/>Netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [apache.chart.sh](../collectors/charts.d.plugin/apache)<br/>configuration file: [charts.d/apache.conf](../collectors/charts.d.plugin/apache)| +ipfs|python<br/>v2 or v3|Connects to multiple ipfs servers (local or remote) to collect real-time performance metrics.<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [ipfs.chart.py](../collectors/python.d.plugin/ipfs)<br/>configuration file: [python.d/ipfs.conf](../collectors/python.d.plugin/ipfs)| +litespeed|python<br/>v2 or v3|reads the litespeed `rtreport` files to collect metrics.<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [litespeed.chart.py](../collectors/python.d.plugin/litespeed)<br/>configuration file: [python.d/litespeed.conf](../collectors/python.d.plugin/litespeed) +nginx|python<br/>v2 or v3|Connects to multiple nginx servers (local or remote) to collect real-time performance metrics.<br/> <br/>Netdata 
plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [nginx.chart.py](../collectors/python.d.plugin/nginx)<br/>configuration file: [python.d/nginx.conf](../collectors/python.d.plugin/nginx)| +nginx_plus|python<br/>v2 or v3|Connects to multiple nginx_plus servers (local or remote) to collect real-time performance metrics.<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [nginx_plus.chart.py](../collectors/python.d.plugin/nginx_plus)<br/>configuration file: [python.d/nginx_plus.conf](../collectors/python.d.plugin/nginx_plus)| +nginx|BASH<br/>Shell Script|Connects to an nginx server (local or remote) to collect real-time performance metrics.<br/><br/>DEPRECATED IN FAVOR OF THE PYTHON ONE. It is still supplied only as an example module to shell scripting plugins.<br/> <br/>Netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [nginx.chart.sh](../collectors/charts.d.plugin/nginx)<br/>configuration file: [charts.d/nginx.conf](../collectors/charts.d.plugin/nginx)| +phpfpm|python<br/>v2 or v3|Connects to multiple phpfpm servers (local or remote) to collect real-time performance metrics.<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [phpfpm.chart.py](../collectors/python.d.plugin/phpfpm)<br/>configuration file: [python.d/phpfpm.conf](../collectors/python.d.plugin/phpfpm)| +phpfpm|BASH<br/>Shell Script|Connects to one or more phpfpm servers (local or remote) to collect real-time performance metrics.<br/><br/>DEPRECATED IN FAVOR OF THE PYTHON ONE. 
It is still supplied only as an example module to shell scripting plugins.<br/> <br/>Netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [phpfpm.chart.sh](../collectors/charts.d.plugin/phpfpm)<br/>configuration file: [charts.d/phpfpm.conf](../collectors/charts.d.plugin/phpfpm)| +tomcat|python<br/>v2 or v3|Connects to multiple tomcat servers (local or remote) to collect real-time performance metrics.<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [tomcat.chart.py](../collectors/python.d.plugin/tomcat)<br/>configuration file: [python.d/tomcat.conf](../collectors/python.d.plugin/tomcat)| +tomcat|BASH<br/>Shell Script|Connects to a tomcat server (local or remote) to collect real-time performance metrics.<br/><br/>DEPRECATED IN FAVOR OF THE PYTHON ONE. It is still supplied only as an example module to shell scripting plugins.<br/> <br/>Netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [tomcat.chart.sh](../collectors/charts.d.plugin/tomcat)<br/>configuration file: [charts.d/tomcat.conf](../collectors/charts.d.plugin/tomcat)| --- @@ -158,7 +162,7 @@ tomcat|BASH<br/>Shell Script|Connects to a tomcat server (local or remote) to co application|language|notes| :---------:|:------:|:----| -web_log|python<br/>v2 or v3|powerful plugin, capable of incrementally parsing any number of web server log files <br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [web_log.chart.py](../collectors/python.d.plugin/web_log)<br/>configuration file: [python.d/web_log.conf](../collectors/python.d.plugin/web_log)| +web_log|python<br/>v2 or v3|powerful plugin, capable of incrementally parsing any number of web server log files <br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [web_log.chart.py](../collectors/python.d.plugin/web_log)<br/>configuration file: 
[python.d/web_log.conf](../collectors/python.d.plugin/web_log)| --- @@ -167,14 +171,14 @@ web_log|python<br/>v2 or v3|powerful plugin, capable of incrementally parsing an application|language|notes| :---------:|:------:|:----| -couchdb|python<br/>v2 or v3|Connects to multiple couchdb servers (local or remote) to collect real-time performance metrics.<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [couchdb.chart.py](../collectors/python.d.plugin/couchdb)<br/>configuration file: [python.d/couchdb.conf](../collectors/python.d.plugin/couchdb)| -memcached|python<br/>v2 or v3|Connects to multiple memcached servers (local or remote) to collect real-time performance metrics.<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [memcached.chart.py](../collectors/python.d.plugin/memcached)<br/>configuration file: [python.d/memcached.conf](../collectors/python.d.plugin/memcached)| -mongodb|python<br/>v2 or v3|Connects to multiple `mongodb` servers (local or remote) to collect real-time performance metrics.<br/> <br/>Requires package `python-pymongo`.<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [mongodb.chart.py](../collectors/python.d.plugin/mongodb)<br/>configuration file: [python.d/mongodb.conf](../collectors/python.d.plugin/mongodb)| -mysql<br/>mariadb|python<br/>v2 or v3|Connects to multiple mysql or mariadb servers (local or remote) to collect real-time performance metrics.<br/> <br/>Requires package `python-mysqldb` (faster and preferred), or `python-pymysql`. 
<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [mysql.chart.py](../collectors/python.d.plugin/mysql)<br/>configuration file: [python.d/mysql.conf](../collectors/python.d.plugin/mysql)| -mysql<br/>mariadb|BASH<br/>Shell Script|Connects to multiple mysql or mariadb servers (local or remote) to collect real-time performance metrics.<br/><br/>DEPRECATED IN FAVOR OF THE PYTHON ONE. It is still supplied only as an example module to shell scripting plugins.<br/> <br/>netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [mysql.chart.sh](../collectors/charts.d.plugin/mysql)<br/>configuration file: [charts.d/mysql.conf](../collectors/charts.d.plugin/mysql)| -postgres|python<br/>v2 or v3|Connects to multiple postgres servers (local or remote) to collect real-time performance metrics.<br/> <br/>Requires package `python-psycopg2`.<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [postgres.chart.py](../collectors/python.d.plugin/postgres)<br/>configuration file: [python.d/postgres.conf](../collectors/python.d.plugin/postgres)| -redis|python<br/>v2 or v3|Connects to multiple redis servers (local or remote) to collect real-time performance metrics.<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [redis.chart.py](../collectors/python.d.plugin/redis)<br/>configuration file: [python.d/redis.conf](../collectors/python.d.plugin/redis)| -rethinkdb|python<br/>v2 or v3|Connects to multiple rethinkdb servers (local or remote) to collect real-time metrics.<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [rethinkdb.chart.py](../collectors/python.d.plugin/rethinkdbs)<br/>configuration file: [python.d/rethinkdb.conf](../collectors/python.d.plugin/rethinkdbs)| +couchdb|python<br/>v2 or v3|Connects to multiple couchdb servers (local or remote) to collect real-time 
performance metrics.<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [couchdb.chart.py](../collectors/python.d.plugin/couchdb)<br/>configuration file: [python.d/couchdb.conf](../collectors/python.d.plugin/couchdb)| +memcached|python<br/>v2 or v3|Connects to multiple memcached servers (local or remote) to collect real-time performance metrics.<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [memcached.chart.py](../collectors/python.d.plugin/memcached)<br/>configuration file: [python.d/memcached.conf](../collectors/python.d.plugin/memcached)| +mongodb|python<br/>v2 or v3|Connects to multiple `mongodb` servers (local or remote) to collect real-time performance metrics.<br/> <br/>Requires package `python-pymongo`.<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [mongodb.chart.py](../collectors/python.d.plugin/mongodb)<br/>configuration file: [python.d/mongodb.conf](../collectors/python.d.plugin/mongodb)| +mysql<br/>mariadb|python<br/>v2 or v3|Connects to multiple mysql or mariadb servers (local or remote) to collect real-time performance metrics.<br/> <br/>Requires package `python-mysqldb` (faster and preferred), or `python-pymysql`. <br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [mysql.chart.py](../collectors/python.d.plugin/mysql)<br/>configuration file: [python.d/mysql.conf](../collectors/python.d.plugin/mysql)| +mysql<br/>mariadb|BASH<br/>Shell Script|Connects to multiple mysql or mariadb servers (local or remote) to collect real-time performance metrics.<br/><br/>DEPRECATED IN FAVOR OF THE PYTHON ONE. 
It is still supplied only as an example module to shell scripting plugins.<br/> <br/>Netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [mysql.chart.sh](../collectors/charts.d.plugin/mysql)<br/>configuration file: [charts.d/mysql.conf](../collectors/charts.d.plugin/mysql)| +postgres|python<br/>v2 or v3|Connects to multiple postgres servers (local or remote) to collect real-time performance metrics.<br/> <br/>Requires package `python-psycopg2`.<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [postgres.chart.py](../collectors/python.d.plugin/postgres)<br/>configuration file: [python.d/postgres.conf](../collectors/python.d.plugin/postgres)| +redis|python<br/>v2 or v3|Connects to multiple redis servers (local or remote) to collect real-time performance metrics.<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [redis.chart.py](../collectors/python.d.plugin/redis)<br/>configuration file: [python.d/redis.conf](../collectors/python.d.plugin/redis)| +rethinkdb|python<br/>v2 or v3|Connects to multiple rethinkdb servers (local or remote) to collect real-time metrics.<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [rethinkdb.chart.py](../collectors/python.d.plugin/rethinkdbs)<br/>configuration file: [python.d/rethinkdb.conf](../collectors/python.d.plugin/rethinkdbs)| --- @@ -193,7 +197,7 @@ retroshare|python<br/>v2 or v3|Connects to multiple retroshare servers (local or application|language|notes| :---------:|:------:|:----| squid|python<br/>v2 or v3|Connects to multiple squid servers (local or remote) to collect real-time performance metrics.<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [squid.chart.py](../collectors/python.d.plugin/squid)<br/>configuration file: [python.d/squid.conf](../collectors/python.d.plugin/squid)| -squid|BASH<br/>Shell 
Script|Connects to a squid server (local or remote) to collect real-time performance metrics.<br/><br/>DEPRECATED IN FAVOR OF THE PYTHON ONE. It is still supplied only as an example module to shell scripting plugins.<br/> <br/>netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [squid.chart.sh](../collectors/charts.d.plugin/squid)<br/>configuration file: [charts.d/squid.conf](../collectors/charts.d.plugin/squid)| +squid|BASH<br/>Shell Script|Connects to a squid server (local or remote) to collect real-time performance metrics.<br/><br/>DEPRECATED IN FAVOR OF THE PYTHON ONE. It is still supplied only as an example module to shell scripting plugins.<br/> <br/>Netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [squid.chart.sh](../collectors/charts.d.plugin/squid)<br/>configuration file: [charts.d/squid.conf](../collectors/charts.d.plugin/squid)| --- @@ -202,7 +206,7 @@ squid|BASH<br/>Shell Script|Connects to a squid server (local or remote) to coll application|language|notes| :---------:|:------:|:----| -varnish|python<br/>v2 or v3|Uses the varnishstat command to provide varnish cache statistics (client metrics, cache perfomance, thread-related metrics, backend health, memory usage etc.).<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [varnish.chart.py](../collectors/python.d.plugin/varnish)<br/>configuration file: [python.d/varnish.conf](../collectors/python.d.plugin/varnish)| +varnish|python<br/>v2 or v3|Uses the varnishstat command to provide varnish cache statistics (client metrics, cache performance, thread-related metrics, backend health, memory usage etc.).<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [varnish.chart.py](../collectors/python.d.plugin/varnish)<br/>configuration file: [python.d/varnish.conf](../collectors/python.d.plugin/varnish)| --- @@ -211,7 +215,7 @@ 
varnish|python<br/>v2 or v3|Uses the varnishstat command to provide varnish cach application|language|notes| :---------:|:------:|:----| -elasticsearch|python<br/>v2 or v3|Monitor elasticsearch performance and health metrics.<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [elasticsearch.chart.py](../collectors/python.d.plugin/elasticsearch)<br/>configuration file: [python.d/elasticsearch.conf](../collectors/python.d.plugin/elasticsearch)| +elasticsearch|python<br/>v2 or v3|Monitor elasticsearch performance and health metrics.<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [elasticsearch.chart.py](../collectors/python.d.plugin/elasticsearch)<br/>configuration file: [python.d/elasticsearch.conf](../collectors/python.d.plugin/elasticsearch)| --- @@ -220,12 +224,12 @@ elasticsearch|python<br/>v2 or v3|Monitor elasticsearch performance and health m application|language|notes| :---------:|:------:|:----| -named|node.js|Connects to multiple named (ISC-Bind) servers (local or remote) to collect real-time performance metrics. All versions of bind after 9.9.10 are supported.<br/> <br/>netdata plugin: [node.d.plugin](../collectors/node.d.plugin#nodedplugin)<br/>plugin module: [named.node.js](../collectors/node.d.plugin/named)<br/>configuration file: [node.d/named.conf](../collectors/node.d.plugin/named)| -bind_rndc|python<br/>v2 or v3|Parses named.stats dump file to collect real-time performance metrics. 
All versions of bind after 9.6 are supported.<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [bind_rndc.chart.py](../collectors/python.d.plugin/bind_rndc)<br/>configuration file: [python.d/bind_rndc.conf](../collectors/python.d.plugin/bind_rndc)| -nsd|python<br/>v2 or v3|Charts the nsd received queries and zones.<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [nsd.chart.py](../collectors/python.d.plugin/nsd)<br/>configuration file: [python.d/nsd.conf](../collectors/python.d.plugin/nsd) -powerdns|python<br/>v2 or v3|Monitors powerdns performance and health metrics <br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [powerdns.chart.py](../collectors/python.d.plugin/powerdns)<br/>configuration file: [python.d/powerdns.conf](../collectors/python.d.plugin/powerdns)| -dnsdist|python<br/>v2 or v3|Monitors dnsdist performance and health metrics <br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [dnsdist.chart.py](../collectors/python.d.plugin/dnsdist)<br/>configuration file: [python.d/dnsdist.conf](../collectors/python.d.plugin/dnsdist)| -unbound|python<br/>v2 or v3|Monitors Unbound performance and resource usage metrics <br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [unbound.chart.py](../collectors/python.d.plugin/unbound)<br/>configuration file: [python.d/unbound.conf](../collectors/python.d.plugin/unbound)| +named|node.js|Connects to multiple named (ISC-Bind) servers (local or remote) to collect real-time performance metrics. 
All versions of bind after 9.9.10 are supported.<br/> <br/>Netdata plugin: [node.d.plugin](../collectors/node.d.plugin#nodedplugin)<br/>plugin module: [named.node.js](../collectors/node.d.plugin/named)<br/>configuration file: [node.d/named.conf](../collectors/node.d.plugin/named)| +bind_rndc|python<br/>v2 or v3|Parses named.stats dump file to collect real-time performance metrics. All versions of bind after 9.6 are supported.<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [bind_rndc.chart.py](../collectors/python.d.plugin/bind_rndc)<br/>configuration file: [python.d/bind_rndc.conf](../collectors/python.d.plugin/bind_rndc)| +nsd|python<br/>v2 or v3|Charts the nsd received queries and zones.<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [nsd.chart.py](../collectors/python.d.plugin/nsd)<br/>configuration file: [python.d/nsd.conf](../collectors/python.d.plugin/nsd) +powerdns|python<br/>v2 or v3|Monitors powerdns performance and health metrics <br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [powerdns.chart.py](../collectors/python.d.plugin/powerdns)<br/>configuration file: [python.d/powerdns.conf](../collectors/python.d.plugin/powerdns)| +dnsdist|python<br/>v2 or v3|Monitors dnsdist performance and health metrics <br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [dnsdist.chart.py](../collectors/python.d.plugin/dnsdist)<br/>configuration file: [python.d/dnsdist.conf](../collectors/python.d.plugin/dnsdist)| +unbound|python<br/>v2 or v3|Monitors Unbound performance and resource usage metrics <br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [unbound.chart.py](../collectors/python.d.plugin/unbound)<br/>configuration file: [python.d/unbound.conf](../collectors/python.d.plugin/unbound)| --- @@ -234,7 +238,7 @@ unbound|python<br/>v2 or v3|Monitors 
Unbound performance and resource usage metr application|language|notes| :---------:|:------:|:----| -isc dhcp|python<br/>v2 or v3|Monitor lease database to show all active leases.<br/> <br/>Python v2 requires package `python-ipaddress`.<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [isc-dhcpd.chart.py](../collectors/python.d.plugin/isc_dhcpd)<br/>configuration file: [python.d/isc-dhcpd.conf](../collectors/python.d.plugin/isc_dhcpd)| +isc dhcp|python<br/>v2 or v3|Monitor lease database to show all active leases.<br/> <br/>Python v2 requires package `python-ipaddress`.<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [isc-dhcpd.chart.py](../collectors/python.d.plugin/isc_dhcpd)<br/>configuration file: [python.d/isc-dhcpd.conf](../collectors/python.d.plugin/isc_dhcpd)| --- @@ -243,8 +247,8 @@ isc dhcp|python<br/>v2 or v3|Monitor lease database to show all active leases.<b application|language|notes| :---------:|:------:|:----| -haproxy|python<br/>v2 or v3|Monitor frontend, backend and health metrics.<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [haproxy.chart.py](../collectors/python.d.plugin/haproxy)<br/>configuration file: [python.d/haproxy.conf](../collectors/python.d.plugin/haproxy)| -traefik|python<br/>v2 or v3|Connects to multiple traefik instances (local or remote) to collect API metrics (response status code, response time, average response time and server uptime).<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [traefik.chart.py](../collectors/python.d.plugin/traefik)<br/>configuration file: [python.d/traefik.conf](../collectors/python.d.plugin/traefik)| +haproxy|python<br/>v2 or v3|Monitor frontend, backend and health metrics.<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: 
[haproxy.chart.py](../collectors/python.d.plugin/haproxy)<br/>configuration file: [python.d/haproxy.conf](../collectors/python.d.plugin/haproxy)| +traefik|python<br/>v2 or v3|Connects to multiple traefik instances (local or remote) to collect API metrics (response status code, response time, average response time and server uptime).<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [traefik.chart.py](../collectors/python.d.plugin/traefik)<br/>configuration file: [python.d/traefik.conf](../collectors/python.d.plugin/traefik)| --- @@ -252,8 +256,8 @@ traefik|python<br/>v2 or v3|Connects to multiple traefik instances (local or rem application|language|notes| :---------:|:------:|:----| -rabbitmq|python<br/>v2 or v3|Monitor rabbitmq performance and health metrics.<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [rabbitmq.chart.py](../collectors/python.d.plugin/rabbitmq)<br/>configuration file: [python.d/rabbitmq.conf](../collectors/python.d.plugin/rabbitmq)| -beanstalkd|python<br/>v2 or v3|Provides server and tube level statistics.<br/> <br/>Requires beanstalkc python package (`pip install beanstalkc` or install package `python-beanstalkc`, which also installs `python-yaml`).<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [beanstalk.chart.py](../collectors/python.d.plugin/beanstalk)<br/>configuration file: [python.d/beanstalk.conf](../collectors/python.d.plugin/beanstalk)| +rabbitmq|python<br/>v2 or v3|Monitor rabbitmq performance and health metrics.<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [rabbitmq.chart.py](../collectors/python.d.plugin/rabbitmq)<br/>configuration file: [python.d/rabbitmq.conf](../collectors/python.d.plugin/rabbitmq)| +beanstalkd|python<br/>v2 or v3|Provides server and tube level statistics.<br/> <br/>Requires beanstalkc python package (`pip install beanstalkc` 
or install package `python-beanstalkc`, which also installs `python-yaml`).<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [beanstalk.chart.py](../collectors/python.d.plugin/beanstalk)<br/>configuration file: [python.d/beanstalk.conf](../collectors/python.d.plugin/beanstalk)| --- @@ -262,8 +266,8 @@ beanstalkd|python<br/>v2 or v3|Provides server and tube level statistics.<br/>&n application|language|notes| :---------:|:------:|:----| -apcupsd|BASH<br/>Shell Script|Connects to an apcupsd server to collect real-time statistics of an APC UPS.<br/> <br/>netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [apcupsd.chart.sh](../collectors/charts.d.plugin/apcupsd)<br/>configuration file: [charts.d/apcupsd.conf](../collectors/charts.d.plugin/apcupsd)| -nut|BASH<br/>Shell Script|Connects to a nut server (upsd) to collect real-time UPS statistics.<br/> <br/>netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [nut.chart.sh](../collectors/charts.d.plugin/nut)<br/>configuration file: [charts.d/nut.conf](../collectors/charts.d.plugin/nut)| +apcupsd|BASH<br/>Shell Script|Connects to an apcupsd server to collect real-time statistics of an APC UPS.<br/> <br/>Netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [apcupsd.chart.sh](../collectors/charts.d.plugin/apcupsd)<br/>configuration file: [charts.d/apcupsd.conf](../collectors/charts.d.plugin/apcupsd)| +nut|BASH<br/>Shell Script|Connects to a nut server (upsd) to collect real-time UPS statistics.<br/> <br/>Netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [nut.chart.sh](../collectors/charts.d.plugin/nut)<br/>configuration file: [charts.d/nut.conf](../collectors/charts.d.plugin/nut)| --- @@ -272,7 +276,7 @@ nut|BASH<br/>Shell Script|Connects to a nut server (upsd) to collect real-time U 
application|language|notes| :---------:|:------:|:----| -megacli|python<br/>v2 or v3|Collects adapter, physical drives and battery stats..<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [megacli.chart.py](../collectors/python.d.plugin/megacli)<br/>configuration file: [python.d/megacli.conf](../collectors/python.d.plugin/megacli)| +megacli|python<br/>v2 or v3|Collects adapter, physical drives and battery stats.<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [megacli.chart.py](../collectors/python.d.plugin/megacli)<br/>configuration file: [python.d/megacli.conf](../collectors/python.d.plugin/megacli)| --- @@ -280,11 +284,11 @@ megacli|python<br/>v2 or v3|Collects adapter, physical drives and battery stats. application|language|notes| :---------:|:------:|:----| -dovecot|python<br/>v2 or v3|Connects to multiple dovecot servers (local or remote) to collect real-time performance metrics.<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [dovecot.chart.py](../collectors/python.d.plugin/dovecot)<br/>configuration file: [python.d/dovecot.conf](../collectors/python.d.plugin/dovecot)| -exim|python<br/>v2 or v3|Charts the exim queue size.<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [exim.chart.py](../collectors/python.d.plugin/exim)<br/>configuration file: [python.d/exim.conf](../collectors/python.d.plugin/exim)| -exim|BASH<br/>Shell Script|Charts the exim queue size.<br/><br/>DEPRECATED IN FAVOR OF THE PYTHON ONE. 
It is still supplied only as an example module to shell scripting plugins.<br/> <br/>netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [exim.chart.sh](../collectors/charts.d.plugin/exim)<br/>configuration file: [charts.d/exim.conf](../collectors/charts.d.plugin/exim)| -postfix|python<br/>v2 or v3|Charts the postfix queue size (supports multiple queues).<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [postfix.chart.py](../collectors/python.d.plugin/postfix)<br/>configuration file: [python.d/postfix.conf](../collectors/python.d.plugin/postfix)| -postfix|BASH<br/>Shell Script|Charts the postfix queue size.<br/><br/>DEPRECATED IN FAVOR OF THE PYTHON ONE. It is still supplied only as an example module to shell scripting plugins.<br/> <br/>netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [postfix.chart.sh](../collectors/charts.d.plugin/postfix)<br/>configuration file: [charts.d/postfix.conf](../collectors/charts.d.plugin/postfix)| +dovecot|python<br/>v2 or v3|Connects to multiple dovecot servers (local or remote) to collect real-time performance metrics.<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [dovecot.chart.py](../collectors/python.d.plugin/dovecot)<br/>configuration file: [python.d/dovecot.conf](../collectors/python.d.plugin/dovecot)| +exim|python<br/>v2 or v3|Charts the exim queue size.<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [exim.chart.py](../collectors/python.d.plugin/exim)<br/>configuration file: [python.d/exim.conf](../collectors/python.d.plugin/exim)| +exim|BASH<br/>Shell Script|Charts the exim queue size.<br/><br/>DEPRECATED IN FAVOR OF THE PYTHON ONE. 
It is still supplied only as an example module to shell scripting plugins.<br/> <br/>Netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [exim.chart.sh](../collectors/charts.d.plugin/exim)<br/>configuration file: [charts.d/exim.conf](../collectors/charts.d.plugin/exim)| +postfix|python<br/>v2 or v3|Charts the postfix queue size (supports multiple queues).<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [postfix.chart.py](../collectors/python.d.plugin/postfix)<br/>configuration file: [python.d/postfix.conf](../collectors/python.d.plugin/postfix)| +postfix|BASH<br/>Shell Script|Charts the postfix queue size.<br/><br/>DEPRECATED IN FAVOR OF THE PYTHON ONE. It is still supplied only as an example module to shell scripting plugins.<br/> <br/>Netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [postfix.chart.sh](../collectors/charts.d.plugin/postfix)<br/>configuration file: [charts.d/postfix.conf](../collectors/charts.d.plugin/postfix)| --- @@ -293,7 +297,7 @@ postfix|BASH<br/>Shell Script|Charts the postfix queue size.<br/><br/>DEPRECATED application|language|notes| :---------:|:------:|:----| -NFS Client|`C`|This is handled entirely by the netdata daemon.<br/> <br/>Configuration: `netdata.conf`, section `[plugin:proc:/proc/net/rpc/nfs]`. +NFS Client|`C`|This is handled entirely by the Netdata daemon.<br/> <br/>Configuration: `netdata.conf`, section `[plugin:proc:/proc/net/rpc/nfs]`. NFS Server|`C`|This is handled entirely by the netdata daemon.<br/> <br/>Configuration: `netdata.conf`, section `[plugin:proc:/proc/net/rpc/nfsd]`. 
samba|python<br/>v2 or v3|Performance metrics of Samba SMB2 file sharing.<br/> <br/>documentation page: [python.d.plugin module samba](../collectors/python.d.plugin/samba)<br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [samba.chart.py](../collectors/python.d.plugin/samba)<br/>configuration file: [python.d/samba.conf](../collectors/python.d.plugin/samba)| @@ -319,11 +323,12 @@ xenstat|C|Collects host and domain statistics for XenServer or XCP-ng hypervisor application|language|notes| :---------:|:------:|:----| -apps|C|`apps.plugin` collects resource usage statistics for all processes running in the system. It groups the entire process tree and reports dozens of metrics for CPU utilization, memory footprint, disk I/O, swap memory, network connections, open files and sockets, etc. It reports metrics for application groups, users and user groups.<br/> <br/>[Documentation of `apps.plugin`](../collectors/apps.plugin/).<br/> <br/>netdata plugin: [`apps_plugin.c`](../collectors/apps.plugin)<br/>configuration file: [`apps_groups.conf`](../collectors/apps.plugin)| -ioping|C|Charts disk latency statistics for a directory/file/device, using the `ioping` command. A recent (probably unreleased) version of ioping is required. The plugin supplied can install it in `/usr/local`.<br/> <br/>netdata plugin: [ioping.plugin](../collectors/ioping.plugin) (this is a shell wrapper to start ioping - once ioping is started, netdata and ioping communicate directly - it can also install the right version of ioping)<br/>configuration file: [ioping.conf](../collectors/ioping.plugin)| -cpu_apps|BASH<br/>Shell Script|Collects the CPU utilization of select apps.<br/><br/>DEPRECATED IN FAVOR OF `apps.plugin`. 
It is still supplied only as an example module to shell scripting plugins.<br/> <br/>netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [cpu_apps.chart.sh](../collectors/charts.d.plugin/cpu_apps)<br/>configuration file: [charts.d/cpu_apps.conf](../collectors/charts.d.plugin/cpu_apps)| -load_average|BASH<br/>Shell Script|Collects the current system load average.<br/><br/>DEPRECATED IN FAVOR OF THE NETDATA INTERNAL ONE. It is still supplied only as an example module to shell scripting plugins.<br/> <br/>netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [load_average.chart.sh](../collectors/charts.d.plugin/load_average)<br/>configuration file: [charts.d/load_average.conf](../collectors/charts.d.plugin/load_average)| -mem_apps|BASH<br/>Shell Script|Collects the memory footprint of select applications.<br/><br/>DEPRECATED IN FAVOR OF `apps.plugin`. It is still supplied only as an example module to shell scripting plugins.<br/> <br/>netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [mem_apps.chart.sh](../collectors/charts.d.plugin/mem_apps)<br/>configuration file: [charts.d/mem_apps.conf](../collectors/charts.d.plugin/mem_apps)| +apps|C|`apps.plugin` collects resource usage statistics for all processes running in the system. It groups the entire process tree and reports dozens of metrics for CPU utilization, memory footprint, disk I/O, swap memory, network connections, open files and sockets, etc. It reports metrics for application groups, users and user groups.<br/> <br/>[Documentation of `apps.plugin`](../collectors/apps.plugin/).<br/> <br/>Netdata plugin: [`apps_plugin.c`](../collectors/apps.plugin)<br/>configuration file: [`apps_groups.conf`](../collectors/apps.plugin)| +ioping|C|Charts disk latency statistics for a directory/file/device, using the `ioping` command. A recent (probably unreleased) version of ioping is required. 
The plugin supplied can install it in `/usr/local`.<br/> <br/>Netdata plugin: [ioping.plugin](../collectors/ioping.plugin) (this is a shell wrapper to start ioping - once ioping is started, Netdata and ioping communicate directly - it can also install the right version of ioping)<br/>configuration file: [ioping.conf](../collectors/ioping.plugin)| +perf|C|`perf.plugin` collects CPU performance metrics using hardware performance monitoring units (PMU).<br/> <br/>[Documentation of `perf.plugin`](../collectors/perf.plugin/).<br/> <br/>Netdata plugin: [`perf_plugin.c`](../collectors/perf.plugin)| +cpu_apps|BASH<br/>Shell Script|Collects the CPU utilization of select apps.<br/><br/>DEPRECATED IN FAVOR OF `apps.plugin`. It is still supplied only as an example module to shell scripting plugins.<br/> <br/>Netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [cpu_apps.chart.sh](../collectors/charts.d.plugin/cpu_apps)<br/>configuration file: [charts.d/cpu_apps.conf](../collectors/charts.d.plugin/cpu_apps)| +load_average|BASH<br/>Shell Script|Collects the current system load average.<br/><br/>DEPRECATED IN FAVOR OF THE NETDATA INTERNAL ONE. It is still supplied only as an example module to shell scripting plugins.<br/> <br/>Netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [load_average.chart.sh](../collectors/charts.d.plugin/load_average)<br/>configuration file: [charts.d/load_average.conf](../collectors/charts.d.plugin/load_average)| +mem_apps|BASH<br/>Shell Script|Collects the memory footprint of select applications.<br/><br/>DEPRECATED IN FAVOR OF `apps.plugin`. 
It is still supplied only as an example module to shell scripting plugins.<br/> <br/>Netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [mem_apps.chart.sh](../collectors/charts.d.plugin/mem_apps)<br/>configuration file: [charts.d/mem_apps.conf](../collectors/charts.d.plugin/mem_apps)| --- @@ -332,14 +337,14 @@ mem_apps|BASH<br/>Shell Script|Collects the memory footprint of select applicati application|language|notes| :---------:|:------:|:----| -cpufreq|BASH<br/>Shell Script|Collects current CPU frequency from `/sys/devices`.<br/><br/>DEPRECATED IN FAVOR OF THE PYTHON ONE. It is still supplied only as an example module to shell scripting plugins.<br/> <br/>netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [cpufreq.chart.sh](../collectors/charts.d.plugin/cpufreq)<br/>configuration file: [charts.d/cpufreq.conf](../collectors/charts.d.plugin/cpufreq)| -IPMI|C|Collects temperatures, voltages, currents, power, fans and `SEL` events from IPMI using `libipmimonitoring`.<br/>Check [Monitoring IPMI](../collectors/freeipmi.plugin/) for more information<br/> <br/>netdata plugin: [freeipmi.plugin](../collectors/freeipmi.plugin)<br/>configuration file: none required - to enable it, compile/install netdata with `--enable-plugin-freeipmi`| -hddtemp|python<br/>v2 or v3|Connects to multiple hddtemp servers (local or remote) to collect real-time performance metrics.<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [hddtemp.chart.py](../collectors/python.d.plugin/hddtemp)<br/>configuration file: [python.d/hddtemp.conf](../collectors/python.d.plugin/hddtemp)| -hddtemp|BASH<br/>Shell Script|Connects to a hddtemp server (local or remote) to collect real-time performance metrics.<br/><br/>DEPRECATED IN FAVOR OF THE PYTHON ONE. 
It is still supplied only as an example module to shell scripting plugins.<br/> <br/>netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [hddtemp.chart.sh](../collectors/charts.d.plugin/hddtemp)<br/>configuration file: [charts.d/hddtemp.conf](../collectors/charts.d.plugin/hddtemp)| -sensors|BASH<br/>Shell Script|Collects sensors values from files in `/sys`.<br/><br/>DEPRECATED IN FAVOR OF THE PYTHON ONE. It is still supplied only as an example module to shell scripting plugins.<br/> <br/>netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [sensors.chart.sh](../collectors/charts.d.plugin/sensors)<br/>configuration file: [charts.d/sensors.conf](../collectors/charts.d.plugin/sensors)| -sensors|python<br/>v2 or v3|Uses `lm-sensors` to collect sensor data.<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [sensors.chart.py](../collectors/python.d.plugin/sensors)<br/>configuration file: [python.d/sensors.conf](../collectors/python.d.plugin/sensors)| -smartd_log|python<br/>v2 or v3|Collects the S.M.A.R.T attributes from `smartd` log files.<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [smartd_log.chart.py](../collectors/python.d.plugin/smartd_log)<br/>configuration file: [python.d/smartd_log.conf](../collectors/python.d.plugin/smartd_log)| -w1sensor|python<br/>v2 or v3|Collects data from connected 1-Wire sensors.<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [w1sensor.chart.py](../collectors/python.d.plugin/w1sensor)<br/>configuration file: [python.d/w1sensor.conf](../collectors/python.d.plugin/w1sensor)| +cpufreq|BASH<br/>Shell Script|Collects current CPU frequency from `/sys/devices`.<br/><br/>DEPRECATED IN FAVOR OF THE PYTHON ONE. 
It is still supplied only as an example module to shell scripting plugins.<br/> <br/>Netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [cpufreq.chart.sh](../collectors/charts.d.plugin/cpufreq)<br/>configuration file: [charts.d/cpufreq.conf](../collectors/charts.d.plugin/cpufreq)| +IPMI|C|Collects temperatures, voltages, currents, power, fans and `SEL` events from IPMI using `libipmimonitoring`.<br/>Check [Monitoring IPMI](../collectors/freeipmi.plugin/) for more information<br/> <br/>Netdata plugin: [freeipmi.plugin](../collectors/freeipmi.plugin)<br/>configuration file: none required - to enable it, compile/install Netdata with `--enable-plugin-freeipmi`| +hddtemp|python<br/>v2 or v3|Connects to multiple hddtemp servers (local or remote) to collect real-time performance metrics.<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [hddtemp.chart.py](../collectors/python.d.plugin/hddtemp)<br/>configuration file: [python.d/hddtemp.conf](../collectors/python.d.plugin/hddtemp)| +hddtemp|BASH<br/>Shell Script|Connects to a hddtemp server (local or remote) to collect real-time performance metrics.<br/><br/>DEPRECATED IN FAVOR OF THE PYTHON ONE. It is still supplied only as an example module to shell scripting plugins.<br/> <br/>Netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [hddtemp.chart.sh](../collectors/charts.d.plugin/hddtemp)<br/>configuration file: [charts.d/hddtemp.conf](../collectors/charts.d.plugin/hddtemp)| +sensors|BASH<br/>Shell Script|Collects sensors values from files in `/sys`.<br/><br/>DEPRECATED IN FAVOR OF THE PYTHON ONE. 
It is still supplied only as an example module to shell scripting plugins.<br/> <br/>Netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [sensors.chart.sh](../collectors/charts.d.plugin/sensors)<br/>configuration file: [charts.d/sensors.conf](../collectors/charts.d.plugin/sensors)| +sensors|python<br/>v2 or v3|Uses `lm-sensors` to collect sensor data.<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [sensors.chart.py](../collectors/python.d.plugin/sensors)<br/>configuration file: [python.d/sensors.conf](../collectors/python.d.plugin/sensors)| +smartd_log|python<br/>v2 or v3|Collects the S.M.A.R.T attributes from `smartd` log files.<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [smartd_log.chart.py](../collectors/python.d.plugin/smartd_log)<br/>configuration file: [python.d/smartd_log.conf](../collectors/python.d.plugin/smartd_log)| +w1sensor|python<br/>v2 or v3|Collects data from connected 1-Wire sensors.<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [w1sensor.chart.py](../collectors/python.d.plugin/w1sensor)<br/>configuration file: [python.d/w1sensor.conf](../collectors/python.d.plugin/w1sensor)| --- @@ -348,11 +353,11 @@ w1sensor|python<br/>v2 or v3|Collects data from connected 1-Wire sensors.<br/>&n application|language|notes| :---------:|:------:|:----| -ap|BASH<br/>Shell Script|Uses the `iw` command to provide statistics of wireless clients connected to a wireless access point running on this host (works well with `hostapd`).<br/> <br/>netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [ap.chart.sh](../collectors/charts.d.plugin/ap)<br/>configuration file: [charts.d/ap.conf](../collectors/charts.d.plugin/ap)| -fping|C|Charts network latency statistics for any number of nodes, using the `fping` command. 
A recent (probably unreleased) version of fping is required. The plugin supplied can install it in `/usr/local`.<br/> <br/>netdata plugin: [fping.plugin](../collectors/fping.plugin) (this is a shell wrapper to start fping - once fping is started, netdata and fping communicate directly - it can also install the right version of fping)<br/>configuration file: [fping.conf](../collectors/fping.plugin)| -snmp|node.js|Connects to multiple snmp servers to collect real-time performance metrics.<br/> <br/>netdata plugin: [node.d.plugin](../collectors/node.d.plugin#nodedplugin)<br/>plugin module: [snmp.node.js](../collectors/node.d.plugin/snmp)<br/>configuration file: [node.d/snmp.conf](../collectors/node.d.plugin/snmp)| +ap|BASH<br/>Shell Script|Uses the `iw` command to provide statistics of wireless clients connected to a wireless access point running on this host (works well with `hostapd`).<br/> <br/>Netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [ap.chart.sh](../collectors/charts.d.plugin/ap)<br/>configuration file: [charts.d/ap.conf](../collectors/charts.d.plugin/ap)| +fping|C|Charts network latency statistics for any number of nodes, using the `fping` command. A recent (probably unreleased) version of fping is required. 
The plugin supplied can install it in `/usr/local`.<br/> <br/>Netdata plugin: [fping.plugin](../collectors/fping.plugin) (this is a shell wrapper to start fping - once fping is started, Netdata and fping communicate directly - it can also install the right version of fping)<br/>configuration file: [fping.conf](../collectors/fping.plugin)| +snmp|node.js|Connects to multiple snmp servers to collect real-time performance metrics.<br/> <br/>Netdata plugin: [node.d.plugin](../collectors/node.d.plugin#nodedplugin)<br/>plugin module: [snmp.node.js](../collectors/node.d.plugin/snmp)<br/>configuration file: [node.d/snmp.conf](../collectors/node.d.plugin/snmp)| nfacct|C|collects netfilter firewall, connection tracker and accounting metrics using `libmnl` and `libnetfilter_acct`| -dns_query_time|python<br/>v2 or v3|Provides DNS query time statistics.<br/> <br/>Requires package `dnspython` (`pip install dnspython` or install package `python-dnspython`).<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [dns_query_time.chart.py](../collectors/python.d.plugin/dns_query_time)<br/>configuration file: [python.d/dns_query_time.conf](../collectors/python.d.plugin/dns_query_time)| +dns_query_time|python<br/>v2 or v3|Provides DNS query time statistics.<br/> <br/>Requires package `dnspython` (`pip install dnspython` or install package `python-dnspython`).<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [dns_query_time.chart.py](../collectors/python.d.plugin/dns_query_time)<br/>configuration file: [python.d/dns_query_time.conf](../collectors/python.d.plugin/dns_query_time)| http|python<br />v2 or v3|Monitors a generic web page for status code and returned content in HTML port|ptyhon<br />v2 or v3|Checks if a generic TCP port for its availability and response time @@ -363,8 +368,8 @@ port|ptyhon<br />v2 or v3|Checks if a generic TCP port for its availability and application|language|notes| 
:---------:|:------:|:----| -chrony|python<br/>v2 or v3|Uses the chronyc command to provide chrony statistics (Frequency, Last offset, RMS offset, Residual freq, Root delay, Root dispersion, Skew, System time).<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [chrony.chart.py](../collectors/python.d.plugin/chrony)<br/>configuration file: [python.d/chrony.conf](../collectors/python.d.plugin/chrony)| -ntpd|python<br/>v2 or v3|Connects to multiple ntpd servers (local or remote) to provide statistics of system variables and optional also peer variables (if enabled in the configuration).<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [ntpd.chart.py](../collectors/python.d.plugin/ntpd)<br/>configuration file: [python.d/ntpd.conf](../collectors/python.d.plugin/ntpd)| +chrony|python<br/>v2 or v3|Uses the chronyc command to provide chrony statistics (Frequency, Last offset, RMS offset, Residual freq, Root delay, Root dispersion, Skew, System time).<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [chrony.chart.py](../collectors/python.d.plugin/chrony)<br/>configuration file: [python.d/chrony.conf](../collectors/python.d.plugin/chrony)| +ntpd|python<br/>v2 or v3|Connects to multiple ntpd servers (local or remote) to provide statistics of system variables and optional also peer variables (if enabled in the configuration).<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [ntpd.chart.py](../collectors/python.d.plugin/ntpd)<br/>configuration file: [python.d/ntpd.conf](../collectors/python.d.plugin/ntpd)| --- @@ -373,9 +378,9 @@ ntpd|python<br/>v2 or v3|Connects to multiple ntpd servers (local or remote) to application|language|notes| :---------:|:------:|:----| -freeradius|python<br/>v2 or v3|Uses the radclient command to provide freeradius statistics (authentication, accounting, 
proxy-authentication, proxy-accounting).<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [freeradius.chart.py](../collectors/python.d.plugin/freeradius)<br/>configuration file: [python.d/freeradius.conf](../collectors/python.d.plugin/freeradius)| -openvpn|python<br/>v2 or v3|All data from openvpn-status.log in your dashboard! <br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [ovpn_status_log.chart.py](../collectors/python.d.plugin/ovpn_status_log)<br/>configuration file: [python.d/ovpn_status_log.conf](../collectors/python.d.plugin/ovpn_status_log)| -fail2ban|python<br/>v2 or v3|Monitor fail2ban log file to show all bans for all active jails <br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [fail2ban.chart.py](../collectors/python.d.plugin/fail2ban)<br/>configuration file: [python.d/fail2ban.conf](../collectors/python.d.plugin/fail2ban)| +freeradius|python<br/>v2 or v3|Uses the radclient command to provide freeradius statistics (authentication, accounting, proxy-authentication, proxy-accounting).<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [freeradius.chart.py](../collectors/python.d.plugin/freeradius)<br/>configuration file: [python.d/freeradius.conf](../collectors/python.d.plugin/freeradius)| +openvpn|python<br/>v2 or v3|All data from openvpn-status.log in your dashboard! 
<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [ovpn_status_log.chart.py](../collectors/python.d.plugin/ovpn_status_log)<br/>configuration file: [python.d/ovpn_status_log.conf](../collectors/python.d.plugin/ovpn_status_log)| +fail2ban|python<br/>v2 or v3|Monitor fail2ban log file to show all bans for all active jails <br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [fail2ban.chart.py](../collectors/python.d.plugin/fail2ban)<br/>configuration file: [python.d/fail2ban.conf](../collectors/python.d.plugin/fail2ban)| --- @@ -384,7 +389,7 @@ fail2ban|python<br/>v2 or v3|Monitor fail2ban log file to show all bans for all application|language|notes| :---------:|:------:|:----| -opensips|BASH<br/>Shell Script|Connects to an opensips server (local only) to collect real-time performance metrics.<br/> <br/>netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [opensips.chart.sh](../collectors/charts.d.plugin/opensips)<br/>configuration file: [charts.d/opensips.conf](../collectors/charts.d.plugin/opensips)| +opensips|BASH<br/>Shell Script|Connects to an opensips server (local only) to collect real-time performance metrics.<br/> <br/>Netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [opensips.chart.sh](../collectors/charts.d.plugin/opensips)<br/>configuration file: [charts.d/opensips.conf](../collectors/charts.d.plugin/opensips)| --- @@ -393,7 +398,7 @@ opensips|BASH<br/>Shell Script|Connects to an opensips server (local only) to co application|language|notes| :---------:|:------:|:----| -go_expvar|python<br/>v2 or v3|Parses metrics exposed by applications written in the Go programming language using the [expvar package](https://golang.org/pkg/expvar/).<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: 
[go_expvar.chart.py](../collectors/python.d.plugin/go_expvar)<br/>configuration file: [python.d/go_expvar.conf](../collectors/python.d.plugin/go_expvar)<br/>documentation: [Monitoring Go Applications](../collectors/python.d.plugin/go_expvar/)| +go_expvar|python<br/>v2 or v3|Parses metrics exposed by applications written in the Go programming language using the [expvar package](https://golang.org/pkg/expvar/).<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [go_expvar.chart.py](../collectors/python.d.plugin/go_expvar)<br/>configuration file: [python.d/go_expvar.conf](../collectors/python.d.plugin/go_expvar)<br/>documentation: [Monitoring Go Applications](../collectors/python.d.plugin/go_expvar/)| --- @@ -402,9 +407,9 @@ go_expvar|python<br/>v2 or v3|Parses metrics exposed by applications written in application|language|notes| :---------:|:------:|:----| -sma_webbox|node.js|Connects to multiple remote SMA webboxes to collect real-time performance metrics of the photovoltaic (solar) power generation.<br/> <br/>netdata plugin: [node.d.plugin](../collectors/node.d.plugin#nodedplugin)<br/>plugin module: [sma_webbox.node.js](../collectors/node.d.plugin/sma_webbox)<br/>configuration file: [node.d/sma_webbox.conf](../collectors/node.d.plugin/sma_webbox)| -fronius|node.js|Connects to multiple remote Fronius Symo servers to collect real-time performance metrics of the photovoltaic (solar) power generation.<br/> <br/>netdata plugin: [node.d.plugin](../collectors/node.d.plugin#nodedplugin)<br/>plugin module: [fronius.node.js](../collectors/node.d.plugin/fronius)<br/>configuration file: [node.d/fronius.conf](../collectors/node.d.plugin/fronius)| -stiebeleltron|node.js|Collects the temperatures and other metrics from your Stiebel Eltron heating system using their Internet Service Gateway (ISG web).<br/> <br/>netdata plugin: [node.d.plugin](../collectors/node.d.plugin#nodedplugin)<br/>plugin module: 
[stiebeleltron.node.js](../collectors/node.d.plugin/stiebeleltron)<br/>configuration file: [node.d/stiebeleltron.conf](../collectors/node.d.plugin/stiebeleltron)| +sma_webbox|node.js|Connects to multiple remote SMA webboxes to collect real-time performance metrics of the photovoltaic (solar) power generation.<br/> <br/>Netdata plugin: [node.d.plugin](../collectors/node.d.plugin#nodedplugin)<br/>plugin module: [sma_webbox.node.js](../collectors/node.d.plugin/sma_webbox)<br/>configuration file: [node.d/sma_webbox.conf](../collectors/node.d.plugin/sma_webbox)| +fronius|node.js|Connects to multiple remote Fronius Symo servers to collect real-time performance metrics of the photovoltaic (solar) power generation.<br/> <br/>Netdata plugin: [node.d.plugin](../collectors/node.d.plugin#nodedplugin)<br/>plugin module: [fronius.node.js](../collectors/node.d.plugin/fronius)<br/>configuration file: [node.d/fronius.conf](../collectors/node.d.plugin/fronius)| +stiebeleltron|node.js|Collects the temperatures and other metrics from your Stiebel Eltron heating system using their Internet Service Gateway (ISG web).<br/> <br/>Netdata plugin: [node.d.plugin](../collectors/node.d.plugin#nodedplugin)<br/>plugin module: [stiebeleltron.node.js](../collectors/node.d.plugin/stiebeleltron)<br/>configuration file: [node.d/stiebeleltron.conf](../collectors/node.d.plugin/stiebeleltron)| --- @@ -413,7 +418,7 @@ stiebeleltron|node.js|Collects the temperatures and other metrics from your Stie application|language|notes| :---------:|:------:|:----| -Spring Boot Application|java|Monitors running Java [Spring Boot](https://spring.io/) applications that expose their metrics with the use of the **Spring Boot Actuator** included in Spring Boot library.<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [springboot](../collectors/python.d.plugin/springboot)<br/>configuration file: [python.d/springboot.conf](../collectors/python.d.plugin/springboot) +Spring Boot 
Application|java|Monitors running Java [Spring Boot](https://spring.io/) applications that expose their metrics with the use of the **Spring Boot Actuator** included in Spring Boot library.<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [springboot](../collectors/python.d.plugin/springboot)<br/>configuration file: [python.d/springboot.conf](../collectors/python.d.plugin/springboot) --- @@ -422,7 +427,7 @@ Spring Boot Application|java|Monitors running Java [Spring Boot](https://spring. application|language|notes| :---------:|:------:|:----| -puppet|python<br/>v2 or v3|Connects to multiple Puppet Server and Puppet DB instances (local or remote) to collect real-time status metrics.<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [puppet.chart.py](../collectors/python.d.plugin/puppet)<br/>configuration file: [python.d/puppet.conf](../collectors/python.d.plugin/puppet)| +puppet|python<br/>v2 or v3|Connects to multiple Puppet Server and Puppet DB instances (local or remote) to collect real-time status metrics.<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [puppet.chart.py](../collectors/python.d.plugin/puppet)<br/>configuration file: [python.d/puppet.conf](../collectors/python.d.plugin/puppet)| --- @@ -430,7 +435,7 @@ puppet|python<br/>v2 or v3|Connects to multiple Puppet Server and Puppet DB inst application|language|notes| :---------:|:------:|:----| -SpigotMC|Python<br/>v2 or v3|Monitors Spigot Minecraft server ticks per second and number of online players using the Minecraft remote console.<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [spigotmc.chart.py](../collectors/python.d.plugin/spigotmc)<br/>configuration file: [python.d/spigotmc.conf](../collectors/python.d.plugin/spigotmc)| +SpigotMC|Python<br/>v2 or v3|Monitors Spigot Minecraft server ticks per second and number of online 
players using the Minecraft remote console.<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [spigotmc.chart.py](../collectors/python.d.plugin/spigotmc)<br/>configuration file: [python.d/spigotmc.conf](../collectors/python.d.plugin/spigotmc)| --- @@ -438,7 +443,7 @@ SpigotMC|Python<br/>v2 or v3|Monitors Spigot Minecraft server ticks per second a application|language|notes| :---------:|:------:|:----| -BOINC|Python<br/>v2 or v3|Monitors task states for local and remote BOINC client software using the remote GUI RPC interface. Also provides alarms for a handful of error conditions. Requires manual configuration<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [boinc.chart.py](../collectors/python.d.plugin/boinc)<br/>configuration file: [python.d/boinc.conf](../collectors/python.d.plugin/boinc)| +BOINC|Python<br/>v2 or v3|Monitors task states for local and remote BOINC client software using the remote GUI RPC interface. Also provides alarms for a handful of error conditions. Requires manual configuration<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [boinc.chart.py](../collectors/python.d.plugin/boinc)<br/>configuration file: [python.d/boinc.conf](../collectors/python.d.plugin/boinc)| --- @@ -446,7 +451,7 @@ BOINC|Python<br/>v2 or v3|Monitors task states for local and remote BOINC client application|language|notes| :---------:|:------:|:----| -example|BASH<br/>Shell Script|Skeleton plugin in BASH.<br/><br/>DEPRECATED IN FAVOR OF THE PYTHON ONE. 
It is still supplied only as an example module to shell scripting plugins.<br/> <br/>netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [example.chart.sh](../collectors/charts.d.plugin/example)<br/>configuration file: [charts.d/example.conf](../collectors/charts.d.plugin/example)| -example|python<br/>v2 or v3|Skeleton plugin in Python.<br/> <br/>netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [example.chart.py](../collectors/python.d.plugin/example)<br/>configuration file: [python.d/example.conf](../collectors/python.d.plugin/example)| +example|BASH<br/>Shell Script|Skeleton plugin in BASH.<br/><br/>DEPRECATED IN FAVOR OF THE PYTHON ONE. It is still supplied only as an example module to shell scripting plugins.<br/> <br/>Netdata plugin: [charts.d.plugin](../collectors/charts.d.plugin#chartsdplugin)<br/>plugin module: [example.chart.sh](../collectors/charts.d.plugin/example)<br/>configuration file: [charts.d/example.conf](../collectors/charts.d.plugin/example)| +example|python<br/>v2 or v3|Skeleton plugin in Python.<br/> <br/>Netdata plugin: [python.d.plugin](../collectors/python.d.plugin)<br/>plugin module: [example.chart.py](../collectors/python.d.plugin/example)<br/>configuration file: [python.d/example.conf](../collectors/python.d.plugin/example)| [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fdocs%2FAdd-more-charts-to-netdata&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() diff --git a/docs/Charts.md b/docs/Charts.md index 64c36302..42ac4453 100644 --- a/docs/Charts.md +++ b/docs/Charts.md @@ -4,7 +4,7 @@ Before configuring an alarm or writing a collector, it's important to understand ## Charts -Each chart that you see on the netdata dashboard contains one or more dimensions, one for each collected or calculated metric. 
+Each chart that you see on the Netdata dashboard contains one or more dimensions, one for each collected or calculated metric. The chart name or chart id is what you see in parentheses at the top left corner of the chart you are interested in. For example, if you go to the system cpu chart: `http://your.netdata.ip:19999/#menu_system_submenu_cpu`, you will see at the top left of the chart the label "Total CPU utilization (system.cpu)". In this case, the chart name is `system.cpu`. @@ -16,7 +16,7 @@ Most charts depict more than one dimensions. The dimensions of a chart are calle When you have several instances of a monitored hardware or software resource (e.g. network interfaces, mysql instances etc.), you need to be able to identify each one separately. Netdata uses "families" to identify such instances. For example, if I have the network interfaces `eth0` and `eth1`, `eth0` will be one family, and `eth1` will be another. -The reasoning behind calling these instances "families" is that different charts for the same instance can and many times are related (relatives, family, you get it). The family of a chart is usually the name of the netdata dashboard submenu that you see selected on the right navigation pane, when you are looking at a chart. For the example of the two network interfaces, you would see a submenu `eth0` and a submenu `eth1` under the "Network Interfaces" menu on the right navigation pane. +The reasoning behind calling these instances "families" is that different charts for the same instance can be, and many times are, related (relatives, family, you get it). The family of a chart is usually the name of the Netdata dashboard submenu that you see selected on the right navigation pane, when you are looking at a chart. For the example of the two network interfaces, you would see a submenu `eth0` and a submenu `eth1` under the "Network Interfaces" menu on the right navigation pane. 
## Contexts diff --git a/docs/Demo-Sites.md b/docs/Demo-Sites.md index f6aad139..0d478d73 100644 --- a/docs/Demo-Sites.md +++ b/docs/Demo-Sites.md @@ -1,10 +1,10 @@ # Demo sites -Live demo installations of netdata are available at **[https://my-netdata.io](https://my-netdata.io)**: +Live demo installations of Netdata are available at **[https://www.netdata.cloud](https://www.netdata.cloud/#live-demo)**: -Location | netdata demo URL | 60 mins reqs | VM Donated by +Location | Netdata demo URL | 60 mins reqs | VM Donated by :-------:|:-----------------:|:----------:|:------------- -London (UK)|**[london.my-netdata.io](https://london.my-netdata.io)**<br/>(this is the global netdata **registry** and has **named** and **mysql** charts)|[![Requests Per Second](https://london.my-netdata.io/api/v1/badge.svg?chart=netdata.requests&dimensions=requests&after=-3600&options=unaligned&group=sum&label=reqs&units=empty&value_color=blue&precision=0&v42)](https://london.my-netdata.io)|[DigitalOcean.com](https://m.do.co/c/83dc9f941745) +London (UK)|**[london.my-netdata.io](https://london.my-netdata.io)**<br/>(this is the global Netdata **registry** and has **named** and **mysql** charts)|[![Requests Per Second](https://london.my-netdata.io/api/v1/badge.svg?chart=netdata.requests&dimensions=requests&after=-3600&options=unaligned&group=sum&label=reqs&units=empty&value_color=blue&precision=0&v42)](https://london.my-netdata.io)|[DigitalOcean.com](https://m.do.co/c/83dc9f941745) Atlanta (USA)|**[cdn77.my-netdata.io](https://cdn77.my-netdata.io)**<br/>(with **named** and **mysql** charts)|[![Requests Per Second](https://cdn77.my-netdata.io/api/v1/badge.svg?chart=netdata.requests&dimensions=requests&after=-3600&options=unaligned&group=sum&label=reqs&units=empty&value_color=blue&precision=0&v42)](https://cdn77.my-netdata.io)|[CDN77.com](https://www.cdn77.com/) Israel|**[octopuscs.my-netdata.io](https://octopuscs.my-netdata.io)**|[![Requests Per 
Second](https://octopuscs.my-netdata.io/api/v1/badge.svg?chart=netdata.requests&dimensions=requests&after=-3600&options=unaligned&group=sum&label=reqs&units=empty&value_color=blue&precision=0&v42)](https://octopuscs.my-netdata.io)|[OctopusCS.com](https://www.octopuscs.com) Roubaix (France)|**[ventureer.my-netdata.io](https://ventureer.my-netdata.io)**|[![Requests Per Second](https://ventureer.my-netdata.io/api/v1/badge.svg?chart=netdata.requests&dimensions=requests&after=-3600&options=unaligned&group=sum&label=reqs&units=empty&value_color=blue&precision=0&v42)](https://ventureer.my-netdata.io)|[Ventureer.com](https://ventureer.com/) diff --git a/docs/Donations-netdata-has-received.md b/docs/Donations-netdata-has-received.md index 3c737be8..062cb02b 100644 --- a/docs/Donations-netdata-has-received.md +++ b/docs/Donations-netdata-has-received.md @@ -1,13 +1,13 @@ # Donations -This is a list of the donations we have received for netdata (sorted alphabetically on their name): +This is a list of the donations we have received for Netdata (sorted alphabetically on their name): what donated|related links|who donated|description of the donation ----:|:-----:|:---:|:----------- Packages Distribution|-|**[PackageCloud.io](https://packagecloud.io/)**|**PackageCloud.io** donated to a free open-source subscription to their awesome Package Distribution services. Cross Browser Testing|-|**[BrowserStack.com](https://www.browserstack.com/)**|**BrowserStack.com** donated a free subscription to their awesome Browser Testing services (all three of them: Live, Screenshots, Responsive). Cloud VM|[cdn77.my-netdata.io](http://cdn77.my-netdata.io)|**[CDN77.com](https://www.cdn77.com/)**|**CDN77.com** donated a VM with 2 CPU cores, 4GB RAM and 20GB HD, on their excellent CDN network. 
-Localization Management|[netdata localization project](https://crowdin.com/project/netdata) (check issue [#279](https://github.com/netdata/netdata/issues/279))|**[Crowdin.com](https://crowdin.com/)**|**Crowdin.com** donated an open source license to their Localization Management Platform. +Localization Management|[Netdata localization project](https://crowdin.com/project/netdata) (check issue [#279](https://github.com/netdata/netdata/issues/279))|**[Crowdin.com](https://crowdin.com/)**|**Crowdin.com** donated an open source license to their Localization Management Platform. Cloud VMs|[london.my-netdata.io](https://london.my-netdata.io) (Several VMs)|**[DigitalOcean.com](https://www.digitalocean.com/)**|**DigitalOcean.com** donated 1000 USD to be used in their excellent Cloud Computing services. Many thanks to [Justin Paine](https://github.com/xxdesmus) for making this happen. Development IDE|-|**[JetBrains.com](https://www.jetbrains.com/)**|**JetBrains.com** donated an open source license for 4 developers for 1 year, to their excellent IDEs. Cloud VM|[octopuscs.my-netdata.io](https://octopuscs.my-netdata.io)|**[OctopusCS.com](https://octopuscs.com/)**|**OctopusCS.com** donated a VM with 4 CPU cores, 16GB RAM and 50GB HD in their excellent Cloud Computing services. @@ -18,7 +18,7 @@ Thank you! --- -**Do you want to donate?** We are thirsty for on-line services that can help us make netdata better. We also try to build a network of demo sites (VMs) that can help us show the full potential of netdata. +**Do you want to donate?** We are thirsty for on-line services that can help us make Netdata better. We also try to build a network of demo sites (VMs) that can help us show the full potential of Netdata. Please contact me at costa@tsaousis.gr. 
diff --git a/docs/GettingStarted.md b/docs/GettingStarted.md index cc58634f..3ddf4c38 100644 --- a/docs/GettingStarted.md +++ b/docs/GettingStarted.md @@ -1,10 +1,10 @@ # Getting Started -These are your first steps **after** you have installed netdata. If you haven't installed it already, please check the [installation page](../packaging/installer). +These are your first steps **after** you have installed Netdata. If you haven't installed it already, please check the [installation page](../packaging/installer). ## Accessing the dashboard -To access the netdata dashboard, navigate with your browser to: +To access the Netdata dashboard, navigate with your browser to: ``` http://your.server.ip:19999/ @@ -14,7 +14,7 @@ http://your.server.ip:19999/ **Verify Netdata is running.** -Open an ssh session to the server and execute `sudo ps -e | grep netdata`. It should respond with the PID of the netdata daemon. If it prints nothing, Netdata is not running. Check the [installation page](../packaging/installer) to install it. +Open an ssh session to the server and execute `sudo ps -e | grep netdata`. It should respond with the PID of the Netdata daemon. If it prints nothing, Netdata is not running. Check the [installation page](../packaging/installer) to install it. **Verify Netdata responds to HTTP requests.** @@ -32,9 +32,9 @@ If still Netdata does not receive the requests, something is blocking them. A fi </details> <br/> -When you install multiple Netdata servers, all your servers will appear at the `my-netdata` menu at the top left of the dashboard. For this to work, you have to manually access just once, the dashboard of each of your netdata servers. +When you install multiple Netdata servers, all your servers will appear at the node menu at the top left of the dashboard. For this to work, you have to manually access the dashboard of each of your Netdata servers just once. -The `my-netdata` menu is more than just browser bookmarks. 
When switching Netdata servers from that menu, any settings of the current view are propagated to the other netdata server: +The node menu is more than just browser bookmarks. When switching Netdata servers from that menu, any settings of the current view are propagated to the other Netdata server: - the current charts panning (drag the charts left or right), - the current charts zooming (`SHIFT` + mouse wheel over a chart), @@ -43,7 +43,7 @@ The `my-netdata` menu is more than just browser bookmarks. When switching Netdat - the theme you use, - etc. -are all sent over to other netdata server, to allow you troubleshoot cross-server performance issues easily. +are all sent over to the other Netdata server, to allow you to troubleshoot cross-server performance issues easily. ## Starting and stopping Netdata @@ -55,15 +55,15 @@ To start/stop Netdata, depending on your environment, you should use: - `service netdata start` and `service netdata stop` - `/etc/init.d/netdata start` and `/etc/init.d/netdata stop` -Once netdata is installed, the installer configures it to start at boot and stop at shutdown. +Once Netdata is installed, the installer configures it to start at boot and stop at shutdown. For more information about using these commands, consult your system documentation. ## Sizing Netdata -The default installation of netdata is configured for a small round-robin database: just 1 hour of data. Depending on the memory your system has and the amount you can dedicate to Netdata, you should adapt this. On production systems with limited RAM, we suggest to set this to 3-4 hours. For best results you should set this to 24 or 48 hours. +The default installation of Netdata is configured for a small round-robin database: just 1 hour of data. Depending on the memory your system has and the amount you can dedicate to Netdata, you should adapt this. On production systems with limited RAM, we suggest setting this to 3-4 hours. For best results you should set this to 24 or 48 hours. 
-For every hour of data, Netdata needs about 25MB of RAM. If you can dedicate about 100MB of RAM to netdata, you should set its database size to 4 hours. +For every hour of data, Netdata needs about 25MB of RAM. If you can dedicate about 100MB of RAM to Netdata, you should set its database size to 4 hours. To do this, edit `/etc/netdata/netdata.conf` (or `/opt/netdata/etc/netdata/netdata.conf`) and set: @@ -77,24 +77,24 @@ Make sure the `history` line is not commented (comment lines start with `#`). 1 hour is 3600 seconds, so the number you need to set is the result of `HOURS * 3600`. !!! danger - Be careful when you set this on production systems. If you set it too high, your system may run out of memory. By default, netdata is configured to be killed first when the system starves for memory, but better be careful to avoid issues. + Be careful when you set this on production systems. If you set it too high, your system may run out of memory. By default, Netdata is configured to be killed first when the system starves for memory, but better be careful to avoid issues. For more information about Netdata memory requirements, [check this page](../database). -If your kernel supports KSM (most do), you can [enable KSM to half netdata memory requirement](../database#ksm). +If your kernel supports KSM (most do), you can [enable KSM to halve Netdata's memory requirement](../database#ksm). ## Service discovery and auto-detection Netdata supports auto-detection of data collection sources. It auto-detects almost everything: database servers, web servers, dns server, etc. -This auto-detection process happens **only once**, when netdata starts. To have Netdata re-discover data sources, you need to restart it. There are a few exceptions to this: +This auto-detection process happens **only once**, when Netdata starts. To have Netdata re-discover data sources, you need to restart it. 
There are a few exceptions to this: - containers and VMs are auto-detected forever (when Netdata is running at the host). - many data sources are collected but are silenced by default, until there is useful information to collect (for example network interface dropped packet, will appear after a packet has been dropped). - services that are not optimal to collect on all systems, are disabled by default. - services we received feedback from users that caused issues when monitored, are also disabled by default (for example, `chrony` is disabled by default, because CentOS ships a version of it that uses 100% CPU when queried for statistics). -Once a data collection source is detected, netdata will never quit trying to collect data from it, until Netdata is restarted. So, if you stop your web server, netdata will pick it up automatically when it is started again. +Once a data collection source is detected, Netdata will never quit trying to collect data from it, until Netdata is restarted. So, if you stop your web server, Netdata will pick it up automatically when it is started again. Since Netdata is installed on all your systems (even inside containers), auto-detection is limited to `localhost`. This simplifies significantly the security model of a Netdata monitored infrastructure, since most applications allow `localhost` access by default. 
@@ -104,10 +104,10 @@ A few well known data collection sources that commonly need to be configured are ## Configuration quick start -In netdata we have: +In Netdata we have: -- **internal** data collection plugins (running inside the netdata daemon) -- **external** data collection plugins (independent processes, sending data to netdata over pipes) +- **internal** data collection plugins (running inside the Netdata daemon) +- **external** data collection plugins (independent processes, sending data to Netdata over pipes) - modular plugin **orchestrators** (external plugins that have multiple data collection modules) You can enable and disable plugins (internal and external) via `netdata.conf` at the section `[plugins]`. @@ -151,7 +151,7 @@ sudo /etc/netdata/edit-config python.d/nginx.conf Netdata ships hundreds of health monitoring alarms for detecting anomalies. These are optimized for production servers. -Many users install netdata on workstations and are frustrated by the default alarms shipped with netdata. On these cases, we suggest to disable health monitoring. +Many users install Netdata on workstations and are frustrated by the default alarms shipped with Netdata. In these cases, we suggest disabling health monitoring. To disable it, edit `/etc/netdata/netdata.conf` (or `/opt/netdata/etc/netdata/netdata.conf` if you installed the static 64bit package) and set: @@ -176,7 +176,7 @@ and set `SEND_EMAIL="NO"`. - Check [Data Collection](../collectors) for configuring data collection plugins. - Check [Health Monitoring](../health) for configuring your own alarms, or setting up alarm notifications. -- Check [Streaming](../streaming) for centralizing netdata metrics. -- Check [Backends](../backends) for long term archiving of netdata metrics to time-series databases. +- Check [Streaming](../streaming) for centralizing Netdata metrics. +- Check [Backends](../backends) for long term archiving of Netdata metrics to time-series databases.
[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fdocs%2FGettingStarted&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() diff --git a/docs/Performance.md b/docs/Performance.md index b08549f1..fbc6d576 100644 --- a/docs/Performance.md +++ b/docs/Performance.md @@ -1,6 +1,6 @@ # Performance -netdata performance is affected by: +Netdata performance is affected by: **Data collection** - the number of charts for which data are collected @@ -19,11 +19,11 @@ You can control all the above. ## Netdata Daemon -For most server systems, with a few hundred charts and a few thousand dimensions, the netdata daemon, without any web clients accessing it, should not use more than 1% of a single core. +For most server systems, with a few hundred charts and a few thousand dimensions, the Netdata daemon, without any web clients accessing it, should not use more than 1% of a single core. -To prove netdata scalability, check issue [#1323](https://github.com/netdata/netdata/issues/1323#issuecomment-265501668) where netdata collects 95.000 metrics per second, with 12% CPU utilization of a single core! +To prove Netdata scalability, check issue [#1323](https://github.com/netdata/netdata/issues/1323#issuecomment-265501668) where Netdata collects 95.000 metrics per second, with 12% CPU utilization of a single core! -In embedded systems, if the netdata daemon is using a lot of CPU without any web clients accessing it, you should lower the data collection frequency. To set the data collection frequency, edit `/etc/netdata/netdata.conf` and set `update_every` to a higher number (this is the frequency in seconds data are collected for all charts: higher number of seconds = lower frequency, the default is 1 for per second data collection). You can also set this frequency per module or chart. 
Check the [daemon configuration](../daemon/config) for plugins and charts. For specific modules, the configuration needs to be changed in: +In embedded systems, if the Netdata daemon is using a lot of CPU without any web clients accessing it, you should lower the data collection frequency. To set the data collection frequency, edit `/etc/netdata/netdata.conf` and set `update_every` to a higher number (this is the frequency in seconds data are collected for all charts: higher number of seconds = lower frequency, the default is 1 for per second data collection). You can also set this frequency per module or chart. Check the [daemon configuration](../daemon/config) for plugins and charts. For specific modules, the configuration needs to be changed in: - `python.d.conf` for [python](../collectors/python.d.plugin/#pythondplugin) - `node.d.conf` for [nodejs](../collectors/node.d.plugin/#nodedplugin) - `charts.d.conf` for [bash](../collectors/charts.d.plugin/#chartsdplugin) @@ -34,24 +34,24 @@ If a plugin is using a lot of CPU, you should lower its update frequency, or if ## CPU consumption when web clients are accessing dashboards -Netdata is very efficient when servicing web clients. On most server platforms, netdata should be able to serve **1800 web client requests per second per core** for auto-refreshing charts. +Netdata is very efficient when servicing web clients. On most server platforms, Netdata should be able to serve **1800 web client requests per second per core** for auto-refreshing charts. Normally, each user connected will request less than 10 chart refreshes per second (the page may have hundreds of charts, but only the visible are refreshed). So you can expect 180 users per CPU core accessing dashboards before having any delays. Netdata runs with the lowest possible process priority, so even if 1000 users are accessing dashboards, it should not influence your applications. 
CPU utilization will reach 100%, but your applications should get all the CPU they need. -To lower the CPU utilization of netdata when clients are accessing the dashboard, set `web compression level = 1`, or disable web compression completely by setting `enable web responses gzip compression = no`. Both settings are in the `[web]` section. +To lower the CPU utilization of Netdata when clients are accessing the dashboard, set `web compression level = 1`, or disable web compression completely by setting `enable web responses gzip compression = no`. Both settings are in the `[web]` section. ## Monitoring a heavy loaded system -Netdata, while running, does not depend on disk I/O (apart its log files and `access.log` is written with buffering enabled and can be disabled). Some plugins that need disk may stop and show gaps during heavy system load, but the netdata daemon itself should be able to work and collect values from `/proc` and `/sys` and serve web clients accessing it. +Netdata, while running, does not depend on disk I/O (apart its log files and `access.log` is written with buffering enabled and can be disabled). Some plugins that need disk may stop and show gaps during heavy system load, but the Netdata daemon itself should be able to work and collect values from `/proc` and `/sys` and serve web clients accessing it. -Keep in mind that netdata saves its database when it exits and loads it back when restarted. While it is running though, its DB is only stored in RAM and no I/O takes place for it. +Keep in mind that Netdata saves its database when it exits and loads it back when restarted. While it is running though, its DB is only stored in RAM and no I/O takes place for it. ## Netdata process priority -By default, netdata runs with the `idle` process scheduler, which assigns CPU resources to netdata, only when the system has such resources to spare. 
+By default, Netdata runs with the `idle` process scheduler, which assigns CPU resources to Netdata, only when the system has such resources to spare. The following `netdata.conf` settings control this: @@ -62,15 +62,15 @@ The following `netdata.conf` settings control this: process nice level = 19 ``` -The policies supported by netdata are `idle` (the netdata default), `other` (also as `nice`), `batch`, `rr`, `fifo`. netdata also recognizes `keep` and `none` to keep the current settings without changing them. +The policies supported by Netdata are `idle` (the Netdata default), `other` (also as `nice`), `batch`, `rr`, `fifo`. Netdata also recognizes `keep` and `none` to keep the current settings without changing them. -For `other`, `nice` and `batch`, the setting `process nice level = 19` is activated to configure the nice level of netdata. Nice gets values -20 (highest) to 19 (lowest). +For `other`, `nice` and `batch`, the setting `process nice level = 19` is activated to configure the nice level of Netdata. Nice gets values -20 (highest) to 19 (lowest). For `rr` and `fifo`, the setting `process scheduling priority = 0` is activated to configure the priority of the relative scheduling policy. Priority gets values 1 (lowest) to 99 (highest). For the details of each scheduler, see `man sched_setscheduler` and `man sched`. -When netdata is running under systemd, it can only lower its priority (the default is `other` with `nice level = 0`). If you want to make netdata to get more CPU than that, you will need to set in `netdata.conf`: +When Netdata is running under systemd, it can only lower its priority (the default is `other` with `nice level = 0`). 
If you want to make Netdata to get more CPU than that, you will need to set in `netdata.conf`: ``` [global] @@ -85,17 +85,17 @@ CPUSchedulingPriority=99 Nice=-10 ``` -## Running netdata in embedded devices +## Running Netdata in embedded devices Embedded devices usually have very limited CPU resources available, and in most cases, just a single core. -> keep in mind that netdata on RPi 2 and 3 does not require any tuning. The default settings will be good. The following tunables apply only when running netdata on RPi 1 or other very weak IoT devices. +> keep in mind that Netdata on RPi 2 and 3 does not require any tuning. The default settings will be good. The following tunables apply only when running Netdata on RPi 1 or other very weak IoT devices. We suggest to do the following: ### 1. Disable External plugins -External plugins can consume more system resources than the netdata server. Disable the ones you don't need. If you need them, increase their `update every` value (again in `/etc/netdata/netdata.conf`), so that they do not run that frequently. +External plugins can consume more system resources than the Netdata server. Disable the ones you don't need. If you need them, increase their `update every` value (again in `/etc/netdata/netdata.conf`), so that they do not run that frequently. Edit `/etc/netdata/netdata.conf`, find the `[plugins]` section: @@ -126,8 +126,8 @@ plugin|description `idlejitter`|internal plugin (written in C) that attempts show if the systems starved for CPU. Disabling it will eliminate a thread. `cgroups`|monitoring linux containers. Most probably you are not going to need it. This will also eliminate another thread. `checks`|a debugging plugin, which is disabled by default. -`apps`|a plugin that monitors system processes. It is very complex and heavy (consumes twice the CPU resources of the netdata daemon), so if you don't need to monitor the process tree, you can disable it. -`charts.d`|BASH plugins (squid, nginx, mysql, etc). 
This is a heavy plugin, that consumes twice the CPU resources of the netdata daemon. +`apps`|a plugin that monitors system processes. It is very complex and heavy (consumes twice the CPU resources of the Netdata daemon), so if you don't need to monitor the process tree, you can disable it. +`charts.d`|BASH plugins (squid, nginx, mysql, etc). This is a heavy plugin, that consumes twice the CPU resources of the Netdata daemon. `node.d`|node.js plugin, currently used for SNMP data collection and monitoring named (the name server). `python.d`|has many modules and can use over 20MB of memory. @@ -137,7 +137,7 @@ For most IoT devices, you can disable all plugins except `proc`. For `proc` ther ### 2. Disable internal plugins -In this section you can select which modules of the `proc` plugin you need. All these are run in a single thread, one after another. Still, each one needs some RAM and consumes some CPU cycles. With all the modules enabled, the `proc` plugin adds ~9 MiB on top of the 5 MiB required by the netdata daemon. +In this section you can select which modules of the `proc` plugin you need. All these are run in a single thread, one after another. Still, each one needs some RAM and consumes some CPU cycles. With all the modules enabled, the `proc` plugin adds ~9 MiB on top of the 5 MiB required by the Netdata daemon. ``` [plugin:proc] @@ -150,7 +150,7 @@ Refer to the [proc.plugins documentation](../collectors/proc.plugin/) for the li ### 3. Lower internal plugin update frequency -If netdata is still using a lot of CPU, lower its update frequency. Going from per second updates, to once every 2 seconds updates, will cut the CPU resources of all netdata programs **in half**, and you will still have very frequent updates. +If Netdata is still using a lot of CPU, lower its update frequency. Going from per second updates, to once every 2 seconds updates, will cut the CPU resources of all Netdata programs **in half**, and you will still have very frequent updates. 
If the CPU of the embedded device is too weak, try setting even lower update frequency. Experiment with `update every = 5` or `update every = 10` (higher number = lower frequency) in `netdata.conf`, until you get acceptable results. @@ -172,18 +172,14 @@ Normally, you will not need them. To disable them, set: ``` ### 5. Set memory mode to RAM -Setting the memory mode to `ram` will disable loading and saving the round robin database. This will not affect anything while running netdata, but it might be required if you have very limited storage available. +Setting the memory mode to `ram` will disable loading and saving the round robin database. This will not affect anything while running Netdata, but it might be required if you have very limited storage available. ``` [global] memory mode = ram ``` -### 6. Use the single threaded web server - -Normally, netdata spawns a thread for each web client. This allows netdata to utilize all the available cores for servicing chart refreshes. You can however disable this feature and serve all charts one after another, using a single thread / core. This will might lower the CPU pressure on the embedded device. To enable the single threaded web server, edit `/etc/netdata/netdata.conf` and set `mode = single-threaded` in the `[web]` section. - -### 7. Lower memory requirements +### 6. Lower memory requirements You can set the default size of the round robin database for all charts, using: @@ -197,9 +193,9 @@ The units for history is `[global].update every` seconds. So if `[global].update Check also [Database](../database) for directions on calculating the size of the round robin database. -### 8. Disable gzip compression of responses +### 7. Disable gzip compression of responses -Gzip compression of the web responses is using more CPU that the rest of netdata. You can lower the compression level or disable gzip compression completely. 
You can disable it, like this: +Gzip compression of the web responses is using more CPU that the rest of Netdata. You can lower the compression level or disable gzip compression completely. You can disable it, like this: ``` [web] @@ -214,7 +210,7 @@ To lower the compression level, do this: gzip compression level = 1 ``` -Finally, if no web server is installed on your device, you can use port tcp/80 for netdata: +Finally, if no web server is installed on your device, you can use port tcp/80 for Netdata: ``` [web] diff --git a/docs/Running-behind-apache.md b/docs/Running-behind-apache.md index f0ce70c7..a71897f4 100644 --- a/docs/Running-behind-apache.md +++ b/docs/Running-behind-apache.md @@ -2,10 +2,10 @@ Below you can find instructions for configuring an apache server to: -1. proxy a single netdata via an HTTP and HTTPS virtual host -2. dynamically proxy any number of netdata +1. proxy a single Netdata via an HTTP and HTTPS virtual host +2. dynamically proxy any number of Netdata 3. add user authentication -4. adjust netdata settings to get optimal results +4. adjust Netdata settings to get optimal results ## Requirements @@ -33,13 +33,13 @@ sudo a2enmod rewrite --- -## netdata on an existing virtual host +## Netdata on an existing virtual host -On any **existing** and already **working** apache virtual host, you can redirect requests for URL `/netdata/` to one or more netdata servers. +On any **existing** and already **working** apache virtual host, you can redirect requests for URL `/netdata/` to one or more Netdata servers. -### proxy one netdata, running on the same server apache runs +### proxy one Netdata, running on the same server apache runs -Add the following on top of any existing virtual host. It will allow you to access netdata as `http://virtual.host/netdata/`. +Add the following on top of any existing virtual host. It will allow you to access Netdata as `http://virtual.host/netdata/`. 
``` <VirtualHost *:80> @@ -52,7 +52,7 @@ Add the following on top of any existing virtual host. It will allow you to acce Require all granted </Proxy> - # Local netdata server accessed with '/netdata/', at localhost:19999 + # Local Netdata server accessed with '/netdata/', at localhost:19999 ProxyPass "/netdata/" "http://localhost:19999/" connectiontimeout=5 timeout=30 keepalive=on ProxyPassReverse "/netdata/" "http://localhost:19999/" @@ -67,9 +67,9 @@ Add the following on top of any existing virtual host. It will allow you to acce </VirtualHost> ``` -### proxy multiple netdata running on multiple servers +### proxy multiple Netdata running on multiple servers -Add the following on top of any existing virtual host. It will allow you to access multiple netdata as `http://virtual.host/netdata/HOSTNAME/`, where `HOSTNAME` is the hostname of any other netdata server you have (to access the `localhost` netdata, use `http://virtual.host/netdata/localhost/`). +Add the following on top of any existing virtual host. It will allow you to access multiple Netdata as `http://virtual.host/netdata/HOSTNAME/`, where `HOSTNAME` is the hostname of any other Netdata server you have (to access the `localhost` Netdata, use `http://virtual.host/netdata/localhost/`). ``` <VirtualHost *:80> @@ -105,9 +105,9 @@ If you want to control the servers your users can connect to, replace the `Proxy ProxyPassMatch "^/netdata/(server1|server2|server3|server4)/(.*)" "http://$1:19999/$2" connectiontimeout=5 timeout=30 keepalive=on ``` -## netdata on a dedicated virtual host +## Netdata on a dedicated virtual host -You can proxy netdata through apache, using a dedicated apache virtual host. +You can proxy Netdata through apache, using a dedicated apache virtual host. Create a new apache site: @@ -158,7 +158,7 @@ Repeat the operation for as many servers as you need. 
## Enable Basic Auth -If you wish to add an authentication (user/password) to access your netdata, do these: +If you wish to add an authentication (user/password) to access your Netdata, do these: Install the package `apache2-utils`. On debian / ubuntu run `sudo apt-get install apache2-utils`. @@ -184,28 +184,28 @@ Modify the virtual host with these: </Location> ``` -Specify `Location /` if netdata is running on dedicated virtual host. +Specify `Location /` if Netdata is running on dedicated virtual host. Note: Changes are applied by reloading or restarting Apache. # Netdata configuration -You might edit `/etc/netdata/netdata.conf` to optimize your setup a bit. For applying these changes you need to restart netdata. +You might edit `/etc/netdata/netdata.conf` to optimize your setup a bit. For applying these changes you need to restart Netdata. ## Response compression -If you plan to use netdata exclusively via apache, you can gain some performance by preventing double compression of its output (netdata compresses its response, apache re-compresses it) by editing `/etc/netdata/netdata.conf` and setting: +If you plan to use Netdata exclusively via apache, you can gain some performance by preventing double compression of its output (Netdata compresses its response, apache re-compresses it) by editing `/etc/netdata/netdata.conf` and setting: ``` [web] enable gzip compression = no ``` -Once you disable compression at netdata (and restart it), please verify you receive compressed responses from apache (it is important to receive compressed responses - the charts will be more snappy). +Once you disable compression at Netdata (and restart it), please verify you receive compressed responses from apache (it is important to receive compressed responses - the charts will be more snappy). -## Limit direct access to netdata +## Limit direct access to Netdata -You would also need to instruct netdata to listen only on `localhost`, `127.0.0.1` or `::1`. 
+You would also need to instruct Netdata to listen only on `localhost`, `127.0.0.1` or `::1`. ``` [web] @@ -224,13 +224,13 @@ or --- -You can also use a unix domain socket. This will also provide a faster route between apache and netdata: +You can also use a unix domain socket. This will also provide a faster route between apache and Netdata: ``` [web] bind to = unix:/tmp/netdata.sock ``` -_note: netdata v1.8+ support unix domain sockets_ +_note: Netdata v1.8+ supports unix domain sockets_ At the apache side, prepend the 2nd argument to `ProxyPass` with `unix:/tmp/netdata.sock|`, like this: @@ -247,13 +247,13 @@ If your apache server is not on localhost, you can set: bind to = * allow connections from = IP_OF_APACHE_SERVER ``` -_note: netdata v1.9+ support `allow connections from`_ +_note: Netdata v1.9+ supports `allow connections from`_ -`allow connections from` accepts [netdata simple patterns](../libnetdata/simple_pattern/) to match against the connection IP address. +`allow connections from` accepts [Netdata simple patterns](../libnetdata/simple_pattern/) to match against the connection IP address. ## prevent the double access.log -apache logs accesses and netdata logs them too. You can prevent netdata from generating its access log, by setting this in `/etc/netdata/netdata.conf`: +apache logs accesses and Netdata logs them too. You can prevent Netdata from generating its access log, by setting this in `/etc/netdata/netdata.conf`: ``` [global] @@ -262,9 +262,9 @@ apache logs accesses and netdata logs them too. You can prevent netdata from gen ## Troubleshooting mod_proxy -Make sure the requests reach netdata, by examing `/var/log/netdata/access.log`. +Make sure the requests reach Netdata, by examining `/var/log/netdata/access.log`. -1. if the requests do not reach netdata, your apache does not forward them. -2. if the requests reach netdata by the URLs are wrong, you have not re-written them properly. +1. 
if the requests do not reach Netdata, your apache does not forward them. +2. if the requests reach Netdata but the URLs are wrong, you have not re-written them properly. [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fdocs%2FRunning-behind-apache&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() diff --git a/docs/Running-behind-caddy.md b/docs/Running-behind-caddy.md index 1b25b0a2..4e530e94 100644 --- a/docs/Running-behind-caddy.md +++ b/docs/Running-behind-caddy.md @@ -1,6 +1,6 @@ # Netdata via Caddy -To run netdata via [Caddy's proxying,](https://caddyserver.com/docs/proxy) set your Caddyfile up like this: +To run Netdata via [Caddy's proxying,](https://caddyserver.com/docs/proxy) set your Caddyfile up like this: ``` netdata.domain.tld { @@ -10,7 +10,7 @@ netdata.domain.tld { Other directives can be added between the curly brackets as needed. -To run netdata in a subfolder: +To run Netdata in a subfolder: ``` netdata.domain.tld { @@ -20,10 +20,10 @@ netdata.domain.tld { } ``` -## limit direct access to netdata +## limit direct access to Netdata -You would also need to instruct netdata to listen only to `127.0.0.1` or `::1`. +You would also need to instruct Netdata to listen only to `127.0.0.1` or `::1`. -To limit access to netdata only from localhost, set `bind socket to IP = 127.0.0.1` or `bind socket to IP = ::1` in `/etc/netdata/netdata.conf`.
[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fdocs%2FRunning-behind-caddy&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() diff --git a/docs/Running-behind-lighttpd.md b/docs/Running-behind-lighttpd.md index 5c74439a..8e43a038 100644 --- a/docs/Running-behind-lighttpd.md +++ b/docs/Running-behind-lighttpd.md @@ -1,6 +1,6 @@ # Netdata via lighttpd v1.4.x -Here is a config for accessing netdata in a suburl via lighttpd 1.4.46 and newer: +Here is a config for accessing Netdata in a suburl via lighttpd 1.4.46 and newer: ```txt $HTTP["url"] =~ "^/netdata/" { @@ -24,7 +24,7 @@ $SERVER["socket"] == ":19998" { --- -If the only thing the server is exposing via the web is netdata (and thus no suburl rewriting required), +If the only thing the server is exposing via the web is Netdata (and thus no suburl rewriting required), then you can get away with just ``` proxy.server = ( "" => ( ( "host" => "127.0.0.1", "port" => 19999 ))) @@ -45,7 +45,7 @@ other auth methods, and more info on htdigest, can be found in lighttpd's [mod_a --- It seems that lighttpd (or some versions of it), fail to proxy compressed web responses. -To solve this issue, disable web response compression in netdata. +To solve this issue, disable web response compression in Netdata. Open /etc/netdata/netdata.conf and set in [global]: @@ -53,10 +53,10 @@ Open /etc/netdata/netdata.conf and set in [global]: enable web responses gzip compression = no ``` -## limit direct access to netdata +## limit direct access to Netdata -You would also need to instruct netdata to listen only to `127.0.0.1` or `::1`. +You would also need to instruct Netdata to listen only to `127.0.0.1` or `::1`. -To limit access to netdata only from localhost, set `bind socket to IP = 127.0.0.1` or `bind socket to IP = ::1` in `/etc/netdata/netdata.conf`. 
+To limit access to Netdata only from localhost, set `bind socket to IP = 127.0.0.1` or `bind socket to IP = ::1` in `/etc/netdata/netdata.conf`. [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fdocs%2FRunning-behind-lighttpd&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() diff --git a/docs/Running-behind-nginx.md b/docs/Running-behind-nginx.md index 3918af24..b38d27fa 100644 --- a/docs/Running-behind-nginx.md +++ b/docs/Running-behind-nginx.md @@ -1,12 +1,12 @@ # Netdata via nginx -To pass netdata via a nginx, use this: +To pass Netdata via a nginx, use this: ### As a virtual host ``` upstream backend { - # the netdata server + # the Netdata server server 127.0.0.1:19999; keepalive 64; } @@ -69,7 +69,7 @@ server { } ``` -### As a subfolder for multiple netdata servers, via one nginx +### As a subfolder for multiple Netdata servers, via one nginx ``` upstream backend-server1 { @@ -112,11 +112,24 @@ server { Of course you can add as many backend servers as you like. -Using the above, you access netdata on the backend servers, like this: +Using the above, you access Netdata on the backend servers, like this: - `http://nginx.server/netdata/server1/` to reach `backend-server1` - `http://nginx.server/netdata/server2/` to reach `backend-server2` +### Using TLS communication + +In case the Netdata web server has been [configured to use TLS](../web/server/#enabling-tls-support), +you must also encrypt the communication between Nginx and Netdata. 
+ +To enable encryption, first [enable SSL on nginx](http://nginx.org/en/docs/http/configuring_https_servers.html) and then put the following in the location section of the Nginx configuration: + +``` +proxy_set_header X-Forwarded-Proto https; +proxy_pass https://localhost:19999; +``` + +If nginx is not configured as described here, you will probably receive the error `SSL_ERROR_RX_RECORD_TOO_LONG`. ### Enable authentication @@ -139,9 +152,9 @@ server { } ``` -## limit direct access to netdata +## limit direct access to Netdata -If your nginx is on `localhost`, you can use this to protect your netdata: +If your nginx is on `localhost`, you can use this to protect your Netdata: ``` [web] @@ -150,13 +163,13 @@ If your nginx is on `localhost`, you can use this to protect your netdata: --- -You can also use a unix domain socket. This will also provide a faster route between nginx and netdata: +You can also use a unix domain socket. This will also provide a faster route between nginx and Netdata: ``` [web] bind to = unix:/tmp/netdata.sock ``` -_note: netdata v1.8+ support unix domain sockets_ +_note: Netdata v1.8+ support unix domain sockets_ At the nginx side, use something like this to use the same unix domain socket: @@ -177,13 +190,13 @@ If your nginx server is not on localhost, you can set: allow connections from = IP_OF_NGINX_SERVER ``` -_note: netdata v1.9+ support `allow connections from`_ +_note: Netdata v1.9+ support `allow connections from`_ -`allow connections from` accepts [netdata simple patterns](../libnetdata/simple_pattern/) to match against the connection IP address. +`allow connections from` accepts [Netdata simple patterns](../libnetdata/simple_pattern/) to match against the connection IP address. ## prevent the double access.log -nginx logs accesses and netdata logs them too. You can prevent netdata from generating its access log, by setting this in `/etc/netdata/netdata.conf`: +nginx logs accesses and Netdata logs them too. 
You can prevent Netdata from generating its access log, by setting this in `/etc/netdata/netdata.conf`: ``` [global] @@ -201,4 +214,5 @@ If you get an 502 Bad Gateway error you might check your nginx error log: If you see something like the above, chances are high that SELinux prevents nginx from connecting to the backend server. To fix that, just use this policy: `setsebool -P httpd_can_network_connect true`. + [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fdocs%2FRunning-behind-nginx&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() diff --git a/docs/Third-Party-Plugins.md b/docs/Third-Party-Plugins.md index 38fa90e4..8d227203 100644 --- a/docs/Third-Party-Plugins.md +++ b/docs/Third-Party-Plugins.md @@ -4,7 +4,7 @@ The following is a list of Netdata plugins distributed by third parties: ## Nvidia GPUs -[netdata nv plugin](https://github.com/coraxx/netdata_nv_plugin) monitors nvidia GPUs. +[Netdata nv plugin](https://github.com/coraxx/netdata_nv_plugin) monitors nvidia GPUs. ![image](https://user-images.githubusercontent.com/2662304/29516895-351e905e-867b-11e7-9863-3fb6924490ab.png) diff --git a/docs/a-github-star-is-important.md b/docs/a-github-star-is-important.md index e46d5644..cac01f3e 100644 --- a/docs/a-github-star-is-important.md +++ b/docs/a-github-star-is-important.md @@ -1,12 +1,12 @@ # A GitHub star is important -**GitHub stars** allow netdata to expand its reach, its community, especially attract people with skills willing to contribute to it. +**GitHub stars** allow Netdata to expand its reach, its community, especially attract people with skills willing to contribute to it. -Compared to its first release, netdata is now **twice as fast**, has all its bugs settled and a lot more functionality. 
This happened because a lot of people find it useful, use it daily at home and work, **rely on it** and **contribute to it**. +Compared to its first release, Netdata is now **twice as fast**, has all its bugs settled and a lot more functionality. This happened because a lot of people find it useful, use it daily at home and work, **rely on it** and **contribute to it**. **GitHub stars** also **motivate** us. They state that you find our work **useful**. They give us strength to continue, to work **harder** to make it even **better**. -So, give netdata a **GitHub star**, at the top right of this page. +So, give Netdata a **GitHub star**, at the top right of this page. Thank you! diff --git a/docs/anonymous-statistics.md b/docs/anonymous-statistics.md index 1e426e2c..376a2c4a 100644 --- a/docs/anonymous-statistics.md +++ b/docs/anonymous-statistics.md @@ -1,11 +1,11 @@ # Anonymous Statistics -From Netdata v1.12 and above, anonymous usage information is collected by default and send to Google Analytics. +From Netdata v1.12 and above, anonymous usage information is collected by default and sent to Google Analytics. The statistics calculated from this information will be used for: -1. **Quality assurance**, to help us understand if netdata behaves as expected and help us identify repeating issues for certain distributions or environment. +1. **Quality assurance**, to help us understand if Netdata behaves as expected and help us identify repeating issues for certain distributions or environments. -2. **Usage statistics**, to help us focus on the parts of netdata that are used the most, or help us identify the extend our development decisions influence the community. +2. **Usage statistics**, to help us focus on the parts of Netdata that are used the most, or help us identify the extent to which our development decisions influence the community. Information is sent to Netdata via two different channels: - Google Tag Manager is used when an agent's dashboard is accessed.
@@ -42,7 +42,7 @@ The only thing that's impossible for us to prevent from being **sent** is the UR ## Anonymous Statistics Script -Every time the daemon is started or stopped and every time a fatal condition is encountered, netdata uses the anonymous statistics script to collect system information and send it to GA via an http call. The information collected for all events is: +Every time the daemon is started or stopped and every time a fatal condition is encountered, Netdata uses the anonymous statistics script to collect system information and send it to GA via an http call. The information collected for all events is: - Netdata version - OS name, version, id, id_like - Kernel name, version, architecture @@ -56,7 +56,8 @@ To see exactly what and how is collected, you can review the script template `da ## Opt-Out To opt-out from sending anonymous statistics, you can create a file called `.opt-out-from-anonymous-statistics` under the user configuration directory (usually `/etc/netdata`). The effect of creating the file is the following: -- The daemon will never execute the anonymous statistics script -- The anonymous statistics script will exit immediately if called via any other way (e.g. shell) -- The Google Tag Manager Javascript snippet will remain in the page, but the linked tag will not be fired. The effect is that no data will ever be sent to GA. + - The daemon will never execute the anonymous statistics script + - The anonymous statistics script will exit immediately if called via any other way (e.g. shell) + - The Google Tag Manager Javascript snippet will remain in the page, but the linked tag will not be fired. The effect is that no data will ever be sent to GA. +You can also disable telemetry by passing the option `--disable-telemetry` to any of the installers. 
diff --git a/docs/configuration-guide.md b/docs/configuration-guide.md index 811a33ed..2a9539dc 100644 --- a/docs/configuration-guide.md +++ b/docs/configuration-guide.md @@ -1,6 +1,6 @@ # Configuration guide -No configuration is required to run netdata, but you will find plenty of options to tweak, so that you can adapt it to your particular needs. +No configuration is required to run Netdata, but you will find plenty of options to tweak, so that you can adapt it to your particular needs. <details markdown="1"><summary>Configuration files are placed in `/etc/netdata`.</summary> Depending on your installation method, Netdata will have been installed either directly under `/`, or under `/opt/netdata`. The paths mentioned here and in the documentation in general assume that your installation is under `/`. If it is not, you will find the exact same paths under `/opt/netdata` as well. (i.e. `/etc/netdata` will be `/opt/netdata/etc/netdata`).</details> @@ -20,7 +20,7 @@ Under that directory you will see the following: - `stats.d` is a directory under which you can add .conf files to add [synthetic charts](../collectors/statsd.plugin/#synthetic-statsd-charts). - Individual collector plugin config files, such as `fping.conf` for the [fping plugin](../collectors/fping.plugin/) and `apps_groups.conf` for the [apps plugin](../collectors/apps.plugin/) -So there are many configuration files to control every aspect of Netdata's behavior. It can be overwhelming at first, but you won't have to deal with any of them, unless you have specific things you need to change. The following HOWTO will guide you on how to customize your netdata, based on what you want to do. +So there are many configuration files to control every aspect of Netdata's behavior. It can be overwhelming at first, but you won't have to deal with any of them, unless you have specific things you need to change. The following HOWTO will guide you on how to customize your Netdata, based on what you want to do. 
## How to @@ -57,6 +57,10 @@ Entire plugins can be turned off from the [netdata.conf [plugins]](../daemon/con - `node.d.conf` for [nodejs](../collectors/node.d.plugin/#nodedplugin) - `charts.d.conf` for [bash](../collectors/charts.d.plugin/#chartsdplugin) +##### Show charts with zero metrics + +By default, Netdata will enable monitoring metrics for disks, memory, and network only when they are not zero. If they are constantly zero they are ignored. Metrics that will start having values, after netdata is started, will be detected and charts will be automatically added to the dashboard (a refresh of the dashboard is needed for them to appear though). Use `yes` instead of `auto` in plugin configuration sections to enable these charts permanently. + ### Modify alarms and notifications ##### Add a new alarm @@ -69,9 +73,9 @@ Just set `enabled = no` in the [netdata.conf [health]](../daemon/config/#health- ##### Modify or disable a specific alarm -The `health.d` directory that contains the alarm triggers for [health monitoring](../health/#health-monitoring). It has one .conf file per collector. You can easily find the .conf file you will need to modify, by looking for the "source" line on the table that appears on the right side of an alarm on the netdata gui. +The `health.d` directory that contains the alarm triggers for [health monitoring](../health/#health-monitoring). It has one .conf file per collector. You can easily find the .conf file you will need to modify, by looking for the "source" line on the table that appears on the right side of an alarm on the Netdata gui. -For example, if you click on Alarms and go to the tab 'All', the default netdata installation will show you at the top the configured alarm for `10 min cpu usage` (it's the name of the badge). Looking at the table on the right side, you will see a row that says: `source 4@/usr/lib/netdata/conf.d/health.d/cpu.conf`. 
This way, you know that you will need to run `/etc/netdata/edit-config health.d/cpu.conf` and look for alarm at line 4 of the conf file. +For example, if you click on Alarms and go to the tab 'All', the default Netdata installation will show you at the top the configured alarm for `10 min cpu usage` (it's the name of the badge). Looking at the table on the right side, you will see a row that says: `source 4@/usr/lib/netdata/conf.d/health.d/cpu.conf`. This way, you know that you will need to run `/etc/netdata/edit-config health.d/cpu.conf` and look for alarm at line 4 of the conf file. As stated at the top of the .conf file, **you can disable an alarm notification by setting the 'to' line to: silent**. To modify how the alarm gets triggered, we suggest that you go through the guide on [health monitoring](../health/#health-monitoring). @@ -82,7 +86,7 @@ You only need to configure `health_alarm_notify.conf`. To learn how to do it, re ### Make security-related customizations -##### Change the netdata web server access lists +##### Change the Netdata web server access lists You have several options under the [netdata.conf [web]](../web/server/#access-lists) section. @@ -90,38 +94,38 @@ You have several options under the [netdata.conf [web]](../web/server/#access-li You will need to configure the [registry] section in netdata.conf. First read the [registry documentation](../registry/). In it, are instructions on how to [run your own registry](../registry/#run-your-own-registry). -##### Change the IP address/port netdata listens to +##### Change the IP address/port Netdata listens to The settings are under netdata.conf [web]. Look at the [web server documentation](../web/server/#binding-netdata-to-multiple-ports) for more info. 
### System resource usage -##### Reduce the resources netdata uses +##### Reduce the resources Netdata uses -The page on [netdata performance](Performance.md) has an excellent guide on how to reduce the netdata cpu/disk/RAM utilization to levels suitable even for the weakest [IoT devices](netdata-for-IoT.md). +The page on [Netdata performance](Performance.md) has an excellent guide on how to reduce the Netdata cpu/disk/RAM utilization to levels suitable even for the weakest [IoT devices](netdata-for-IoT.md). -##### Change when netdata saves metrics to disk +##### Change when Netdata saves metrics to disk [netdata.conf [global]](../daemon/config/#global-section-options) : `memory mode`</details> -##### Prevent netdata from getting immediately killed when my server runs out of memory +##### Prevent Netdata from getting immediately killed when my server runs out of memory -You can change the netdata [OOM score](../daemon/#oom-score) in netdata.conf [global]. +You can change the Netdata [OOM score](../daemon/#oom-score) in netdata.conf [global]. ### Other -##### Move netdata directories +##### Move Netdata directories The various directory paths are in [netdata.conf [global]](../daemon/config/#global-section-options). -## How netdata configuration works +## How Netdata configuration works The configuration files are `name = value` dictionaries with `[sections]`. Write whatever you like there as long as it follows this simple format. Netdata loads this dictionary and then when the code needs a value from it, it just looks up the `name` in the dictionary at the proper `section`. In all places, in the code, there are both the `names` and their `default values`, so if something is not found in the configuration file, the default is used. The lookup is made using B-Trees and hashes (no string comparisons), so they are super fast. 
Also the `names` of the settings can be `my super duper setting that once set to yes, will turn the world upside down = no` - so goodbye to most of the documentation involved. -Next, netdata can generate a valid configuration for the user to edit. No need to remember anything. Just get the configuration from the server (`/netdata.conf` on your netdata server), edit it and save it. +Next, Netdata can generate a valid configuration for the user to edit. No need to remember anything. Just get the configuration from the server (`/netdata.conf` on your Netdata server), edit it and save it. Last, what about options you believe you have set, but you misspelled?When you get the configuration file from the server, there will be a comment above all `name = value` pairs the server does not use. So you know that whatever you wrote there, is not used. @@ -129,6 +133,6 @@ Last, what about options you believe you have set, but you misspelled?When you g Unix prefers regular expressions. But they are just too hard, too cryptic to use, write and understand. -So, netdata supports [simple patterns](../libnetdata/simple_pattern/). +So, Netdata supports [simple patterns](../libnetdata/simple_pattern/). [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fdocs%2Fconfiguration-guide&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() diff --git a/docs/generator/buildhtml.sh b/docs/generator/buildhtml.sh index 04311224..e1c108fb 100755 --- a/docs/generator/buildhtml.sh +++ b/docs/generator/buildhtml.sh @@ -19,18 +19,18 @@ GO_D_DIR="collectors/go.d.plugin" rm -rf ${GO_D_DIR} git clone https://github.com/netdata/go.d.plugin.git ${GO_D_DIR} -# Copy all netdata .md files to docs/generator/src. Exclude htmldoc itself and also the directory node_modules generatord by Netlify +# Copy all Netdata .md files to docs/generator/src. 
Exclude htmldoc itself and also the directory node_modules generatord by Netlify echo "Copying files" rm -rf ${SRC_DIR} find . -type d \( -path ./${GENERATOR_DIR} -o -path ./node_modules \) -prune -o -name "*.md" -print | cpio -pd ${SRC_DIR} -# Copy netdata html resources +# Copy Netdata html resources cp -a ./${GENERATOR_DIR}/custom ./${SRC_DIR}/ # Modify the first line of the main README.md, to enable proper static html generation echo "Modifying README header" -sed -i -e '0,/# netdata /s//# Introduction\n\n/' ${SRC_DIR}/README.md +sed -i -e '0,/# Netdata /s//# Introduction\n\n/' ${SRC_DIR}/README.md # Remove all GA tracking code find ${SRC_DIR} -name "*.md" -print0 | xargs -0 sed -i -e 's/\[!\[analytics.*UA-64295674-3)\]()//g' @@ -81,7 +81,7 @@ prep_html() { # Build html docs mkdocs build --config-file="${MKDOCS_CONFIG_FILE}" - # Fix edit buttons for the markdowns that are not on the main netdata repo + # Fix edit buttons for the markdowns that are not on the main Netdata repo find "${GENERATOR_DIR}/${SITE_DIR}/${GO_D_DIR}" -name "*.html" -print0 | xargs -0 sed -i -e 's/https:\/\/github.com\/netdata\/netdata\/blob\/master\/collectors\/go.d.plugin/https:\/\/github.com\/netdata\/go.d.plugin\/blob\/master/g' if [ "${lang}" != "en" ] ; then find "${GENERATOR_DIR}/${SITE_DIR}" -name "*.html" -print0 | xargs -0 sed -i -e 's/https:\/\/github.com\/netdata\/netdata\/blob\/master\/\S*md/https:\/\/github.com\/netdata\/localization\//g' diff --git a/docs/generator/buildyaml.sh b/docs/generator/buildyaml.sh index e367ab50..e4a5466a 100755 --- a/docs/generator/buildyaml.sh +++ b/docs/generator/buildyaml.sh @@ -127,6 +127,7 @@ echo -ne " - 'docs/Demo-Sites.md' - REDISTRIBUTED.md - CHANGELOG.md - CONTRIBUTING.md + - SECURITY.md - Why Netdata: - 'docs/why-netdata/README.md' - 'docs/why-netdata/1s-granularity.md' @@ -139,7 +140,7 @@ echo -ne " - 'docs/Demo-Sites.md' - 'packaging/installer/UPDATE.md' - 'packaging/installer/UNINSTALL.md' - 'docs/GettingStarted.md' -- Running 
netdata: +- Running Netdata: - 'daemon/README.md' - 'docs/configuration-guide.md' - 'daemon/config/README.md' @@ -228,6 +229,7 @@ navpart 3 collectors/ioping.plugin navpart 3 collectors/freeipmi.plugin navpart 3 collectors/nfacct.plugin navpart 3 collectors/xenstat.plugin +navpart 3 collectors/perf.plugin echo -ne " - 'docs/Third-Party-Plugins.md' @@ -251,9 +253,8 @@ navpart 2 web/api/badges "" "" 2 navpart 2 web/api/health "" "" 2 navpart 2 web/api/queries "" "Queries" 2 -echo -ne "- Hacking netdata: +echo -ne "- Hacking Netdata: - CODE_OF_CONDUCT.md - - 'docs/Netdata-Security-and-Disclosure-Information.md' - CONTRIBUTORS.md " navpart 2 packaging/makeself "" "" 4 diff --git a/docs/generator/checklinks.sh b/docs/generator/checklinks.sh index 6538d39b..acc14465 100755 --- a/docs/generator/checklinks.sh +++ b/docs/generator/checklinks.sh @@ -21,8 +21,8 @@ printhelp () { By default, nothing is actually checked. The following options tell it what to check: -a Check all link types -w Check wiki links (and just warn if you see one) - -b Check absolute links to the netdata repo (and change them to relative). Only checks links to https://github.com/netdata/netdata/????/master* - -l Check relative links to the netdata repo (and replace them with links that the html static site can live with, under docs/generator/src only) + -b Check absolute links to the Netdata repo (and change them to relative). Only checks links to https://github.com/netdata/netdata/????/master* + -l Check relative links to the Netdata repo (and replace them with links that the html static site can live with, under docs/generator/src only) -e Check external links, outside the wiki or the repo (useless without adding the -u option, to verify that they're not broken) " } @@ -233,7 +233,7 @@ checklinks () { if [ "$CHKWIKI" -eq 1 ] ; then echo "-- WARNING: $f - $lnk points to the wiki. 
Please replace it manually" ; fi ;; https://github.com/netdata/netdata/????/master* ) - echo "-- ERROR: $f - $lnk is an absolute link to a netdata file. Please convert to relative." + echo "-- ERROR: $f - $lnk is an absolute link to a Netdata file. Please convert to relative." EXITCODE=1 ;; http* ) diff --git a/docs/generator/custom/img/favicon.ico b/docs/generator/custom/img/favicon.ico Binary files differindex 7ed95725..703716cd 100644 --- a/docs/generator/custom/img/favicon.ico +++ b/docs/generator/custom/img/favicon.ico diff --git a/docs/high-performance-netdata.md b/docs/high-performance-netdata.md index a9947d9b..553ad6da 100644 --- a/docs/high-performance-netdata.md +++ b/docs/high-performance-netdata.md @@ -1,18 +1,18 @@ -# High performance netdata +# High performance Netdata -If you plan to run a netdata public on the internet, you will get the most performance out of it by following these rules: +If you plan to run a Netdata public on the internet, you will get the most performance out of it by following these rules: ## 1. run behind nginx -The internal web server is optimized to provide the best experience with few clients connected to it. Normally a web browser will make 4-6 concurrent connections to a web server, so that it can send requests in parallel. To best serve a single client, netdata spawns a thread for each connection it receives (so 4-6 threads per connected web browser). +The internal web server is optimized to provide the best experience with few clients connected to it. Normally a web browser will make 4-6 concurrent connections to a web server, so that it can send requests in parallel. To best serve a single client, Netdata spawns a thread for each connection it receives (so 4-6 threads per connected web browser). -If you plan to have your netdata public on the internet, this strategy wastes resources. It provides a lock-free environment so each thread is autonomous to serve the browser, but it does not scale well. 
Running netdata behind nginx, idle connections to netdata can be reused, thus improving significantly the performance of netdata. +If you plan to have your Netdata public on the internet, this strategy wastes resources. It provides a lock-free environment so each thread is autonomous to serve the browser, but it does not scale well. Running Netdata behind nginx, idle connections to Netdata can be reused, thus improving significantly the performance of Netdata. In the following nginx configuration we do the following: -- allow nginx to maintain up to 1024 idle connections to netdata (so netdata will have up to 1024 threads waiting for requests) +- allow nginx to maintain up to 1024 idle connections to Netdata (so Netdata will have up to 1024 threads waiting for requests) -- allow nginx to compress the responses of netdata (later we will disable gzip compression at netdata) +- allow nginx to compress the responses of Netdata (later we will disable gzip compression at Netdata) - we disable wordpress pingback attacks and allow only GET, HEAD and OPTIONS requests. @@ -65,14 +65,14 @@ Then edit `/etc/netdata/netdata.conf` and set these config options: These options: -- `[global].bind socket to IP = 127.0.0.1` makes netdata listen only for requests from localhost (nginx). -- `[global].access log = none` disables the access.log of netdata. It is not needed since netdata only listens for requests on 127.0.0.1 and thus only nginx can access it. nginx has its own access.log for your record. +- `[global].bind socket to IP = 127.0.0.1` makes Netdata listen only for requests from localhost (nginx). +- `[global].access log = none` disables the access.log of Netdata. It is not needed since Netdata only listens for requests on 127.0.0.1 and thus only nginx can access it. nginx has its own access.log for your record. - `[global].disconnect idle web clients after seconds = 3600` will kill inactive web threads after an hour of inactivity. 
-- `[global].enable web responses gzip compression = no` disables gzip compression at netdata (nginx will compress the responses). +- `[global].enable web responses gzip compression = no` disables gzip compression at Netdata (nginx will compress the responses). ## 2. increase open files limit (non-systemd) -By default Linux limits open file descriptors per process to 1024. This means that less than half of this number of client connections can be accepted by both nginx and netdata. To increase them, create 2 new files: +By default Linux limits open file descriptors per process to 1024. This means that less than half of this number of client connections can be accepted by both nginx and Netdata. To increase them, create 2 new files: 1. `/etc/security/limits.d/nginx.conf`, with these contents: diff --git a/docs/netdata-for-IoT.md b/docs/netdata-for-IoT.md index 97fba07e..ca538543 100644 --- a/docs/netdata-for-IoT.md +++ b/docs/netdata-for-IoT.md @@ -2,7 +2,7 @@ ![image1](https://cloud.githubusercontent.com/assets/2662304/14252446/11ae13c4-fa90-11e5-9d03-d93a3eb3317a.gif) -> New to netdata? Check its demo: **[https://my-netdata.io/](https://my-netdata.io/)** +> New to Netdata? 
Check its demo: **[https://my-netdata.io/](https://my-netdata.io/)** > > [![User Base](https://registry.my-netdata.io/api/v1/badge.svg?chart=netdata.registry_entries&dimensions=persons&label=user%20base&units=null&value_color=blue&precision=0&v41)](https://registry.my-netdata.io/#netdata_registry) [![Monitored Servers](https://registry.my-netdata.io/api/v1/badge.svg?chart=netdata.registry_entries&dimensions=machines&label=servers%20monitored&units=null&value_color=orange&precision=0&v41)](https://registry.my-netdata.io/#netdata_registry) [![Sessions Served](https://registry.my-netdata.io/api/v1/badge.svg?chart=netdata.registry_sessions&label=sessions%20served&units=null&value_color=yellowgreen&precision=0&v41)](https://registry.my-netdata.io/#netdata_registry) > @@ -10,23 +10,23 @@ --- -netdata is a **very efficient** server performance monitoring solution. When running in server hardware, it can collect thousands of system and application metrics **per second** with just 1% CPU utilization of a single core. Its web server responds to most data requests in about **half a millisecond** making its web dashboards spontaneous, amazingly fast! +Netdata is a **very efficient** server performance monitoring solution. When running in server hardware, it can collect thousands of system and application metrics **per second** with just 1% CPU utilization of a single core. Its web server responds to most data requests in about **half a millisecond** making its web dashboards spontaneous, amazingly fast! -netdata can also be a very efficient real-time monitoring solution for **IoT devices** (RPIs, routers, media players, wifi access points, industrial controllers and sensors of all kinds). Netdata will generally run everywhere a Linux kernel runs (and it is glibc and [musl-libc](https://www.musl-libc.org/) friendly). 
+Netdata can also be a very efficient real-time monitoring solution for **IoT devices** (RPIs, routers, media players, wifi access points, industrial controllers and sensors of all kinds). Netdata will generally run everywhere a Linux kernel runs (and it is glibc and [musl-libc](https://www.musl-libc.org/) friendly). You can use it as both a data collection agent (where you pull data using its API), for embedding its charts on other web pages / consoles, but also for accessing it directly with your browser to view its dashboard. -The netdata web API already provides **reduce** functions allowing it to report **average** and **max** for any timeframe. It can also respond in many formats including JSON, JSONP, CSV, HTML. Its API is also a **google charts** provider so it can directly be used by google sheets, google charts, google widgets. +The Netdata web API already provides **reduce** functions allowing it to report **average** and **max** for any timeframe. It can also respond in many formats including JSON, JSONP, CSV, HTML. Its API is also a **google charts** provider so it can directly be used by google sheets, google charts, google widgets. ![sensors](https://cloud.githubusercontent.com/assets/2662304/15339745/8be84540-1c8e-11e6-9e9a-106dea7539b6.gif) -Although netdata has been significantly optimized to lower the CPU and RAM resources it consumes, the plethora of data collection plugins may be inappropriate for weak IoT devices. Please follow the guide on [running netdata in embedded devices](Performance.md) +Although Netdata has been significantly optimized to lower the CPU and RAM resources it consumes, the plethora of data collection plugins may be inappropriate for weak IoT devices. Please follow the guide on [running Netdata in embedded devices](Performance.md) ## Monitoring RPi temperature The python version of the sensors plugin uses `lm-sensors`. Unfortunately the temperature reading of RPi are not supported by `lm-sensors`. 
-netdata also has a bash version of the sensors plugin that can read RPi temperatures. It is disabled by default to avoid the conflicts with the python version. +Netdata also has a bash version of the sensors plugin that can read RPi temperatures. It is disabled by default to avoid the conflicts with the python version. To enable it, run `sudo edit-config charts.d.conf` and uncomment this line: @@ -34,7 +34,7 @@ To enable it, run `sudo edit-config charts.d.conf` and uncomment this line: sensors=force ``` -Then restart netdata. You will get this: +Then restart Netdata. You will get this: ![image](https://user-images.githubusercontent.com/2662304/29658868-23aa65ae-88c5-11e7-9dad-c159600db5cc.png) diff --git a/docs/netdata-security.md b/docs/netdata-security.md index 64288106..955abebd 100644 --- a/docs/netdata-security.md +++ b/docs/netdata-security.md @@ -163,7 +163,7 @@ If sending this information to the central Netdata registry violates your securi Starting with v1.12 Netdata also collects [anonymous statistics](anonymous-statistics.md) on certain events for: -1. **Quality assurance**, to help us understand if netdata behaves as expected and help us identify repeating issues for certain distributions or environments. +1. **Quality assurance**, to help us understand if Netdata behaves as expected and help us identify repeating issues for certain distributions or environments. 2. **Usage statistics**, to help us focus on the parts of Netdata that are used the most, or help us identify the extent our development decisions influence the community. 
@@ -171,11 +171,11 @@ To opt-out from sending anonymous statistics, you can create a file called `.opt ## Netdata directories -path|owner|permissions| netdata |comments| +path|owner|permissions| Netdata |comments| :---|:----|:----------|:--------|:-------| -`/etc/netdata`|user `root`<br/>group `netdata`|dirs `0755`<br/>files `0640`|reads|**netdata config files**<br/>may contain sensitive information, so group `netdata` is allowed to read them. -`/usr/libexec/netdata`|user `root`<br/>group `root`|executable by anyone<br/>dirs `0755`<br/>files `0644` or `0755`|executes|**netdata plugins**<br/>permissions depend on the file - not all of them should have the executable flag.<br/>there are a few plugins that run with escalated privileges (Linux capabilities or `setuid`) - these plugins should be executable only by group `netdata`. -`/usr/share/netdata`|user `root`<br/>group `netdata`|readable by anyone<br/>dirs `0755`<br/>files `0644`|reads and sends over the network|**Netdata web static files**<br/>these files are sent over the network to anyone that has access to the netdata web server. Netdata checks the ownership of these files (using settings at the `[web]` section of `netdata.conf`) and refuses to serve them if they are not properly owned. Symbolic links are not supported. Netdata also refuses to serve URLs with `..` in their name. +`/etc/netdata`|user `root`<br/>group `netdata`|dirs `0755`<br/>files `0640`|reads|**Netdata config files**<br/>may contain sensitive information, so group `netdata` is allowed to read them. +`/usr/libexec/netdata`|user `root`<br/>group `root`|executable by anyone<br/>dirs `0755`<br/>files `0644` or `0755`|executes|**Netdata plugins**<br/>permissions depend on the file - not all of them should have the executable flag.<br/>there are a few plugins that run with escalated privileges (Linux capabilities or `setuid`) - these plugins should be executable only by group `netdata`. 
+`/usr/share/netdata`|user `root`<br/>group `netdata`|readable by anyone<br/>dirs `0755`<br/>files `0644`|reads and sends over the network|**Netdata web static files**<br/>these files are sent over the network to anyone that has access to the Netdata web server. Netdata checks the ownership of these files (using settings at the `[web]` section of `netdata.conf`) and refuses to serve them if they are not properly owned. Symbolic links are not supported. Netdata also refuses to serve URLs with `..` in their name. `/var/cache/netdata`|user `netdata`<br/>group `netdata`|dirs `0750`<br/>files `0660`|reads, writes, creates, deletes|**Netdata ephemeral database files**<br/>Netdata stores its ephemeral real-time database here. `/var/lib/netdata`|user `netdata`<br/>group `netdata`|dirs `0750`<br/>files `0660`|reads, writes, creates, deletes|**Netdata permanent database files**<br/>Netdata stores here the registry data, health alarm log db, etc. `/var/log/netdata`|user `netdata`<br/>group `root`|dirs `0755`<br/>files `0644`|writes, creates|**Netdata log files**<br/>all the Netdata applications, logs their errors or other informational messages to files in this directory. These files should be log rotated. diff --git a/docs/privacy-policy.md b/docs/privacy-policy.md index af50b885..e46d783e 100644 --- a/docs/privacy-policy.md +++ b/docs/privacy-policy.md @@ -32,21 +32,34 @@ Note that you can learn about Google’s practices in connection with its analyt Information from Cookies: We and our service providers (for example, Google Analytics as described above) may collect information using cookies or similar technologies for the purposes described above and below. Cookies are pieces of information that are stored by your browser on the hard drive or memory of your computer or other Internet access device. 
Cookies may enable us to personalize your experience on the Services, maintain a persistent session, passively collect demographic information about your computer, and monitor advertisements and other activities. The Websites may use different kinds of cookies and other types of local storage (such as browser-based or plugin-based local storage). +ND Registry: The global registry, together with certain browser features, allows Netdata to provide unified cross-server dashboards, via the node menu. +The menu lists the Netdata servers you have visited. For example, when you jump from server to server using the node menu, several session settings +(like the currently viewed charts, the current zoom and pan operations on the charts, etc.) are propagated to the new server, so that the new dashboard will come with exactly the +same view. The global registry keeps track of 4 entities: -ND Registry: The global registry, together with certain browser features, allow netdata to provide unified cross-server dashboards, via the `my-netdata` menu. The menu lists the netdata servers you have visited. For example, when you jump from server to server using the `my-netdata` menu, several session settings (like the currently viewed charts, the current zoom and pan operations on the charts, etc.) are propagated to the new server, so that the new dashboard will come with exactly the same view. The global registry keeps track of 3 entities: -1. **machines**: i.e. the netdata installations (a random GUID generated by each netdata the first time it starts; we call this **machine_guid**). For each netdata installation (each `machine_guid`) the registry keeps track of the different URLs it is accessed. +1. **machines**: i.e. the netdata installations (a random GUID generated by each netdata the first time it starts; we call this **machine_guid**) + For each netdata installation (each `machine_guid`) the registry keeps track of the different URLs it is accessed from. -2. **persons**: i.e.
the web browsers accessing the netdata installations (a random GUID generated by the registry the first time it sees a new web browser; we call this **person_guid**). For each person, the registry keeps track of the netdata installations it has accessed and their URLs. +2. **persons**: i.e. the web browsers accessing the netdata installations (a random GUID generated by the registry the first time it sees a new web browser; we call this **person_guid**) -3. **URLs** of netdata installations (as seen by the web browsers). For each URL, the registry keeps the URL and nothing more. Each URL is linked to *persons* and *machines*. The only way to find a URL is to know its **machine_guid** or have a **person_guid** it is linked to it. + For each person, the registry keeps track of the netdata installations it has accessed and their URLs. + +3. **URLs** of netdata installations (as seen by the web browsers) + + For each URL, the registry keeps the URL and nothing more. Each URL is linked to *persons* and *machines*. The only way to find a URL is to know its **machine_guid** or have a **person_guid** it is linked to it. + +4. **accounts**: i.e. the information used to sign-in via one of the available sign-in methods. Depending on the method, this may include an email, an email and a profile picture. + +For *persons*/*accounts* and *machines*, the registry keeps links to *URLs*, each link with 2 timestamps (first time seen, last time seen) and a counter (number of times it has been seen). +*machines*, *persons*, and timestamps are stored in the netdata registry regardless of whether you sign in or not. If sending this information is against your policies, you can [run your own registry](../registry/#run-your-own-registry). Note that ND versions with the 'Sign in' feature of the ND Cloud do not use the global registry. 
ND Cloud: When you sign up to obtain a user account via the 'Sign in' link on the ND agent user interface, ND is granted access to personal information in the user profile of the authentication provider you choose (e.g. GitHub or Google). ND collects and uses this personal information pursuant to its legitimate interest in establishing and maintaining your account and providing you with the features we provide Registered Users. We may use your email address to contact you regarding changes to this policy or other applicable policies. The login name or email address of your profile may be used to attribute you in connection with any content you submit to any Service. -Anonymous Usage Statistics: From Netdata v1.12 and above, anonymous usage information is collected by default on certain events of the ND daemon and sent to Google Analytics. Every time the daemon is started or stopped and every time a fatal condition is encountered, netdata collects system information and sends it to GA via an http call. The information collected for all events is: +Anonymous Usage Statistics: From Netdata v1.12 and above, anonymous usage information is collected by default on certain events of the ND daemon and sent to Google Analytics. Every time the daemon is started or stopped and every time a fatal condition is encountered, Netdata collects system information and sends it to GA via an http call. The information collected for all events is: - Netdata version - OS name, version, id, id_like - Kernel name, version, architecture @@ -56,9 +69,9 @@ Furthermore, the FATAL event sends the Netdata process & thread info, along with The statistics calculated from this information are used for: -1. **Quality assurance**, to help us understand if netdata behaves as expected and help us identify repeating issues for certain distributions or environment. +1.
**Quality assurance**, to help us understand if Netdata behaves as expected and help us identify repeating issues for certain distributions or environment. -2. **Usage statistics**, to help us focus on the parts of netdata that are used the most, or help us identify the extent our development decisions influence the community. +2. **Usage statistics**, to help us focus on the parts of Netdata that are used the most, or help us identify the extent our development decisions influence the community. To opt-out from sending anonymous statistics, you can create a file called `.opt-out-from-anonymous-statistics` under the user configuration directory (usually `/etc/netdata`). diff --git a/docs/why-netdata/1s-granularity.md b/docs/why-netdata/1s-granularity.md index 08985454..0d12a2d4 100644 --- a/docs/why-netdata/1s-granularity.md +++ b/docs/why-netdata/1s-granularity.md @@ -34,13 +34,13 @@ So, the monitoring industry fails to massively provide high resolution metrics, 2. Data collection needs optimization, otherwise it will significantly affect the monitored systems. 3. Data collection is a lot harder, especially on busy virtual environments. -## What does netdata do differently? +## What does Netdata do differently? Netdata decentralizes monitoring completely. Each Netdata node is autonomous. It collects metrics locally, it stores them locally, it runs checks against them to trigger alarms locally, and provides an API for the dashboards to visualize them. This allows Netdata to scale to infinity. Of course, Netdata can centralize metrics when needed. For example, it is not practical to keep metrics locally on ephemeral nodes. For these cases, Netdata streams the metrics in real-time, from the ephemeral nodes to one or more non-ephemeral nodes nearby. This centralization is again distributed. On a large infrastructure, there may be many centralization points.
-To eliminate the error introduced by data collection latencies on busy virtual environments, Netdata interpolates collected metrics. It does this using microsecond timings, per data source, offering measurements with an error rate of 0.0001%. When running [in debug mode, netdata calculates this error rate](https://github.com/netdata/netdata/blob/36199f449852f8077ea915a3a14a33fa2aff6d85/database/rrdset.c#L1070-L1099) for every point collected, ensuring that the database works with acceptable accuracy. +To eliminate the error introduced by data collection latencies on busy virtual environments, Netdata interpolates collected metrics. It does this using microsecond timings, per data source, offering measurements with an error rate of 0.0001%. When running [in debug mode, Netdata calculates this error rate](https://github.com/netdata/netdata/blob/36199f449852f8077ea915a3a14a33fa2aff6d85/database/rrdset.c#L1070-L1099) for every point collected, ensuring that the database works with acceptable accuracy. Finally, Netdata is really fast. Optimization is a core product feature. On modern hardware, Netdata can collect metrics with a rate of above 1M metrics per second per core (this includes everything, parsing data sources, interpolating data, storing data in the time series database, etc). So, for a few thousands metrics per second per node, Netdata needs negligible CPU resources (just 1-2% of a single core). diff --git a/docs/why-netdata/immediate-results.md b/docs/why-netdata/immediate-results.md index 9afe4afd..12333671 100644 --- a/docs/why-netdata/immediate-results.md +++ b/docs/why-netdata/immediate-results.md @@ -20,7 +20,7 @@ Open-source solutions rely almost entirely on configuration. So, you have to go Monitoring SaaS providers offer a very basic set of pre-configured metrics, dashboards and alarms. They assume that you will configure the rest you may need. So, once more, the result will reflect your skills, your experience, your understanding. 
-## What does netdata do? +## What does Netdata do? 1. Metrics are auto-detected, so for 99% of the cases data collection works out of the box. 2. Metrics are converted to human readable units, right after data collection, before storing them into the database. diff --git a/docs/why-netdata/meaningful-presentation.md b/docs/why-netdata/meaningful-presentation.md index 6414d023..f6fd0756 100644 --- a/docs/why-netdata/meaningful-presentation.md +++ b/docs/why-netdata/meaningful-presentation.md @@ -42,9 +42,9 @@ Of course, it is just not practical to work that way when the database has 10,00 So, they collect very limited metrics. Basic dashboards can be created with these metrics, but for any issue that needs to be troubleshooted, the monitoring system is just not adequate. It cannot help. So, engineers are using the console to access the rest of the metrics and find the root cause. -## What does netdata do? +## What does Netdata do? -In netdata, the meaning of metrics is incorporated into the database: +In Netdata, the meaning of metrics is incorporated into the database: 1. all metrics are converted and stored to human-friendly units. This is a data-collection process, not a visualization process. For example, cpu utilization in Netdata is stored as percentage, not as kernel ticks. diff --git a/docs/why-netdata/unlimited-metrics.md b/docs/why-netdata/unlimited-metrics.md index e35034a2..a4ecaf3f 100644 --- a/docs/why-netdata/unlimited-metrics.md +++ b/docs/why-netdata/unlimited-metrics.md @@ -33,12 +33,12 @@ They can't do otherwise! 2. It is a lot easier to provide an illusion of monitoring by using a few basic metrics. 3. Troubleshooting slowdowns is the hardest IT problem to solve, so most solutions just avoid it. -## What does netdata do? +## What does Netdata do? Netdata collects, stores and visualizes everything, every single metric exposed by systems and applications. 
Due to Netdata's distributed nature, the number of metrics collected does not have any noticeable effect on the performance or the cost of the monitoring infrastructure. -Of course, since netdata is also about [meaningful presentation](meaningful-presentation.md), the number of metrics makes Netdata development slower. We, the Netdata developers, need to have a good understanding of the metrics before adding them into Netdata. We need to organize the metrics, add information related to them, configure alarms for them, so that you, the Netdata users, will have the best out-of-the-box experience and all the information required to kill the console for troubleshooting slowdowns. +Of course, since Netdata is also about [meaningful presentation](meaningful-presentation.md), the number of metrics makes Netdata development slower. We, the Netdata developers, need to have a good understanding of the metrics before adding them into Netdata. We need to organize the metrics, add information related to them, configure alarms for them, so that you, the Netdata users, will have the best out-of-the-box experience and all the information required to kill the console for troubleshooting slowdowns. 
[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fdocs%2Fwhy-netdata%2Funlimited-metrics&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() diff --git a/health/Makefile.am b/health/Makefile.am index 62a4c6d3..5310bd8a 100644 --- a/health/Makefile.am +++ b/health/Makefile.am @@ -35,6 +35,7 @@ dist_healthconfig_DATA = \ health.d/cpu.conf \ health.d/couchdb.conf \ health.d/disks.conf \ + health.d/dnsmasq_dhcp.conf \ health.d/dockerd.conf \ health.d/elasticsearch.conf \ health.d/entropy.conf \ @@ -62,13 +63,16 @@ dist_healthconfig_DATA = \ health.d/netfilter.conf \ health.d/nginx.conf \ health.d/nginx_plus.conf \ + health.d/pihole.conf \ health.d/phpfpm.conf \ health.d/portcheck.conf \ health.d/postgres.conf \ + health.d/processes.conf \ health.d/qos.conf \ health.d/ram.conf \ health.d/redis.conf \ health.d/retroshare.conf \ + health.d/riakkv.conf \ health.d/softnet.conf \ health.d/squid.conf \ health.d/stiebeleltron.conf \ @@ -81,6 +85,8 @@ dist_healthconfig_DATA = \ health.d/udp_errors.conf \ health.d/varnish.conf \ health.d/web_log.conf \ + health.d/wmi.conf \ health.d/x509check.conf \ health.d/zfs.conf \ + health.d/dbengine.conf \ $(NULL) diff --git a/health/README.md b/health/README.md index 54f6a3e1..81cc043d 100644 --- a/health/README.md +++ b/health/README.md @@ -11,7 +11,6 @@ packet dropped). Netdata also supports alarm **templates**, so that an alarm can be attached to all the charts of the same context (i.e. all network interfaces, or all disks, or all mysql servers, etc.). - Each alarm can execute a single query to the database using statistical algorithms against past data, but alarms can be combined. So, if you need 2 queries in the database, you can combine 2 alarms together (both will run a query to the database, and the results can be combined). 
@@ -342,6 +341,24 @@ delay: [[[up U] [down D] multiplier M] max X] their matching one) and a delay is in place. - All are reset to their defaults when the alarm switches state without a delay in place. +--- + +#### Alarm line `repeat` + +Defines the interval between repeating notifications for the alarms in CRITICAL or WARNING mode. This will override the default interval settings inherited from health settings in `netdata.conf`. The default settings for repeating notifications are `default repeat warning = DURATION` and `default repeat critical = DURATION` which can be found in health stock configuration. + +Format: + +``` +repeat: [off] [warning DURATION] [critical DURATION] +``` + +* `off`: Turns off the repeating feature for the current alarm. This is effective when the default repeat settings has been enabled in health configuration. +* `warning DURATION`: Defines the interval when the alarm is in WARNING state. Use `0s` to turn off the repeating notification for WARNING mode. +* `critical DURATION`: Defines the interval when the alarm is in CRITICAL state. Use `0s` to turn off the repeating notification for CRITICAL mode. + +--- + #### Alarm line `option` The only possible value for the `option` line is @@ -567,12 +584,15 @@ template: disk_full_percent every: 1m warn: $this > 80 crit: $this > 95 + repeat: warning 120s critical 10s ``` `$used` and `$avail` are the `used` and `avail` chart dimensions as shown on the dashboard. So, the `calc` line finds the percentage of used space. `$this` resolves to this percentage. +This is a repeating alarm and if the alarm becomes CRITICAL it repeats the notifications every 10 seconds. It also repeats notifications every 2 minutes if the alarm goes into WARNING mode. + ### Example 3 Predict if any disk will run out of space in the near future. 
diff --git a/health/health.c b/health/health.c index f92a1ba6..55bd7284 100644 --- a/health/health.c +++ b/health/health.c @@ -13,18 +13,74 @@ unsigned int default_health_enabled = 1; // ---------------------------------------------------------------------------- // health initialization +/** + * User Config directory + * + * Get the config directory for health and return it. + * + * @return a pointer to the user config directory + */ inline char *health_user_config_dir(void) { char buffer[FILENAME_MAX + 1]; snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_user_config_dir); return config_get(CONFIG_SECTION_HEALTH, "health configuration directory", buffer); } +/** + * Stock Config Directory + * + * Get the Stock config directory and return it. + * + * @return a pointer to the stock config directory. + */ inline char *health_stock_config_dir(void) { char buffer[FILENAME_MAX + 1]; snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_stock_config_dir); return config_get(CONFIG_SECTION_HEALTH, "stock health configuration directory", buffer); } +/** + * Silencers init + * + * Function used to initialize the silencer structure. 
+ */ +void health_silencers_init(void) { + struct stat statbuf; + if (!stat(silencers_filename,&statbuf)) { + off_t length = statbuf.st_size; + if (length && length < HEALTH_SILENCERS_MAX_FILE_LEN) { + FILE *fd = fopen(silencers_filename, "r"); + if (fd) { + char *str = mallocz((length+1)* sizeof(char)); + if(str) { + size_t copied; + copied = fread(str, sizeof(char), length, fd); + if (copied == (length* sizeof(char))) { + str[length] = 0x00; + json_parse(str, NULL, health_silencers_json_read_callback); + info("Parsed health silencers file %s", silencers_filename); + } else { + error("Cannot read the data from health silencers file %s", silencers_filename); + } + freez(str); + } + fclose(fd); + } else { + error("Cannot open the file %s",silencers_filename); + } + } else { + error("Health silencers file %s has the size %ld that is out of range[ 1 , %d ]. Aborting read.", silencers_filename, length, HEALTH_SILENCERS_MAX_FILE_LEN); + } + } else { + error("Cannot open the file %s",silencers_filename); + } +} + +/** + * Health Init + * + * Initialize the health thread. + */ void health_init(void) { debug(D_HEALTH, "Health configuration initializing"); @@ -32,11 +88,20 @@ void health_init(void) { debug(D_HEALTH, "Health is disabled."); return; } + + health_silencers_init(); } // ---------------------------------------------------------------------------- // re-load health configuration +/** + * Reload host + * + * Reload configuration for a specific host. + * + * @param host the structure of the host that the function will reload the configuration. + */ void health_reload_host(RRDHOST *host) { if(unlikely(!host->health_enabled)) return; @@ -84,6 +149,11 @@ void health_reload_host(RRDHOST *host) { rrdhost_unlock(host); } +/** + * Reload + * + * Reload the host configuration for all hosts. 
+ */ void health_reload(void) { rrd_rdlock(); @@ -255,17 +325,18 @@ static inline void health_alarm_log_process(RRDHOST *host) { netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock); ALARM_ENTRY *ae; - for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id ; ae = ae->next) { - if(unlikely( - !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) && - !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED) + for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) { + if(likely(!alarm_entry_isrepeating(host, ae))) { + if(unlikely( + !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) && + !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED) )) { + if(unlikely(ae->unique_id < first_waiting)) + first_waiting = ae->unique_id; - if(unlikely(ae->unique_id < first_waiting)) - first_waiting = ae->unique_id; - - if(likely(now >= ae->delay_up_to_timestamp)) - health_process_notifications(host, ae); + if(likely(now >= ae->delay_up_to_timestamp)) + health_process_notifications(host, ae); + } } } @@ -294,10 +365,12 @@ static inline void health_alarm_log_process(RRDHOST *host) { ALARM_ENTRY *t = ae->next; - health_alarm_log_free_one_nochecks_nounlink(ae); + if(likely(!alarm_entry_isrepeating(host, ae))) { + health_alarm_log_free_one_nochecks_nounlink(ae); + host->health_log.count--; + } ae = t; - host->health_log.count--; } netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); @@ -411,7 +484,7 @@ SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) { debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. 
The match has no effect.", rc->name); } else { debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s" - , (silencers->stype==STYPE_DISABLE_ALARMS)?"Disabled":"Silenced" + , (silencers->stype == STYPE_DISABLE_ALARMS)?"Disabled":"Silenced" , rc->name , (rc->rrdset)?rc->rrdset->context:"" , rc->chart @@ -425,6 +498,16 @@ SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) { return STYPE_NONE; } +/** + * Update Disabled Silenced + * + * Update the variable rrdcalc_flags of the structure RRDCALC according with the values of the host structure + * + * @param host structure that contains information about the host monitored. + * @param rc structure with information about the alarm + * + * @return It returns 1 case rrdcalc_flags is DISABLED or 0 otherwise + */ int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) { uint32_t rrdcalc_flags_old = rc->rrdcalc_flags; // Clear the flags @@ -454,6 +537,15 @@ int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) { return 0; } +/** + * Health Main + * + * The main thread of the health system. In this function all the alarms will be processed. + * + * @param ptr is a pointer to the netdata_static_thread structure. 
+ * + * @return It always returns NULL + */ void *health_main(void *ptr) { netdata_thread_cleanup_push(health_main_cleanup, ptr); @@ -464,12 +556,6 @@ void *health_main(void *ptr) { time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60); unsigned int loop = 0; - - silencers = mallocz(sizeof(SILENCERS)); - silencers->all_alarms=0; - silencers->stype=STYPE_NONE; - silencers->silencers=NULL; - while(!netdata_exit) { loop++; debug(D_HEALTH, "Health monitoring iteration no %u started", loop); @@ -756,20 +842,22 @@ void *health_main(void *ptr) { rc->delay_last = delay; rc->delay_up_to_timestamp = now + delay; - health_alarm_log( - host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, - rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, - rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, - rc->delay_last, - ( - ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) | - ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) - ) - - ); - - rc->last_status_change = now; - rc->status = status; + if(likely(!rrdcalc_isrepeating(rc))) { + ALARM_ENTRY *ae = health_create_alarm_entry( + host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, + rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, + rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, + rc->delay_last, + ( + ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) | + ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? 
HEALTH_ENTRY_FLAG_SILENCED : 0) + ) + ); + health_alarm_log(host, ae); + } + rc->last_status_change = now; + rc->old_status = rc->status; + rc->status = status; } rc->last_updated = now; @@ -779,6 +867,35 @@ void *health_main(void *ptr) { next_run = rc->next_update; } + // process repeating alarms + RRDCALC *rc; + for(rc = host->alarms; rc ; rc = rc->next) { + int repeat_every = 0; + if(unlikely(rrdcalc_isrepeating(rc))) { + if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) + repeat_every = rc->warn_repeat_every; + else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) + repeat_every = rc->crit_repeat_every; + } + if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) { + rc->last_repeat = now; + ALARM_ENTRY *ae = health_create_alarm_entry( + host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, + rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, + rc->old_value, rc->value, rc->old_status, rc->status, rc->source, rc->units, rc->info, + rc->delay_last, + ( + ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) | + ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? 
HEALTH_ENTRY_FLAG_SILENCED : 0) + ) + ); + ae->last_repeat = rc->last_repeat; + health_process_notifications(host, ae); + debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id); + health_alarm_log_free_one_nochecks_nounlink(ae); + } + } + rrdhost_unlock(host); } diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf new file mode 100644 index 00000000..7a623ba2 --- /dev/null +++ b/health/health.d/dbengine.conf @@ -0,0 +1,26 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: 10min_dbengine_global_fs_errors + on: netdata.dbengine_global_errors + os: linux freebsd macos + hosts: * + lookup: sum -10m unaligned of FS errors + units: errors + every: 10s + crit: $this > 0 + delay: down 15m multiplier 1.5 max 1h + info: number of File-System errors dbengine came across the last 10 minutes (too many open files, wrong permissions etc) + to: sysadmin + + alarm: 10min_dbengine_global_io_errors + on: netdata.dbengine_global_errors + os: linux freebsd macos + hosts: * + lookup: sum -10m unaligned of I/O errors + units: errors + every: 10s + crit: $this > 0 + delay: down 1h multiplier 1.5 max 3h + info: number of IO errors dbengine came across the last 10 minutes (out of space, bad disk etc) + to: sysadmin diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf index 26f85848..9c194ced 100644 --- a/health/health.d/disks.conf +++ b/health/health.d/disks.conf @@ -13,7 +13,7 @@ template: disk_space_usage on: disk.space os: linux freebsd hosts: * -families: * +families: !/dev !/dev/* !/run !/run/* * calc: $used * 100 / ($avail + $used) units: % every: 1m @@ -27,7 +27,7 @@ template: disk_inode_usage on: disk.inodes os: linux freebsd hosts: * -families: * +families: !/dev !/dev/* !/run !/run/* * calc: $used * 100 / ($avail + $used) units: % every: 1m diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf new file mode 100644 index 00000000..b7eb4e0a --- 
/dev/null +++ b/health/health.d/dnsmasq_dhcp.conf @@ -0,0 +1,12 @@ + # dhcp-range utilization + + template: dnsmasq_dhcp_dhcp_range_utilization + on: dnsmasq_dhcp.dhcp_range_utilization + every: 10s + units: % + calc: $used + warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) ) + crit: $this > ( ($status >= $CRITICAL) ? ( 90 ) : ( 95 ) ) + delay: down 5m + info: dhcp-range utilization above threshold! + to: sysadmin diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf new file mode 100644 index 00000000..4a121723 --- /dev/null +++ b/health/health.d/pihole.conf @@ -0,0 +1,67 @@ + + # Make sure Pi-hole is responding. + +template: pihole_last_collected_secs + on: pihole.dns_queries_total + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: webmaster + + # Blocked DNS queries. + + template: pihole_blocked_queries + on: pihole.dns_queries_percentage + every: 10s + units: % + calc: $blocked + warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) ) + crit: $this > ( ($status >= $CRITICAL) ? ( 55 ) : ( 75 ) ) + delay: up 2m down 5m + info: percentage of blocked dns queries for the last 24 hour + to: sysadmin + + + # Blocklist last update time. + # Default update interval is a week. + + template: pihole_blocklist_last_update + on: pihole.blocklist_last_update + every: 10s + units: seconds + calc: $ago + warn: $this > 60 * 60 * 24 * 8 + crit: $this > 60 * 60 * 24 * 8 * 2 + info: blocklist last update time + to: sysadmin + + + # Gravity file check (gravity.list). 
+ + template: pihole_blocklist_gravity_file + on: pihole.blocklist_last_update + every: 10s + units: boolean + calc: $file_exists + crit: $this != 1 + delay: up 2m down 5m + info: gravity file existence + to: sysadmin + + + # Pi-hole's ability to block unwanted domains. + # Should be enabled. The whole point of Pi-hole! + + template: pihole_status + on: pihole.unwanted_domains_blocking_status + every: 10s + units: boolean + calc: $enabled + warn: $this != 1 + delay: up 2m down 5m + info: unwanted domains blocking status + to: sysadmin diff --git a/health/health.d/processes.conf b/health/health.d/processes.conf new file mode 100644 index 00000000..d96998fd --- /dev/null +++ b/health/health.d/processes.conf @@ -0,0 +1,27 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: active_processes_limit_freebsd + on: system.active_processes + os: freebsd + hosts: * + calc: $active + units: processes + every: 5s + warn: $this > (($status >= $WARNING) ? (75000) : (80000)) + crit: $this > (($status == $CRITICAL) ? (85000) : (90000)) + delay: down 5m multiplier 1.5 max 1h + info: the number of active processes + to: sysadmin + + alarm: active_processes_limit + on: system.active_processes + os: linux + hosts: * + calc: $active + units: processes + every: 5s + warn: $this > (($status >= $WARNING) ? (25000) : (26000)) + crit: $this > (($status == $CRITICAL) ? 
(28000) : (30000)) + delay: down 5m multiplier 1.5 max 1h + info: number of active processes + to: sysadmin diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf index 93883f73..4e41bb49 100644 --- a/health/health.d/ram.conf +++ b/health/health.d/ram.conf @@ -27,7 +27,7 @@ on: mem.available os: linux hosts: * - calc: ($avail + $used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers) + calc: ($avail + $system.ram.used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers) units: % every: 10s warn: $this < (($status >= $WARNING) ? (15) : (10)) diff --git a/health/health.d/riakkv.conf b/health/health.d/riakkv.conf new file mode 100644 index 00000000..74530277 --- /dev/null +++ b/health/health.d/riakkv.conf @@ -0,0 +1,80 @@ +# Ensure that Riak is running. template: riak_last_collected_secs +template: riak_last_collected_secs + on: riak.kv.throughput + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: dba + +# Warn if a list keys operation is running. 
+template: riak_list_keys_active + on: riak.core.fsm_active + calc: $list_fsm_active + units: state machines + every: 10s + warn: $list_fsm_active > 0 + info: number of currently running list keys finite state machines + to: dba + + +## Timing healthchecks +# KV GET +template: 1h_kv_get_mean_latency + on: riak.kv.latency.get + calc: $node_get_fsm_time_mean + lookup: average -1h unaligned of time + every: 30s + units: ms + info: mean average KV GET latency over the last hour + +template: riak_kv_get_slow + on: riak.kv.latency.get + calc: $mean + lookup: average -3m unaligned of time + units: ms + every: 10s + warn: ($this > ($1h_kv_get_mean_latency * 2) ) + crit: ($this > ($1h_kv_get_mean_latency * 3) ) + info: average KV GET time over the last 3 minutes, compared to the average over the last hour + delay: down 5m multiplier 1.5 max 1h + to: dba + +# KV PUT +template: 1h_kv_put_mean_latency + on: riak.kv.latency.put + calc: $node_put_fsm_time_mean + lookup: average -1h unaligned of time + every: 30s + units: ms + info: mean average KV PUT latency over the last hour + +template: riak_kv_put_slow + on: riak.kv.latency.put + calc: $mean + lookup: average -3m unaligned of time + units: ms + every: 10s + warn: ($this > ($1h_kv_put_mean_latency * 2) ) + crit: ($this > ($1h_kv_put_mean_latency * 3) ) + info: average KV PUT time over the last 3 minutes, compared to the average over the last hour + delay: down 5m multiplier 1.5 max 1h + to: dba + + +## VM healthchecks + +# Default Erlang VM process limit: 262144 +# On systems observed, this is < 2000, but may grow depending on load. 
+template: riak_vm_high_process_count + on: riak.vm + calc: $sys_process_count + units: processes + every: 10s + warn: $this > 10000 + crit: $this > 100000 + info: number of processes running in the Erlang VM (the default limit on ERTS 10.2.4 is 262144) + to: dba diff --git a/health/health.d/wmi.conf b/health/health.d/wmi.conf new file mode 100644 index 00000000..0441fc1f --- /dev/null +++ b/health/health.d/wmi.conf @@ -0,0 +1,130 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + +## Availability + +template: wmi_last_collected_secs + on: cpu.collector_duration + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + +## CPU + +template: wmi_10min_cpu_usage + on: wmi.cpu_utilization_total + os: linux + hosts: * + lookup: average -10m unaligned match-names of dpc,user,privileged,interrupt + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (75) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + info: cpu utilization for the last 10 minutes + to: sysadmin + + +## Memory + +template: wmi_ram_in_use + on: wmi.memory_utilization + os: linux + hosts: * + calc: ($used) * 100 / ($used + $available) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + info: used RAM + to: sysadmin + +template: wmi_swap_in_use + on: wmi.memory_swap_utilization + os: linux + hosts: * + calc: ($used) * 100 / ($used + $available) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? 
(90) : (98)) + delay: down 15m multiplier 1.5 max 1h + info: used Swap + to: sysadmin + + +## Network + +template: inbound_packets_discarded + on: wmi.net_discarded + os: linux + hosts: * +families: * + lookup: sum -10m unaligned absolute match-names of inbound + units: packets + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + info: interface inbound discarded packets in the last 10 minutes + to: sysadmin + +template: outbound_packets_discarded + on: wmi.net_discarded + os: linux + hosts: * +families: * + lookup: sum -10m unaligned absolute match-names of outbound + units: packets + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + info: interface outbound discarded packets in the last 10 minutes + to: sysadmin + +template: inbound_packets_errors + on: wmi.net_errors + os: linux + hosts: * +families: * + lookup: sum -10m unaligned absolute match-names of inbound + units: packets + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + info: interface inbound errors in the last 10 minutes + to: sysadmin + +template: outbound_packets_errors + on: wmi.net_errors + os: linux + hosts: * +families: * + lookup: sum -10m unaligned absolute match-names of outbound + units: packets + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + info: interface outbound errors in the last 10 minutes + to: sysadmin + + +## Disk + +template: wmi_disk_in_use + on: wmi.logical_disk_utilization + os: linux + hosts: * + calc: ($used) * 100 / ($used + $free) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? 
(90) : (98)) + delay: down 15m multiplier 1.5 max 1h + info: used disk space + to: sysadmin diff --git a/health/health.d/x509check.conf b/health/health.d/x509check.conf index dc0e6c69..a56f48fc 100644 --- a/health/health.d/x509check.conf +++ b/health/health.d/x509check.conf @@ -1,4 +1,18 @@ +# make sure x509check is running + +template: x509check_last_collected_secs + on: x509check.time_until_expiration + calc: $now - $last_collected_t + units: seconds ago + every: 60s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: webmaster + + template: x509check_days_until_expiration on: x509check.time_until_expiration calc: $expiry diff --git a/health/health.h b/health/health.h index 1511f364..6920d12d 100644 --- a/health/health.h +++ b/health/health.h @@ -35,16 +35,7 @@ extern unsigned int default_health_enabled; #define HEALTH_LISTEN_BACKLOG 4096 #endif -#define HEALTH_ALARM_KEY "alarm" -#define HEALTH_TEMPLATE_KEY "template" #define HEALTH_ON_KEY "on" -#define HEALTH_CONTEXT_KEY "context" -#define HEALTH_CHART_KEY "chart" -#define HEALTH_HOST_KEY "hosts" -#define HEALTH_OS_KEY "os" -#define HEALTH_FAMILIES_KEY "families" -#define HEALTH_LOOKUP_KEY "lookup" -#define HEALTH_CALC_KEY "calc" #define HEALTH_EVERY_KEY "every" #define HEALTH_GREEN_KEY "green" #define HEALTH_RED_KEY "red" @@ -57,38 +48,9 @@ extern unsigned int default_health_enabled; #define HEALTH_DELAY_KEY "delay" #define HEALTH_OPTIONS_KEY "options" -typedef struct silencer { - char *alarms; - SIMPLE_PATTERN *alarms_pattern; +#define HEALTH_SILENCERS_MAX_FILE_LEN 10000 - char *hosts; - SIMPLE_PATTERN *hosts_pattern; - - char *contexts; - SIMPLE_PATTERN *contexts_pattern; - - char *charts; - SIMPLE_PATTERN *charts_pattern; - - char *families; - SIMPLE_PATTERN *families_pattern; - - struct 
silencer *next; -} SILENCER; - -typedef enum silence_type { - STYPE_NONE, - STYPE_DISABLE_ALARMS, - STYPE_SILENCE_NOTIFICATIONS -} SILENCE_TYPE; - -typedef struct silencers { - int all_alarms; - SILENCE_TYPE stype; - SILENCER *silencers; -} SILENCERS; - -SILENCERS *silencers; +char *silencers_filename; extern void health_init(void); extern void *health_main(void *ptr); @@ -108,7 +70,7 @@ extern void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae); extern ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filename); extern void health_alarm_log_load(RRDHOST *host); -extern void health_alarm_log( +extern ALARM_ENTRY* health_create_alarm_entry( RRDHOST *host, uint32_t alarm_id, uint32_t alarm_event_id, @@ -129,6 +91,8 @@ extern void health_alarm_log( int delay, uint32_t flags); +extern void health_alarm_log(RRDHOST *host, ALARM_ENTRY *ae); + extern void health_readdir(RRDHOST *host, const char *user_path, const char *stock_path, const char *subpath); extern char *health_user_config_dir(void); extern char *health_stock_config_dir(void); diff --git a/health/health_config.c b/health/health_config.c index 35fde90b..0d6e77a9 100644 --- a/health/health_config.c +++ b/health/health_config.c @@ -23,6 +23,7 @@ #define HEALTH_INFO_KEY "info" #define HEALTH_DELAY_KEY "delay" #define HEALTH_OPTIONS_KEY "options" +#define HEALTH_REPEAT_KEY "repeat" static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) { if(!rc->chart) { @@ -45,7 +46,7 @@ static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) { rc->id = rrdcalc_get_unique_id(host, rc->chart, rc->name, &rc->next_event_id); - debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay 
max %d, delay_multiplier %f", + debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u", rc->chart?rc->chart:"NOCHART", rc->name, rc->id, @@ -66,10 +67,12 @@ static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) { rc->delay_up_duration, rc->delay_down_duration, rc->delay_max_duration, - rc->delay_multiplier + rc->delay_multiplier, + rc->warn_repeat_every, + rc->crit_repeat_every ); - rrdcalc_create_part2(host, rc); + rrdcalc_add_to_host(host, rc); return 1; } @@ -100,7 +103,7 @@ static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCAL } } - debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f", + debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u", rt->name, (rt->context)?rt->context:"NONE", (rt->exec)?rt->exec:"DEFAULT", @@ -120,7 +123,9 @@ static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCAL rt->delay_up_duration, 
rt->delay_down_duration, rt->delay_max_duration, - rt->delay_multiplier + rt->delay_multiplier, + rt->warn_repeat_every, + rt->crit_repeat_every ); if(likely(last)) { @@ -134,48 +139,6 @@ static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCAL return 1; } -static inline int health_parse_duration(char *string, int *result) { - // make sure it is a number - if(!*string || !(isdigit(*string) || *string == '+' || *string == '-')) { - *result = 0; - return 0; - } - - char *e = NULL; - calculated_number n = str2ld(string, &e); - if(e && *e) { - switch (*e) { - case 'Y': - *result = (int) (n * 86400 * 365); - break; - case 'M': - *result = (int) (n * 86400 * 30); - break; - case 'w': - *result = (int) (n * 86400 * 7); - break; - case 'd': - *result = (int) (n * 86400); - break; - case 'h': - *result = (int) (n * 3600); - break; - case 'm': - *result = (int) (n * 60); - break; - - default: - case 's': - *result = (int) (n); - break; - } - } - else - *result = (int)(n); - - return 1; -} - static inline int health_parse_delay( size_t line, const char *filename, char *string, int *delay_up_duration, @@ -202,14 +165,14 @@ static inline int health_parse_delay( while(*s && isspace(*s)) *s++ = '\0'; if(!strcasecmp(key, "up")) { - if (!health_parse_duration(value, delay_up_duration)) { + if (!config_parse_duration(value, delay_up_duration)) { error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword", line, filename, value, key); } else given_up = 1; } else if(!strcasecmp(key, "down")) { - if (!health_parse_duration(value, delay_down_duration)) { + if (!config_parse_duration(value, delay_down_duration)) { error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword", line, filename, value, key); } @@ -224,7 +187,7 @@ static inline int health_parse_delay( else given_multiplier = 1; } else if(!strcasecmp(key, "max")) { - if (!health_parse_duration(value, delay_max_duration)) { + if 
(!config_parse_duration(value, delay_max_duration)) { error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword", line, filename, value, key); } @@ -285,6 +248,50 @@ static inline uint32_t health_parse_options(const char *s) { return options; } +static inline int health_parse_repeat( + size_t line, + const char *file, + char *string, + uint32_t *warn_repeat_every, + uint32_t *crit_repeat_every +) { + + char *s = string; + while(*s) { + char *key = s; + + while(*s && !isspace(*s)) s++; + while(*s && isspace(*s)) *s++ = '\0'; + + if(!*key) break; + + char *value = s; + while(*s && !isspace(*s)) s++; + while(*s && isspace(*s)) *s++ = '\0'; + + if(!strcasecmp(key, "off")) { + *warn_repeat_every = 0; + *crit_repeat_every = 0; + return 1; + } + if(!strcasecmp(key, "warning")) { + if (!config_parse_duration(value, (int*)warn_repeat_every)) { + error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword", + line, file, value, key); + } + } + else if(!strcasecmp(key, "critical")) { + if (!config_parse_duration(value, (int*)crit_repeat_every)) { + error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword", + line, file, value, key); + } + } + } + + return 1; +} + + static inline int health_parse_db_lookup( size_t line, const char *filename, char *string, RRDR_GROUPING *group_method, int *after, int *before, int *every, @@ -322,7 +329,7 @@ static inline int health_parse_db_lookup( while(*s && !isspace(*s)) s++; while(*s && isspace(*s)) *s++ = '\0'; - if(!health_parse_duration(key, after)) { + if(!config_parse_duration(key, after)) { error("Health configuration at line %zu of file '%s': invalid duration '%s' after group method", line, filename, key); return 0; @@ -343,7 +350,7 @@ static inline int health_parse_db_lookup( while(*s && !isspace(*s)) s++; while(*s && isspace(*s)) *s++ = '\0'; - if (!health_parse_duration(value, before)) { + if (!config_parse_duration(value, before)) { 
error("Health configuration at line %zu of file '%s': invalid duration '%s' for '%s' keyword", line, filename, value, key); } @@ -353,7 +360,7 @@ static inline int health_parse_db_lookup( while(*s && !isspace(*s)) s++; while(*s && isspace(*s)) *s++ = '\0'; - if (!health_parse_duration(value, every)) { + if (!config_parse_duration(value, every)) { error("Health configuration at line %zu of file '%s': invalid duration '%s' for '%s' keyword", line, filename, value, key); } @@ -430,7 +437,8 @@ static int health_readfile(const char *filename, void *data) { hash_info = 0, hash_recipient = 0, hash_delay = 0, - hash_options = 0; + hash_options = 0, + hash_repeat = 0; char buffer[HEALTH_CONF_MAX_LINE + 1]; @@ -454,6 +462,7 @@ static int health_readfile(const char *filename, void *data) { hash_recipient = simple_hash(HEALTH_RECIPIENT_KEY); hash_delay = simple_uhash(HEALTH_DELAY_KEY); hash_options = simple_uhash(HEALTH_OPTIONS_KEY); + hash_repeat = simple_uhash(HEALTH_REPEAT_KEY); } FILE *fp = fopen(filename, "r"); @@ -481,7 +490,7 @@ static int health_readfile(const char *filename, void *data) { if(append < HEALTH_CONF_MAX_LINE) continue; else { - error("Health configuration has too long muli-line at line %zu of file '%s'.", line, filename); + error("Health configuration has too long multi-line at line %zu of file '%s'.", line, filename); } } append = 0; @@ -532,6 +541,9 @@ static int health_readfile(const char *filename, void *data) { rc->value = NAN; rc->old_value = NAN; rc->delay_multiplier = 1.0; + rc->old_status = RRDCALC_STATUS_UNINITIALIZED; + rc->warn_repeat_every = host->health_default_warn_repeat_every; + rc->crit_repeat_every = host->health_default_crit_repeat_every; if(rrdvar_fix_name(rc->name)) error("Health configuration renamed alarm '%s' to '%s'", value, rc->name); @@ -556,6 +568,8 @@ static int health_readfile(const char *filename, void *data) { rt->green = NAN; rt->red = NAN; rt->delay_multiplier = 1.0; + rt->warn_repeat_every = 
host->health_default_warn_repeat_every; + rt->crit_repeat_every = host->health_default_crit_repeat_every; if(rrdvar_fix_name(rt->name)) error("Health configuration renamed template '%s' to '%s'", value, rt->name); @@ -612,7 +626,7 @@ static int health_readfile(const char *filename, void *data) { &rc->options, &rc->dimensions); } else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) { - if(!health_parse_duration(value, &rc->update_every)) + if(!config_parse_duration(value, &rc->update_every)) error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.", line, filename, rc->name, key, value); } @@ -707,6 +721,11 @@ static int health_readfile(const char *filename, void *data) { else if(hash == hash_options && !strcasecmp(key, HEALTH_OPTIONS_KEY)) { rc->options |= health_parse_options(value); } + else if(hash == hash_repeat && !strcasecmp(key, HEALTH_REPEAT_KEY)){ + health_parse_repeat(line, filename, value, + &rc->warn_repeat_every, + &rc->crit_repeat_every); + } else { error("Health configuration at line %zu of file '%s' for alarm '%s' has unknown key '%s'.", line, filename, rc->name, key); @@ -736,7 +755,7 @@ static int health_readfile(const char *filename, void *data) { &rt->update_every, &rt->options, &rt->dimensions); } else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) { - if(!health_parse_duration(value, &rt->update_every)) + if(!config_parse_duration(value, &rt->update_every)) error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' cannot parse duration: '%s'.", line, filename, rt->name, key, value); } @@ -831,6 +850,11 @@ static int health_readfile(const char *filename, void *data) { else if(hash == hash_options && !strcasecmp(key, HEALTH_OPTIONS_KEY)) { rt->options |= health_parse_options(value); } + else if(hash == hash_repeat && !strcasecmp(key, HEALTH_REPEAT_KEY)){ + health_parse_repeat(line, filename, value, + &rt->warn_repeat_every, + 
&rt->crit_repeat_every); + } else { error("Health configuration at line %zu of file '%s' for template '%s' has unknown key '%s'.", line, filename, rt->name, key); diff --git a/health/health_json.c b/health/health_json.c index 78113244..e923b05c 100644 --- a/health/health_json.c +++ b/health/health_json.c @@ -140,6 +140,8 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC "\t\t\t\"delay_multiplier\": %f,\n" "\t\t\t\"delay\": %d,\n" "\t\t\t\"delay_up_to_timestamp\": %lu,\n" + "\t\t\t\"warn_repeat_every\": \"%u\",\n" + "\t\t\t\"crit_repeat_every\": \"%u\",\n" "\t\t\t\"value_string\": \"%s\",\n" , rc->chart, rc->name , (unsigned long)rc->id @@ -165,6 +167,8 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC , rc->delay_multiplier , rc->delay_last , (unsigned long)rc->delay_up_to_timestamp + , rc->warn_repeat_every + , rc->crit_repeat_every , value_string ); diff --git a/health/health_log.c b/health/health_log.c index 009e4267..c91cde6c 100644 --- a/health/health_log.c +++ b/health/health_log.c @@ -79,6 +79,7 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) { "\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" "\t%d\t%d\t%d\t%d" "\t" CALCULATED_NUMBER_FORMAT_AUTO "\t" CALCULATED_NUMBER_FORMAT_AUTO + "\t%016lx" "\n" , (ae->flags & HEALTH_ENTRY_FLAG_SAVED)?'U':'A' , host->hostname @@ -112,6 +113,7 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) { , ae->new_value , ae->old_value + , (uint64_t)ae->last_repeat ) < 0)) error("HEALTH [%s]: failed to save alarm log entry to '%s'. 
Health data may be lost in case of abnormal restart.", host->hostname, host->health_log_filename); else { @@ -174,10 +176,40 @@ inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filena continue; } + // Check if we got last_repeat field + time_t last_repeat = 0; + if(entries > 27) { + char* alarm_name = pointers[13]; + last_repeat = (time_t)strtoul(pointers[27], NULL, 16); + + RRDCALC *rc = alarm_max_last_repeat(host, alarm_name,simple_hash(alarm_name)); + if (!rc) { + for(rc = host->alarms; rc ; rc = rc->next) { + RRDCALC *rdcmp = (RRDCALC *) avl_insert_lock(&(host)->alarms_idx_name, (avl *)rc); + if(rdcmp != rc) { + error("Cannot insert the alarm index ID using log %s", rc->name); + } + } + + rc = alarm_max_last_repeat(host, alarm_name,simple_hash(alarm_name)); + } + + if(unlikely(rc)) { + if (rrdcalc_isrepeating(rc)) { + rc->last_repeat = last_repeat; + // We iterate through repeating alarm entries only to + // find the latest last_repeat timestamp. Otherwise, + // there is no need to keep them in memory. + continue; + } + } + } + if(unlikely(*pointers[0] == 'A')) { // make sure it is properly numbered if(unlikely(host->health_log.alarms && unique_id < host->health_log.alarms->unique_id)) { - error("HEALTH [%s]: line %zu of file '%s' has alarm log entry %u in wrong order. Ignoring it.", host->hostname, line, filename, unique_id); + error( "HEALTH [%s]: line %zu of file '%s' has alarm log entry %u in wrong order. Ignoring it." + , host->hostname, line, filename, unique_id); errored++; continue; } @@ -186,11 +218,11 @@ inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filena } else if(unlikely(*pointers[0] == 'U')) { // find the original - for(ae = host->health_log.alarms; ae; ae = ae->next) { + for(ae = host->health_log.alarms; ae ; ae = ae->next) { if(unlikely(unique_id == ae->unique_id)) { if(unlikely(*pointers[0] == 'A')) { error("HEALTH [%s]: line %zu of file '%s' adds duplicate alarm log entry %u. 
Using the later." - , host->hostname, line, filename, unique_id); + , host->hostname, line, filename, unique_id); *pointers[0] = 'U'; duplicate++; } @@ -270,6 +302,8 @@ inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filena ae->new_value = str2l(pointers[25]); ae->old_value = str2l(pointers[26]); + ae->last_repeat = last_repeat; + char value_string[100 + 1]; freez(ae->old_value_string); freez(ae->new_value_string); @@ -339,7 +373,7 @@ inline void health_alarm_log_load(RRDHOST *host) { // ---------------------------------------------------------------------------- // health alarm log management -inline void health_alarm_log( +inline ALARM_ENTRY* health_create_alarm_entry( RRDHOST *host, uint32_t alarm_id, uint32_t alarm_event_id, @@ -398,9 +432,24 @@ inline void health_alarm_log( ae->delay_up_to_timestamp = when + delay; ae->flags |= flags; + ae->last_repeat = 0; + if(ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL) ae->non_clear_duration += ae->duration; + return ae; +} + +inline void health_alarm_log( + RRDHOST *host, + ALARM_ENTRY *ae +) { + debug(D_HEALTH, "Health adding alarm log entry with id: %u", ae->unique_id); + + if(unlikely(alarm_entry_isrepeating(host, ae))) { + error("Repeating alarms cannot be added to host's alarm log entries. It seems somewhere in the logic, API is being misused. Alarm id: %u", ae->alarm_id); + return; + } // link it netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock); ae->next = host->health_log.alarms; diff --git a/health/notifications/README.md b/health/notifications/README.md index 5b7b4340..8c7ab66f 100644 --- a/health/notifications/README.md +++ b/health/notifications/README.md @@ -58,6 +58,9 @@ export NETDATA_ALARM_NOTIFY_DEBUG=1 # send test alarms to any role /usr/libexec/netdata/plugins.d/alarm-notify.sh test "ROLE" ``` + +Note that in versions before 1.16, the plugins.d directory may be installed in a different location in certain OSs (e.g. 
under `/usr/lib/netdata`). You can always find the location of the alarm-notify.sh script in `netdata.conf`. + If you need to dig even deeper, you can trace the execution with `bash -x`. Note that in test mode, alarm-notify.sh calls itself with many more arguments. So first do ```sh bash -x /usr/libexec/netdata/plugins.d/alarm-notify.sh test diff --git a/health/notifications/alarm-notify.sh.in b/health/notifications/alarm-notify.sh.in index ff4b3f3d..852718bc 100755 --- a/health/notifications/alarm-notify.sh.in +++ b/health/notifications/alarm-notify.sh.in @@ -189,6 +189,7 @@ fi [ -z "${NETDATA_STOCK_CONFIG_DIR}" ] && NETDATA_STOCK_CONFIG_DIR="@libconfigdir_POST@" [ -z "${NETDATA_CACHE_DIR}" ] && NETDATA_CACHE_DIR="@cachedir_POST@" [ -z "${NETDATA_REGISTRY_URL}" ] && NETDATA_REGISTRY_URL="https://registry.my-netdata.io" +[ -z "${NETDATA_REGISTRY_CLOUD_BASE_URL}" ] && NETDATA_REGISTRY_CLOUD_BASE_URL="https://netdata.cloud" # ----------------------------------------------------------------------------- # parse command line parameters @@ -681,7 +682,7 @@ date=$(date --date=@${when} "${date_format}" 2>/dev/null) # ---------------------------------------------------------------------------- # prepare some extra headers if we've been asked to thread e-mails if [ "${SEND_EMAIL}" == "YES" ] && [ "${EMAIL_THREADING}" != "NO" ]; then - email_thread_headers="In-Reply-To: <${chart}-${name}@${host}>\\nReferences: <${chart}-${name}@${host}>" + email_thread_headers="In-Reply-To: <${chart}-${name}@${host}>\\r\\nReferences: <${chart}-${name}@${host}>" else email_thread_headers= fi @@ -1790,7 +1791,7 @@ if [ "${NETDATA_REGISTRY_URL}" == "https://registry.my-netdata.io" ]; then NETDATA_REGISTRY_UNIQUE_ID="$(cat "@registrydir_POST@/netdata.public.unique.id")" fi fi - if [ ! 
-z "${NETDATA_REGISTRY_UNIQUE_ID}" ]; then + if [ -n "${NETDATA_REGISTRY_UNIQUE_ID}" ]; then GOTOCLOUD=1 fi fi @@ -1798,7 +1799,7 @@ fi if [ ${GOTOCLOUD} -eq 0 ]; then goto_url="${NETDATA_REGISTRY_URL}/goto-host-from-alarm.html?${redirect_params}" else - goto_url="https://netdata.cloud/alarms/redirect?agentID=${NETDATA_REGISTRY_UNIQUE_ID}&${redirect_params}" + goto_url="${NETDATA_REGISTRY_CLOUD_BASE_URL}/alarms/redirect?agentID=${NETDATA_REGISTRY_UNIQUE_ID}&${redirect_params}" fi # the severity of the alarm @@ -1953,7 +1954,7 @@ send_pushbullet "${PUSHBULLET_ACCESS_TOKEN}" "${PUSHBULLET_SOURCE_DEVICE}" "${to Severity: ${severity}\\n Chart: ${chart}\\n Family: ${family}\\n -$(date -d @${when})\\n +${date}\\n The source of this alarm is line ${src}" SENT_PUSHBULLET=$? diff --git a/health/notifications/custom/README.md b/health/notifications/custom/README.md index 627dd9d4..eeaad8a6 100644 --- a/health/notifications/custom/README.md +++ b/health/notifications/custom/README.md @@ -1,11 +1,13 @@ # Custom -Netdata allows you to send custom notifications, to any endpoint you choose. -To configure custom notifications, you will need to define the `custom_sender()` function in `health_alarm_notify.conf` -You can look at the other senders in `/usr/libexec/netdata/plugins.d/alarm-notify.sh` for examples. +Netdata allows you to send custom notifications to any endpoint you choose. + +To configure custom notifications, you will need to customize `health_alarm_notify.conf`. You can look at the other senders in `/usr/libexec/netdata/plugins.d/alarm-notify.sh` for examples of how to modify the `custom_sender()` function in `health_alarm_notify.conf`. Ensure you follow the instructions of changing any configuration file to [persist your configuration](../../../docs/configuration-guide.md#persist-my-configuration). + As with other notifications, you will also need to define the recipient list in `DEFAULT_RECIPIENT_CUSTOM` and/or the `role_recipients_custom` array. 
-The following is a sample `custom_sender` function to send an SMS via an imaginary HTTPS endpoint to the SMS gateway: +The following is a sample `custom_sender` function in `health_alarm_notify.conf`, to send an SMS via an imaginary HTTPS endpoint to the SMS gateway: + ``` custom_sender() { # example human readable SMS @@ -37,45 +39,45 @@ The following is a sample `custom_sender` function to send an SMS via an imagina Variables available to the custom_sender: - - ${to_custom} the list of recipients for the alarm - - ${host} the host generated this event - - ${url_host} same as ${host} but URL encoded - - ${unique_id} the unique id of this event - - ${alarm_id} the unique id of the alarm that generated this event - - ${event_id} the incremental id of the event, for this alarm id - - ${when} the timestamp this event occurred - - ${name} the name of the alarm, as given in netdata health.d entries - - ${url_name} same as ${name} but URL encoded - - ${chart} the name of the chart (type.id) - - ${url_chart} same as ${chart} but URL encoded - - ${family} the family of the chart - - ${url_family} same as ${family} but URL encoded - - ${status} the current status : REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL - - ${old_status} the previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL - - ${value} the current value of the alarm - - ${old_value} the previous value of the alarm - - ${src} the line number and file the alarm has been configured - - ${duration} the duration in seconds of the previous alarm state - - ${duration_txt} same as ${duration} for humans - - ${non_clear_duration} the total duration in seconds this is/was non-clear - - ${non_clear_duration_txt} same as ${non_clear_duration} for humans - - ${units} the units of the value - - ${info} a short description of the alarm - - ${value_string} friendly value (with units) - - ${old_value_string} friendly old value (with units) - - ${image} the URL of an image to represent the 
status of the alarm - - ${color} a color in #AABBCC format for the alarm - - ${goto_url} the URL the user can click to see the netdata dashboard - - ${calc_expression} the expression evaluated to provide the value for the alarm - - ${calc_param_values} the value of the variables in the evaluated expression - - ${total_warnings} the total number of alarms in WARNING state on the host - - ${total_critical} the total number of alarms in CRITICAL state on the host + - `${to_custom}` the list of recipients for the alarm + - `${host}` the host generated this event + - `${url_host}` same as `${host}` but URL encoded + - `${unique_id}` the unique id of this event + - `${alarm_id}` the unique id of the alarm that generated this event + - `${event_id}` the incremental id of the event, for this alarm id + - `${when}` the timestamp this event occurred + - `${name}` the name of the alarm, as given in netdata health.d entries + - `${url_name}` same as `${name}` but URL encoded + - `${chart}` the name of the chart (type.id) + - `${url_chart}` same as `${chart}` but URL encoded + - `${family}` the family of the chart + - `${url_family}` same as `${family}` but URL encoded + - `${status}` the current status : REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL + - `${old_status}` the previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL + - `${value}` the current value of the alarm + - `${old_value}` the previous value of the alarm + - `${src}` the line number and file the alarm has been configured + - `${duration}` the duration in seconds of the previous alarm state + - `${duration_txt}` same as `${duration}` for humans + - `${non_clear_duration}` the total duration in seconds this is/was non-clear + - `${non_clear_duration_txt}` same as `${non_clear_duration}` for humans + - `${units}` the units of the value + - `${info}` a short description of the alarm + - `${value_string}` friendly value (with units) + - `${old_value_string}` friendly old value 
(with units) + - `${image}` the URL of an image to represent the status of the alarm + - `${color}` a color in #AABBCC format for the alarm + - `${goto_url}` the URL the user can click to see the netdata dashboard + - `${calc_expression}` the expression evaluated to provide the value for the alarm + - `${calc_param_values}` the value of the variables in the evaluated expression + - `${total_warnings}` the total number of alarms in WARNING state on the host + - `${total_critical}` the total number of alarms in CRITICAL state on the host The following are more human friendly: - - ${alarm} like "name = value units" - - ${status_message} like "needs attention", "recovered", "is critical" - - ${severity} like "Escalated to CRITICAL", "Recovered from WARNING" - - ${raised_for} like "(alarm was raised for 10 minutes)" + - `${alarm}` like "name = value units" + - `${status_message}` like "needs attention", "recovered", "is critical" + - `${severity}` like "Escalated to CRITICAL", "Recovered from WARNING" + - `${raised_for}` like "(alarm was raised for 10 minutes)" [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fhealth%2Fnotifications%2Fcustom%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() diff --git a/health/notifications/email/README.md b/health/notifications/email/README.md index 163839b6..84a9e0ce 100644 --- a/health/notifications/email/README.md +++ b/health/notifications/email/README.md @@ -30,4 +30,6 @@ sudo su -s /bin/bash netdata Where `[ROLE]` is the role you want to test. The default (if you don't give a `[ROLE]`) is `sysadmin`. +Note that in versions before 1.16, the plugins.d directory may be installed in a different location in certain OSs (e.g. under `/usr/lib/netdata`). You can always find the location of the alarm-notify.sh script in `netdata.conf`. 
+ [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fhealth%2Fnotifications%2Femail%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() diff --git a/libnetdata/Makefile.am b/libnetdata/Makefile.am index d2710f0a..87f12b32 100644 --- a/libnetdata/Makefile.am +++ b/libnetdata/Makefile.am @@ -11,6 +11,8 @@ SUBDIRS = \ config \ dictionary \ eval \ + json \ + health \ locks \ log \ popen \ diff --git a/libnetdata/config/appconfig.c b/libnetdata/config/appconfig.c index 9e6a0c02..65c36c28 100644 --- a/libnetdata/config/appconfig.c +++ b/libnetdata/config/appconfig.c @@ -411,6 +411,27 @@ int appconfig_set_boolean(struct config *root, const char *section, const char * return value; } +int appconfig_get_duration(struct config *root, const char *section, const char *name, const char *value) +{ + int result = 0; + const char *s; + + s = appconfig_get(root, section, name, value); + if(!s) goto fallback; + + if(!config_parse_duration(s, &result)) { + error("config option '[%s].%s = %s' is configured with an valid duration", section, name, s); + goto fallback; + } + + return result; + + fallback: + if(!config_parse_duration(value, &result)) + error("INTERNAL ERROR: default duration supplied for option '[%s].%s = %s' is not a valid duration", section, name, value); + + return result; +} // ---------------------------------------------------------------------------- // config load/save @@ -586,3 +607,65 @@ void appconfig_generate(struct config *root, BUFFER *wb, int only_changed) appconfig_unlock(root); } } + +/** + * Parse Duration + * + * Parse the string setting the result + * + * @param string the timestamp string + * @param result the output variable + * + * @return It returns 1 on success and 0 otherwise + */ +int config_parse_duration(const char* string, int* result) { + while(*string && isspace(*string)) string++; + + 
if(unlikely(!*string)) goto fallback; + + if(*string == 'n' && !strcmp(string, "never")) { + // this is a valid option + *result = 0; + return 1; + } + + // make sure it is a number + if(!(isdigit(*string) || *string == '+' || *string == '-')) goto fallback; + + char *e = NULL; + calculated_number n = str2ld(string, &e); + if(e && *e) { + switch (*e) { + case 'Y': + *result = (int) (n * 31536000); + break; + case 'M': + *result = (int) (n * 2592000); + break; + case 'w': + *result = (int) (n * 604800); + break; + case 'd': + *result = (int) (n * 86400); + break; + case 'h': + *result = (int) (n * 3600); + break; + case 'm': + *result = (int) (n * 60); + break; + case 's': + default: + *result = (int) (n); + break; + } + } + else + *result = (int)(n); + + return 1; + + fallback: + *result = 0; + return 0; +} diff --git a/libnetdata/config/appconfig.h b/libnetdata/config/appconfig.h index 78099aad..32e289f9 100644 --- a/libnetdata/config/appconfig.h +++ b/libnetdata/config/appconfig.h @@ -119,6 +119,7 @@ extern long long appconfig_get_number(struct config *root, const char *section, extern LONG_DOUBLE appconfig_get_float(struct config *root, const char *section, const char *name, LONG_DOUBLE value); extern int appconfig_get_boolean(struct config *root, const char *section, const char *name, int value); extern int appconfig_get_boolean_ondemand(struct config *root, const char *section, const char *name, int value); +extern int appconfig_get_duration(struct config *root, const char *section, const char *name, const char *value); extern const char *appconfig_set(struct config *root, const char *section, const char *name, const char *value); extern const char *appconfig_set_default(struct config *root, const char *section, const char *name, const char *value); @@ -133,4 +134,6 @@ extern void appconfig_generate(struct config *root, BUFFER *wb, int only_changed extern int appconfig_section_compare(void *a, void *b); +extern int config_parse_duration(const char* string, 
int* result); + #endif /* NETDATA_CONFIG_H */ diff --git a/libnetdata/health/Makefile.am b/libnetdata/health/Makefile.am new file mode 100644 index 00000000..9b7995f1 --- /dev/null +++ b/libnetdata/health/Makefile.am @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-3.0-or-later + +AUTOMAKE_OPTIONS = subdir-objects +MAINTAINERCLEANFILES = $(srcdir)/Makefile.in + + +dist_noinst_DATA = \ + $(NULL) diff --git a/libnetdata/health/health.c b/libnetdata/health/health.c new file mode 100644 index 00000000..b93de8b9 --- /dev/null +++ b/libnetdata/health/health.c @@ -0,0 +1,170 @@ +#include "health.h" + +/** + * Create Silencer + * + * Allocate a new silencer to Netdata. + * + * @return It returns the address off the silencer on success and NULL otherwise + */ +SILENCER *create_silencer(void) { + SILENCER *t = callocz(1, sizeof(SILENCER)); + debug(D_HEALTH, "HEALTH command API: Created empty silencer"); + + return t; +} + +/** + * Health Silencers add + * + * Add more one silencer to the list of silenecers. 
+ * + * @param silencer + */ +void health_silencers_add(SILENCER *silencer) { + // Add the created instance to the linked list in silencers + silencer->next = silencers->silencers; + silencers->silencers = silencer; + debug(D_HEALTH, "HEALTH command API: Added silencer %s:%s:%s:%s:%s", silencer->alarms, + silencer->charts, silencer->contexts, silencer->hosts, silencer->families + ); +} + +/** + * Silencers Add Parameter + * + * Create a new silencer and adjust the variables + * + * @param silencer a pointer to the silencer that will be adjusted + * @param key the key value sent by client + * @param value the value sent to the key + * + * @return It returns the silencer configured on success and NULL otherwise + */ +SILENCER *health_silencers_addparam(SILENCER *silencer, char *key, char *value) { + static uint32_t + hash_alarm = 0, + hash_template = 0, + hash_chart = 0, + hash_context = 0, + hash_host = 0, + hash_families = 0; + + if (unlikely(!hash_alarm)) { + hash_alarm = simple_uhash(HEALTH_ALARM_KEY); + hash_template = simple_uhash(HEALTH_TEMPLATE_KEY); + hash_chart = simple_uhash(HEALTH_CHART_KEY); + hash_context = simple_uhash(HEALTH_CONTEXT_KEY); + hash_host = simple_uhash(HEALTH_HOST_KEY); + hash_families = simple_uhash(HEALTH_FAMILIES_KEY); + } + + uint32_t hash = simple_uhash(key); + if (unlikely(silencer == NULL)) { + if ( + (hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) || + (hash == hash_template && !strcasecmp(key, HEALTH_TEMPLATE_KEY)) || + (hash == hash_chart && !strcasecmp(key, HEALTH_CHART_KEY)) || + (hash == hash_context && !strcasecmp(key, HEALTH_CONTEXT_KEY)) || + (hash == hash_host && !strcasecmp(key, HEALTH_HOST_KEY)) || + (hash == hash_families && !strcasecmp(key, HEALTH_FAMILIES_KEY)) + ) { + silencer = create_silencer(); + if(!silencer) { + error("Cannot add a new silencer to Netdata"); + return NULL; + } + } + } + + if (hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) { + silencer->alarms = strdupz(value); + 
silencer->alarms_pattern = simple_pattern_create(silencer->alarms, NULL, SIMPLE_PATTERN_EXACT); + } else if (hash == hash_chart && !strcasecmp(key, HEALTH_CHART_KEY)) { + silencer->charts = strdupz(value); + silencer->charts_pattern = simple_pattern_create(silencer->charts, NULL, SIMPLE_PATTERN_EXACT); + } else if (hash == hash_context && !strcasecmp(key, HEALTH_CONTEXT_KEY)) { + silencer->contexts = strdupz(value); + silencer->contexts_pattern = simple_pattern_create(silencer->contexts, NULL, SIMPLE_PATTERN_EXACT); + } else if (hash == hash_host && !strcasecmp(key, HEALTH_HOST_KEY)) { + silencer->hosts = strdupz(value); + silencer->hosts_pattern = simple_pattern_create(silencer->hosts, NULL, SIMPLE_PATTERN_EXACT); + } else if (hash == hash_families && !strcasecmp(key, HEALTH_FAMILIES_KEY)) { + silencer->families = strdupz(value); + silencer->families_pattern = simple_pattern_create(silencer->families, NULL, SIMPLE_PATTERN_EXACT); + } + + return silencer; +} + +/** + * JSON Read Callback + * + * Callback called by netdata to create the silencer. + * + * @param e the main json structure + * + * @return It always return 0. 
+ */ +int health_silencers_json_read_callback(JSON_ENTRY *e) +{ + switch(e->type) { + case JSON_OBJECT: +#ifndef ENABLE_JSONC + e->callback_function = health_silencers_json_read_callback; + if(e->name && strcmp(e->name,"")) { + // init silencer + debug(D_HEALTH, "JSON: Got object with a name, initializing new silencer for %s",e->name); +#endif + e->callback_data = create_silencer(); + if(e->callback_data) { + health_silencers_add(e->callback_data); + } +#ifndef ENABLE_JSONC + } +#endif + break; + + case JSON_ARRAY: + e->callback_function = health_silencers_json_read_callback; + break; + + case JSON_STRING: + if(!strcmp(e->name,"type")) { + debug(D_HEALTH, "JSON: Processing type=%s",e->data.string); + if (!strcmp(e->data.string,"SILENCE")) silencers->stype = STYPE_SILENCE_NOTIFICATIONS; + else if (!strcmp(e->data.string,"DISABLE")) silencers->stype = STYPE_DISABLE_ALARMS; + } else { + debug(D_HEALTH, "JSON: Adding %s=%s", e->name, e->data.string); + health_silencers_addparam(e->callback_data, e->name, e->data.string); + } + break; + + case JSON_BOOLEAN: + debug(D_HEALTH, "JSON: Processing all_alarms"); + silencers->all_alarms=e->data.boolean?1:0; + break; + + case JSON_NUMBER: + case JSON_NULL: + break; + } + + return 0; +} + +/** + * Initialize Global Silencers + * + * Initialize the silencer for the whole netdata system. + * + * @return It returns 0 on success and -1 otherwise + */ +int health_initialize_global_silencers() { + silencers = mallocz(sizeof(SILENCERS)); + silencers->all_alarms=0; + silencers->stype=STYPE_NONE; + silencers->silencers=NULL; + + return 0; +}
\ No newline at end of file diff --git a/libnetdata/health/health.h b/libnetdata/health/health.h new file mode 100644 index 00000000..a3dc0775 --- /dev/null +++ b/libnetdata/health/health.h @@ -0,0 +1,55 @@ +#ifndef NETDATA_HEALTH_LIB +# define NETDATA_HEALTH_LIB 1 + +# include "../libnetdata.h" + +#define HEALTH_ALARM_KEY "alarm" +#define HEALTH_TEMPLATE_KEY "template" +#define HEALTH_CONTEXT_KEY "context" +#define HEALTH_CHART_KEY "chart" +#define HEALTH_HOST_KEY "hosts" +#define HEALTH_OS_KEY "os" +#define HEALTH_FAMILIES_KEY "families" +#define HEALTH_LOOKUP_KEY "lookup" +#define HEALTH_CALC_KEY "calc" + +typedef struct silencer { + char *alarms; + SIMPLE_PATTERN *alarms_pattern; + + char *hosts; + SIMPLE_PATTERN *hosts_pattern; + + char *contexts; + SIMPLE_PATTERN *contexts_pattern; + + char *charts; + SIMPLE_PATTERN *charts_pattern; + + char *families; + SIMPLE_PATTERN *families_pattern; + + struct silencer *next; +} SILENCER; + +typedef enum silence_type { + STYPE_NONE, + STYPE_DISABLE_ALARMS, + STYPE_SILENCE_NOTIFICATIONS +} SILENCE_TYPE; + +typedef struct silencers { + int all_alarms; + SILENCE_TYPE stype; + SILENCER *silencers; +} SILENCERS; + +SILENCERS *silencers; + +extern SILENCER *create_silencer(void); +extern int health_silencers_json_read_callback(JSON_ENTRY *e); +extern void health_silencers_add(SILENCER *silencer); +extern SILENCER * health_silencers_addparam(SILENCER *silencer, char *key, char *value); +extern int health_initialize_global_silencers(); + +#endif diff --git a/libnetdata/json/Makefile.am b/libnetdata/json/Makefile.am new file mode 100644 index 00000000..1cb69ed9 --- /dev/null +++ b/libnetdata/json/Makefile.am @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-3.0-or-later + +AUTOMAKE_OPTIONS = subdir-objects +MAINTAINERCLEANFILES = $(srcdir)/Makefile.in + + +dist_noinst_DATA = \ + README.md \ + $(NULL) diff --git a/libnetdata/json/README.md b/libnetdata/json/README.md new file mode 100644 index 00000000..fd6cb0f3 --- /dev/null +++ 
b/libnetdata/json/README.md @@ -0,0 +1,5 @@ +# json + +`json` contains a parser for json strings, based on `jsmn` (https://github.com/zserge/jsmn), but case you have installed the JSON-C library, the installation script will prefer it, you can also force its use with `--enable-jsonc` in the compilation time. + +[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Flibnetdata%2Fjson%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() diff --git a/libnetdata/json/jsmn.c b/libnetdata/json/jsmn.c new file mode 100644 index 00000000..c8d9e73d --- /dev/null +++ b/libnetdata/json/jsmn.c @@ -0,0 +1,326 @@ +#include <stdlib.h> + +#include "jsmn.h" + +/** + * Alloc token + * + * Allocates a fresh unused token from the token pull. + * + * @param parser the controller + * @param tokens the tokens I am working + * @param num_tokens the number total of tokens. + * + * @return it returns the next token to work. + */ +static jsmntok_t *jsmn_alloc_token(jsmn_parser *parser, + jsmntok_t *tokens, size_t num_tokens) { + jsmntok_t *tok; + if (parser->toknext >= num_tokens) { + return NULL; + } + tok = &tokens[parser->toknext++]; + tok->start = tok->end = -1; + tok->size = 0; +#ifdef JSMN_PARENT_LINKS + tok->parent = -1; +#endif + return tok; +} + +/** + * Fill Token + * + * Fills token type and boundaries. + * + * @param token the structure to set the values + * @param type is the token type + * @param start is the first position of the value + * @param end is the end of the value + */ +static void jsmn_fill_token(jsmntok_t *token, jsmntype_t type, + int start, int end) { + token->type = type; + token->start = start; + token->end = end; + token->size = 0; +} + +/** + * Parse primitive + * + * Fills next available token with JSON primitive. 
+ * + * @param parser is the control structure + * @param js is the json string + * @param type is the token type + */ +static jsmnerr_t jsmn_parse_primitive(jsmn_parser *parser, const char *js, + size_t len, jsmntok_t *tokens, size_t num_tokens) { + jsmntok_t *token; + int start; + + start = parser->pos; + + for (; parser->pos < len && js[parser->pos] != '\0'; parser->pos++) { + switch (js[parser->pos]) { +#ifndef JSMN_STRICT + /* In strict mode primitive must be followed by "," or "}" or "]" */ + case ':': +#endif + case '\t' : case '\r' : case '\n' : case ' ' : + case ',' : case ']' : case '}' : + goto found; + } + if (js[parser->pos] < 32 || js[parser->pos] >= 127) { + parser->pos = start; + return JSMN_ERROR_INVAL; + } + } +#ifdef JSMN_STRICT + /* In strict mode primitive must be followed by a comma/object/array */ + parser->pos = start; + return JSMN_ERROR_PART; +#endif + + found: + if (tokens == NULL) { + parser->pos--; + return 0; + } + token = jsmn_alloc_token(parser, tokens, num_tokens); + if (token == NULL) { + parser->pos = start; + return JSMN_ERROR_NOMEM; + } + jsmn_fill_token(token, JSMN_PRIMITIVE, start, parser->pos); +#ifdef JSMN_PARENT_LINKS + token->parent = parser->toksuper; +#endif + parser->pos--; + return 0; +} + +/** + * Parse string + * + * Fills next token with JSON string. + * + * @param parser is the control structure + * @param js is the json string + * @param len is the js length + * @param tokens is structure with the tokens mapped. 
+ * @param num_tokens is the total number of tokens + * + * @return It returns 0 on success and another integer otherwise + */ +static jsmnerr_t jsmn_parse_string(jsmn_parser *parser, const char *js, + size_t len, jsmntok_t *tokens, size_t num_tokens) { + jsmntok_t *token; + + int start = parser->pos; + + parser->pos++; + + /* Skip starting quote */ + for (; parser->pos < len && js[parser->pos] != '\0'; parser->pos++) { + char c = js[parser->pos]; + + /* Quote: end of string */ + if (c == '\"') { + if (tokens == NULL) { + return 0; + } + token = jsmn_alloc_token(parser, tokens, num_tokens); + if (token == NULL) { + parser->pos = start; + return JSMN_ERROR_NOMEM; + } + jsmn_fill_token(token, JSMN_STRING, start+1, parser->pos); +#ifdef JSMN_PARENT_LINKS + token->parent = parser->toksuper; +#endif + return 0; + } + + /* Backslash: Quoted symbol expected */ + if (c == '\\') { + parser->pos++; + switch (js[parser->pos]) { + /* Allowed escaped symbols */ + case '\"': case '/' : case '\\' : case 'b' : + case 'f' : case 'r' : case 'n' : case 't' : + break; + /* Allows escaped symbol \uXXXX */ + case 'u': + parser->pos++; + int i = 0; + for(; i < 4 && js[parser->pos] != '\0'; i++) { + /* If it isn't a hex character we have an error */ + if(!((js[parser->pos] >= 48 && js[parser->pos] <= 57) || /* 0-9 */ + (js[parser->pos] >= 65 && js[parser->pos] <= 70) || /* A-F */ + (js[parser->pos] >= 97 && js[parser->pos] <= 102))) { /* a-f */ + parser->pos = start; + return JSMN_ERROR_INVAL; + } + parser->pos++; + } + parser->pos--; + break; + /* Unexpected symbol */ + default: + parser->pos = start; + return JSMN_ERROR_INVAL; + } + } + } + parser->pos = start; + return JSMN_ERROR_PART; +} + +/** + * JSMN Parse + * + * Parse JSON string and fill tokens. 
+ * + * @param parser the auxiliar vector used to parser + * @param js the string to parse + * @param len the string length + * @param tokens the place to map the tokens + * @param num_tokens the number of tokens present in the tokens structure. + * + * @return It returns the number of tokens present in the string on success or a negative number otherwise + */ +jsmnerr_t jsmn_parse(jsmn_parser *parser, const char *js, size_t len, + jsmntok_t *tokens, unsigned int num_tokens) { + jsmnerr_t r; + int i; + jsmntok_t *token; + int count = 0; + + for (; parser->pos < len && js[parser->pos] != '\0'; parser->pos++) { + char c; + jsmntype_t type; + + c = js[parser->pos]; + switch (c) { + case '{': case '[': + count++; + if (tokens == NULL) { + break; + } + token = jsmn_alloc_token(parser, tokens, num_tokens); + if (token == NULL) + return JSMN_ERROR_NOMEM; + if (parser->toksuper != -1) { + tokens[parser->toksuper].size++; +#ifdef JSMN_PARENT_LINKS + token->parent = parser->toksuper; +#endif + } + token->type = (c == '{' ? JSMN_OBJECT : JSMN_ARRAY); + token->start = parser->pos; + parser->toksuper = parser->toknext - 1; + break; + case '}': case ']': + if (tokens == NULL) + break; + type = (c == '}' ? 
JSMN_OBJECT : JSMN_ARRAY); +#ifdef JSMN_PARENT_LINKS + if (parser->toknext < 1) { + return JSMN_ERROR_INVAL; + } + token = &tokens[parser->toknext - 1]; + for (;;) { + if (token->start != -1 && token->end == -1) { + if (token->type != type) { + return JSMN_ERROR_INVAL; + } + token->end = parser->pos + 1; + parser->toksuper = token->parent; + break; + } + if (token->parent == -1) { + break; + } + token = &tokens[token->parent]; + } +#else + for (i = parser->toknext - 1; i >= 0; i--) { + token = &tokens[i]; + if (token->start != -1 && token->end == -1) { + if (token->type != type) { + return JSMN_ERROR_INVAL; + } + parser->toksuper = -1; + token->end = parser->pos + 1; + break; + } + } + /* Error if unmatched closing bracket */ + if (i == -1) return JSMN_ERROR_INVAL; + for (; i >= 0; i--) { + token = &tokens[i]; + if (token->start != -1 && token->end == -1) { + parser->toksuper = i; + break; + } + } +#endif + break; + case '\"': + r = jsmn_parse_string(parser, js, len, tokens, num_tokens); + if (r < 0) return r; + count++; + if (parser->toksuper != -1 && tokens != NULL) + tokens[parser->toksuper].size++; + break; + case '\t' : case '\r' : case '\n' : case ':' : case ',': case ' ': + break; +#ifdef JSMN_STRICT + /* In strict mode primitives are: numbers and booleans */ + case '-': case '0': case '1' : case '2': case '3' : case '4': + case '5': case '6': case '7' : case '8': case '9': + case 't': case 'f': case 'n' : +#else + /* In non-strict mode every unquoted value is a primitive */ + default: +#endif + r = jsmn_parse_primitive(parser, js, len, tokens, num_tokens); + if (r < 0) return r; + count++; + if (parser->toksuper != -1 && tokens != NULL) + tokens[parser->toksuper].size++; + break; + +#ifdef JSMN_STRICT + /* Unexpected char in strict mode */ + default: + return JSMN_ERROR_INVAL; +#endif + } + } + + for (i = parser->toknext - 1; i >= 0; i--) { + /* Unmatched opened object or array */ + if (tokens[i].start != -1 && tokens[i].end == -1) { + return 
JSMN_ERROR_PART; + } + } + + return count; +} + +/** + * JSMN Init + * + * Creates a new parser based over a given buffer with an array of tokens + * available. + * + * @param parser is the structure with values to reset + */ +void jsmn_init(jsmn_parser *parser) { + parser->pos = 0; + parser->toknext = 0; + parser->toksuper = -1; +}
\ No newline at end of file diff --git a/libnetdata/json/jsmn.h b/libnetdata/json/jsmn.h new file mode 100644 index 00000000..beff586c --- /dev/null +++ b/libnetdata/json/jsmn.h @@ -0,0 +1,75 @@ +#ifndef __JSMN_H_ +#define __JSMN_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stddef.h> +/** + * JSON type identifier. Basic types are: + * o Object + * o Array + * o String + * o Other primitive: number, boolean (true/false) or null + */ +typedef enum { + JSMN_PRIMITIVE = 0, + JSMN_OBJECT = 1, + JSMN_ARRAY = 2, + JSMN_STRING = 3 +} jsmntype_t; + +typedef enum { + /* Not enough tokens were provided */ + JSMN_ERROR_NOMEM = -1, + /* Invalid character inside JSON string */ + JSMN_ERROR_INVAL = -2, + /* The string is not a full JSON packet, more bytes expected */ + JSMN_ERROR_PART = -3, +} jsmnerr_t; + +/** + * JSON token description. + * + * @param type type (object, array, string etc.) + * @param start start position in JSON data string + * @param end end position in JSON data string + */ +typedef struct { + jsmntype_t type; + int start; + int end; + int size; +#ifdef JSMN_PARENT_LINKS + int parent; +#endif +} jsmntok_t; + +/** + * JSON parser. Contains an array of token blocks available. Also stores + * the string being parsed now and current position in that string + */ +typedef struct { + unsigned int pos; /* offset in the JSON string */ + unsigned int toknext; /* next token to allocate */ + int toksuper; /* superior token node, e.g parent object or array */ +} jsmn_parser; + +/** + * Create JSON parser over an array of tokens + */ +void jsmn_init(jsmn_parser *parser); + +/** + * Run JSON parser. It parses a JSON data string into and array of tokens, each describing + * a single JSON object. + */ +jsmnerr_t jsmn_parse(jsmn_parser *parser, const char *js, size_t len, + jsmntok_t *tokens, unsigned int num_tokens); + +#ifdef __cplusplus +} +#endif + +#endif /* __JSMN_H_ */
\ No newline at end of file diff --git a/libnetdata/json/json.c b/libnetdata/json/json.c new file mode 100644 index 00000000..c9ff39b0 --- /dev/null +++ b/libnetdata/json/json.c @@ -0,0 +1,546 @@ +#include "jsmn.h" +#include "../libnetdata.h" +#include "json.h" +#include "libnetdata/libnetdata.h" +#include "../../health/health.h" + +#define JSON_TOKENS 1024 + +int json_tokens = JSON_TOKENS; + +/** + * Json Tokenise + * + * Map the string given inside tokens. + * + * @param js is the string used to create the tokens + * @param len is the string length + * @param count the number of tokens present in the string + * + * @return it returns the json parsed in tokens + */ +#ifdef ENABLE_JSONC +json_object *json_tokenise(char *js) { + if(!js) { + error("JSON: json string is empty."); + return NULL; + } + + json_object *token = json_tokener_parse(js); + if(!token) { + error("JSON: Invalid json string."); + return NULL; + } + + return token; +} +#else +jsmntok_t *json_tokenise(char *js, size_t len, size_t *count) +{ + int n = json_tokens; + if(!js || !len) { + error("JSON: json string is empty."); + return NULL; + } + + jsmn_parser parser; + jsmn_init(&parser); + + jsmntok_t *tokens = mallocz(sizeof(jsmntok_t) * n); + if(!tokens) return NULL; + + int ret = jsmn_parse(&parser, js, len, tokens, n); + while (ret == JSMN_ERROR_NOMEM) { + n *= 2; + jsmntok_t *new = reallocz(tokens, sizeof(jsmntok_t) * n); + if(!new) { + freez(tokens); + return NULL; + } + tokens = new; + ret = jsmn_parse(&parser, js, len, tokens, n); + } + + if (ret == JSMN_ERROR_INVAL) { + error("JSON: Invalid json string."); + freez(tokens); + return NULL; + } + else if (ret == JSMN_ERROR_PART) { + error("JSON: Truncated JSON string."); + freez(tokens); + return NULL; + } + + if(count) *count = (size_t)ret; + + if(json_tokens < n) json_tokens = n; + return tokens; +} +#endif + +/** + * Callback Print + * + * Set callback print case necesary and wrinte an information inside a buffer to write in the log. 
+ * + * @param e a pointer for a structure that has the complete information about json structure. + * + * @return It always return 0 + */ +int json_callback_print(JSON_ENTRY *e) +{ + BUFFER *wb=buffer_create(300); + + buffer_sprintf(wb,"%s = ", e->name); + char txt[50]; + switch(e->type) { + case JSON_OBJECT: + e->callback_function = json_callback_print; + buffer_strcat(wb,"OBJECT"); + break; + + case JSON_ARRAY: + e->callback_function = json_callback_print; + sprintf(txt,"ARRAY[%lu]", e->data.items); + buffer_strcat(wb, txt); + break; + + case JSON_STRING: + buffer_strcat(wb, e->data.string); + break; + + case JSON_NUMBER: + sprintf(txt,"%Lf", e->data.number); + buffer_strcat(wb,txt); + + break; + + case JSON_BOOLEAN: + buffer_strcat(wb, e->data.boolean?"TRUE":"FALSE"); + break; + + case JSON_NULL: + buffer_strcat(wb,"NULL"); + break; + } + info("JSON: %s", buffer_tostring(wb)); + buffer_free(wb); + return 0; +} + +/** + * JSONC Set String + * + * Set the string value of the structure JSON_ENTRY. + * + * @param e the output structure + */ +static inline void json_jsonc_set_string(JSON_ENTRY *e,char *key,const char *value) { + size_t length = strlen(key); + e->type = JSON_STRING; + memcpy(e->name,key,length); + e->name[length] = 0x00; + e->data.string = (char *) value; +} + + +#ifdef ENABLE_JSONC +/** + * JSONC set Boolean + * + * Set the boolean value of the structure JSON_ENTRY + * + * @param e the output structure + * @param value the input value + */ +static inline void json_jsonc_set_boolean(JSON_ENTRY *e,int value) { + e->type = JSON_BOOLEAN; + e->data.boolean = value; +} + +/** + * Parse Array + * + * Parse the array object. + * + * @param ptr the pointer for the object that we will parse. + * @param callback_data additional data to be used together the callback function + * @param callback_function function used to create a silencer. 
+ */ +static inline void json_jsonc_parse_array(json_object *ptr, void *callback_data,int (*callback_function)(struct json_entry *)) { + int end = json_object_array_length(ptr); + JSON_ENTRY e; + + if(end) { + int i; + i = 0; + + enum json_type type; + do { + json_object *jvalue = json_object_array_get_idx(ptr, i); + if(jvalue) { + e.callback_data = callback_data; + e.type = JSON_OBJECT; + callback_function(&e); + json_object_object_foreach(jvalue, key, val) { + type = json_object_get_type(val); + if (type == json_type_array) { + e.type = JSON_ARRAY; + json_jsonc_parse_array(val, callback_data, callback_function); + } else if (type == json_type_object) { + json_walk(val,callback_data,callback_function); + } else if (type == json_type_string) { + json_jsonc_set_string(&e,key,json_object_get_string(val)); + callback_function(&e); + } else if (type == json_type_boolean) { + json_jsonc_set_boolean(&e,json_object_get_boolean(val)); + callback_function(&e); + } + } + } + + } while (++i < end); + } +} +#else + +/** + * Walk string + * + * Set JSON_ENTRY to string and map the values from jsmntok_t. + * + * @param js the original string + * @param t the tokens + * @param start the first position + * @param e the output structure. + * + * @return It always return 1 + */ +size_t json_walk_string(char *js, jsmntok_t *t, size_t start, JSON_ENTRY *e) +{ + char old = js[t[start].end]; + js[t[start].end] = '\0'; + e->original_string = &js[t[start].start]; + + e->type = JSON_STRING; + e->data.string = e->original_string; + if(e->callback_function) e->callback_function(e); + js[t[start].end] = old; + return 1; +} + +/** + * Walk Primitive + * + * Define the data type of the string + * + * @param js the original string + * @param t the tokens + * @param start the first position + * @param e the output structure. 
+ * + * @return It always return 1 + */ +size_t json_walk_primitive(char *js, jsmntok_t *t, size_t start, JSON_ENTRY *e) +{ + char old = js[t[start].end]; + js[t[start].end] = '\0'; + e->original_string = &js[t[start].start]; + + switch(e->original_string[0]) { + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': + case '8': case '9': case '-': case '.': + e->type = JSON_NUMBER; + e->data.number = strtold(e->original_string, NULL); + break; + + case 't': case 'T': + e->type = JSON_BOOLEAN; + e->data.boolean = 1; + break; + + case 'f': case 'F': + e->type = JSON_BOOLEAN; + e->data.boolean = 0; + break; + + case 'n': case 'N': + default: + e->type = JSON_NULL; + break; + } + if(e->callback_function) e->callback_function(e); + js[t[start].end] = old; + return 1; +} + +/** + * Array + * + * Measure the array length + * + * @param js the original string + * @param t the tokens + * @param nest the length of structure t + * @param start the first position + * @param e the output structure. 
+ * + * @return It returns the array length + */ +size_t json_walk_array(char *js, jsmntok_t *t, size_t nest, size_t start, JSON_ENTRY *e) +{ + JSON_ENTRY ne = { + .name = "", + .fullname = "", + .callback_data = NULL, + .callback_function = NULL + }; + + char old = js[t[start].end]; + js[t[start].end] = '\0'; + ne.original_string = &js[t[start].start]; + + memcpy(&ne, e, sizeof(JSON_ENTRY)); + ne.type = JSON_ARRAY; + ne.data.items = t[start].size; + ne.callback_function = NULL; + ne.name[0]='\0'; + ne.fullname[0]='\0'; + if(e->callback_function) e->callback_function(&ne); + js[t[start].end] = old; + + size_t i, init = start, size = t[start].size; + + start++; + for(i = 0; i < size ; i++) { + ne.pos = i; + if (!e->name || !e->fullname || strlen(e->name) > JSON_NAME_LEN - 24 || strlen(e->fullname) > JSON_FULLNAME_LEN -24) { + info("JSON: JSON walk_array ignoring element with name:%s fullname:%s",e->name, e->fullname); + continue; + } + sprintf(ne.name, "%s[%lu]", e->name, i); + sprintf(ne.fullname, "%s[%lu]", e->fullname, i); + + switch(t[start].type) { + case JSMN_PRIMITIVE: + start += json_walk_primitive(js, t, start, &ne); + break; + + case JSMN_OBJECT: + start += json_walk_object(js, t, nest + 1, start, &ne); + break; + + case JSMN_ARRAY: + start += json_walk_array(js, t, nest + 1, start, &ne); + break; + + case JSMN_STRING: + start += json_walk_string(js, t, start, &ne); + break; + } + } + return start - init; +} + +/** + * Object + * + * Measure the Object length + * + * @param js the original string + * @param t the tokens + * @param nest the length of structure t + * @param start the first position + * @param e the output structure. 
+ * + * @return It returns the Object length + */ +size_t json_walk_object(char *js, jsmntok_t *t, size_t nest, size_t start, JSON_ENTRY *e) +{ + JSON_ENTRY ne = { + .name = "", + .fullname = "", + .callback_data = NULL, + .callback_function = NULL + }; + + char old = js[t[start].end]; + js[t[start].end] = '\0'; + ne.original_string = &js[t[start].start]; + memcpy(&ne, e, sizeof(JSON_ENTRY)); + ne.type = JSON_OBJECT; + ne.callback_function = NULL; + if(e->callback_function) e->callback_function(&ne); + js[t[start].end] = old; + + int key = 1; + size_t i, init = start, size = t[start].size; + + start++; + for(i = 0; i < size ; i++) { + switch(t[start].type) { + case JSMN_PRIMITIVE: + start += json_walk_primitive(js, t, start, &ne); + key = 1; + break; + + case JSMN_OBJECT: + start += json_walk_object(js, t, nest + 1, start, &ne); + key = 1; + break; + + case JSMN_ARRAY: + start += json_walk_array(js, t, nest + 1, start, &ne); + key = 1; + break; + + case JSMN_STRING: + default: + if(key) { + int len = t[start].end - t[start].start; + if (unlikely(len>JSON_NAME_LEN)) len=JSON_NAME_LEN; + strncpy(ne.name, &js[t[start].start], len); + ne.name[len] = '\0'; + len=strlen(e->fullname) + strlen(e->fullname[0]?".":"") + strlen(ne.name); + char *c = mallocz((len+1)*sizeof(char)); + sprintf(c,"%s%s%s", e->fullname, e->fullname[0]?".":"", ne.name); + if (unlikely(len>JSON_FULLNAME_LEN)) len=JSON_FULLNAME_LEN; + strncpy(ne.fullname, c, len); + freez(c); + start++; + key = 0; + } + else { + start += json_walk_string(js, t, start, &ne); + key = 1; + } + break; + } + } + return start - init; +} +#endif + +/** + * Tree + * + * Call the correct walk function according its type. + * + * @param t the json object to work + * @param callback_data additional data to be used together the callback function + * @param callback_function function used to create a silencer. 
+ * + * @return It always return 1 + */ +#ifdef ENABLE_JSONC +size_t json_walk(json_object *t, void *callback_data, int (*callback_function)(struct json_entry *)) { + JSON_ENTRY e; + + e.callback_data = callback_data; + enum json_type type; + json_object_object_foreach(t, key, val) { + type = json_object_get_type(val); + if (type == json_type_array) { + e.type = JSON_ARRAY; + json_jsonc_parse_array(val,NULL,health_silencers_json_read_callback); + } else if (type == json_type_object) { + e.type = JSON_OBJECT; + } else if (type == json_type_string) { + json_jsonc_set_string(&e,key,json_object_get_string(val)); + callback_function(&e); + } else if (type == json_type_boolean) { + json_jsonc_set_boolean(&e,json_object_get_boolean(val)); + callback_function(&e); + } + } + + return 1; +} +#else +/** + * Tree + * + * Call the correct walk function according its type. + * + * @param js the original string + * @param t the tokens + * @param callback_data additional data to be used together the callback function + * @param callback_function function used to create a silencer. + * + * @return It always return 1 + */ +size_t json_walk_tree(char *js, jsmntok_t *t, void *callback_data, int (*callback_function)(struct json_entry *)) +{ + JSON_ENTRY e = { + .name = "", + .fullname = "", + .callback_data = callback_data, + .callback_function = callback_function + }; + + switch (t[0].type) { + case JSMN_OBJECT: + e.type = JSON_OBJECT; + json_walk_object(js, t, 0, 0, &e); + break; + + case JSMN_ARRAY: + e.type = JSON_ARRAY; + json_walk_array(js, t, 0, 0, &e); + break; + + case JSMN_PRIMITIVE: + case JSMN_STRING: + break; + } + + return 1; +} +#endif + +/** + * JSON Parse + * + * Parse the json message with the callback function + * + * @param js the string that the callback function will parse + * @param callback_data additional data to be used together the callback function + * @param callback_function function used to create a silencer. 
+ * + * @return JSON_OK case everything happend as expected, JSON_CANNOT_PARSE case there were errors in the + * parsing procces and JSON_CANNOT_DOWNLOAD case the string given(js) is NULL. + */ +int json_parse(char *js, void *callback_data, int (*callback_function)(JSON_ENTRY *)) +{ + if(js) { +#ifdef ENABLE_JSONC + json_object *tokens = json_tokenise(js); +#else + size_t count; + jsmntok_t *tokens = json_tokenise(js, strlen(js), &count); +#endif + + if(tokens) { +#ifdef ENABLE_JSONC + json_walk(tokens, callback_data, callback_function); + json_object_put(tokens); +#else + json_walk_tree(js, tokens, callback_data, callback_function); + freez(tokens); +#endif + return JSON_OK; + } + + return JSON_CANNOT_PARSE; + } + + return JSON_CANNOT_DOWNLOAD; +} + +/* +int json_test(char *str) +{ + return json_parse(str, NULL, json_callback_print); +} + */
\ No newline at end of file diff --git a/libnetdata/json/json.h b/libnetdata/json/json.h new file mode 100644 index 00000000..79b58b17 --- /dev/null +++ b/libnetdata/json/json.h @@ -0,0 +1,72 @@ +#ifndef CHECKIN_JSON_H +#define CHECKIN_JSON_H 1 + + +#if ENABLE_JSONC +# include <json-c/json.h> +#endif + +#include "jsmn.h" + +//https://www.ibm.com/support/knowledgecenter/en/SS9H2Y_7.6.0/com.ibm.dp.doc/json_parserlimits.html +#define JSON_NAME_LEN 256 +#define JSON_FULLNAME_LEN 1024 + +typedef enum { + JSON_OBJECT = 0, + JSON_ARRAY = 1, + JSON_STRING = 2, + JSON_NUMBER = 3, + JSON_BOOLEAN = 4, + JSON_NULL = 5, +} JSON_ENTRY_TYPE; + +typedef struct json_entry { + JSON_ENTRY_TYPE type; + char name[JSON_NAME_LEN + 1]; + char fullname[JSON_FULLNAME_LEN + 1]; + union { + char *string; // type == JSON_STRING + long double number; // type == JSON_NUMBER + int boolean; // type == JSON_BOOLEAN + size_t items; // type == JSON_ARRAY + } data; + size_t pos; // the position of this item in its parent + + char *original_string; + + void *callback_data; + int (*callback_function)(struct json_entry *); +} JSON_ENTRY; + +// ---------------------------------------------------------------------------- +// public functions + +#define JSON_OK 0 +#define JSON_CANNOT_DOWNLOAD 1 +#define JSON_CANNOT_PARSE 2 + +int json_parse(char *js, void *callback_data, int (*callback_function)(JSON_ENTRY *)); + + +// ---------------------------------------------------------------------------- +// private functions + +#ifdef ENABLE_JSONC +json_object *json_tokenise(char *js); +size_t json_walk(json_object *t, void *callback_data, int (*callback_function)(struct json_entry *)); +#else +jsmntok_t *json_tokenise(char *js, size_t len, size_t *count); +size_t json_walk_tree(char *js, jsmntok_t *t, void *callback_data, int (*callback_function)(struct json_entry *)); +#endif + +size_t json_walk_object(char *js, jsmntok_t *t, size_t nest, size_t start, JSON_ENTRY *e); +size_t json_walk_array(char *js, jsmntok_t 
*t, size_t nest, size_t start, JSON_ENTRY *e); +size_t json_walk_string(char *js, jsmntok_t *t, size_t start, JSON_ENTRY *e); +size_t json_walk_primitive(char *js, jsmntok_t *t, size_t start, JSON_ENTRY *e); + +int json_callback_print(JSON_ENTRY *e); + + + +#endif
\ No newline at end of file diff --git a/libnetdata/libnetdata.h b/libnetdata/libnetdata.h index 230dc244..43dc1e04 100644 --- a/libnetdata/libnetdata.h +++ b/libnetdata/libnetdata.h @@ -298,6 +298,9 @@ extern char *netdata_configured_host_prefix; #include "clocks/clocks.h" #include "popen/popen.h" #include "simple_pattern/simple_pattern.h" +#ifdef ENABLE_HTTPS +# include "socket/security.h" +#endif #include "socket/socket.h" #include "config/appconfig.h" #include "log/log.h" @@ -307,5 +310,7 @@ extern char *netdata_configured_host_prefix; #include "statistical/statistical.h" #include "adaptive_resortable_list/adaptive_resortable_list.h" #include "url/url.h" +#include "json/json.h" +#include "health/health.h" #endif // NETDATA_LIB_H diff --git a/libnetdata/socket/security.c b/libnetdata/socket/security.c new file mode 100644 index 00000000..dcbd3f65 --- /dev/null +++ b/libnetdata/socket/security.c @@ -0,0 +1,277 @@ +#include "../libnetdata.h" + +#ifdef ENABLE_HTTPS + +SSL_CTX *netdata_opentsdb_ctx=NULL; +SSL_CTX *netdata_client_ctx=NULL; +SSL_CTX *netdata_srv_ctx=NULL; +const char *security_key=NULL; +const char *security_cert=NULL; +int netdata_use_ssl_on_stream = NETDATA_SSL_OPTIONAL; +int netdata_use_ssl_on_http = NETDATA_SSL_FORCE; //We force SSL for safety reasons +int netdata_validate_server = NETDATA_SSL_VALID_CERTIFICATE; + +/** + * Info Callback + * + * Callback given to the OpenSSL library; it only logs SSL alerts at debug level. + * + * @param ssl a pointer to the SSL structure of the client (unused) + * @param where the variable with the flags set; only SSL_CB_ALERT is acted upon. + * @param ret the alert value, translated to text for the log message + */ +static void security_info_callback(const SSL *ssl, int where, int ret) { + (void)ssl; + if (where & SSL_CB_ALERT) { + debug(D_WEB_CLIENT,"SSL INFO CALLBACK %s %s", SSL_alert_type_string(ret), SSL_alert_desc_string_long(ret)); + } +} + +/** + * OpenSSL Library + * + * Initializes the OpenSSL library for Netdata.
+ */ +void security_openssl_library() +{ +#if OPENSSL_VERSION_NUMBER < 0x10100000L +# if (SSLEAY_VERSION_NUMBER >= 0x0907000L) + OPENSSL_config(NULL); +# endif + +# if OPENSSL_API_COMPAT < 0x10100000L + SSL_load_error_strings(); +# endif + + SSL_library_init(); +#else + if (OPENSSL_init_ssl(OPENSSL_INIT_LOAD_CONFIG, NULL) != 1) { + error("SSL library cannot be initialized."); + } +#endif +} + +/** + * OpenSSL common options + * + * Clients and servers share common options; this function sets them in the given context. + * + * @param ctx the SSL context that receives the options + */ +void security_openssl_common_options(SSL_CTX *ctx) { +#if OPENSSL_VERSION_NUMBER >= 0x10100000L + static char *ciphers = {"ECDHE-RSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-SHA:!aNULL:!eNULL:!EXPORT:!DES:!RC4:!MD5:!PSK:!aECDH:!EDH-DSS-DES-CBC3-SHA:!EDH-RSA-DES-CBC3-SHA:!KRB5-DES-CBC3-SHA"}; +#endif +#if OPENSSL_VERSION_NUMBER < 0x10100000L + SSL_CTX_set_options (ctx,SSL_OP_NO_SSLv2|SSL_OP_NO_SSLv3|SSL_OP_NO_COMPRESSION); +#else + SSL_CTX_set_min_proto_version(ctx, TLS1_2_VERSION); + //We are avoiding TLS v1.3 for a while, because Google Chrome + //is giving the message net::ERR_SSL_VERSION_INTERFERENCE with it. + SSL_CTX_set_max_proto_version(ctx, TLS1_2_VERSION); +#endif + SSL_CTX_set_mode(ctx, SSL_MODE_ACCEPT_MOVING_WRITE_BUFFER); + +#if OPENSSL_VERSION_NUMBER >= 0x10100000L + if (!SSL_CTX_set_cipher_list(ctx, ciphers)) { + error("SSL error. cannot set the cipher list"); + } +#endif +} + +/** + * Initialize Openssl Client + * + * Creates the client context and applies the common options (TLS 1.2 only when built with OpenSSL >= 1.1.0).
+ * + * @return the context on success or NULL otherwise + */ +static SSL_CTX * security_initialize_openssl_client() { + SSL_CTX *ctx; +#if OPENSSL_VERSION_NUMBER < 0x10100000L + ctx = SSL_CTX_new(SSLv23_client_method()); +#else + ctx = SSL_CTX_new(TLS_client_method()); +#endif + if(ctx) { + security_openssl_common_options(ctx); + } + + return ctx; +} + +/** + * Initialize OpenSSL server + * + * Creates the server context, applies the common options and loads the certificate and the private key. + * + * @return the context on success or NULL otherwise + */ +static SSL_CTX * security_initialize_openssl_server() { + SSL_CTX *ctx; + char lerror[512]; + static int netdata_id_context = 1; + + //TODO: the return values of the certificate and private key loading calls below are not checked; only SSL_CTX_check_private_key() is tested +#if OPENSSL_VERSION_NUMBER < 0x10100000L + ctx = SSL_CTX_new(SSLv23_server_method()); + if (!ctx) { + error("Cannot create a new SSL context, netdata won't encrypt communication"); + return NULL; + } + + SSL_CTX_use_certificate_file(ctx, security_cert, SSL_FILETYPE_PEM); +#else + ctx = SSL_CTX_new(TLS_server_method()); + if (!ctx) { + error("Cannot create a new SSL context, netdata won't encrypt communication"); + return NULL; + } + + SSL_CTX_use_certificate_chain_file(ctx, security_cert); +#endif + security_openssl_common_options(ctx); + + SSL_CTX_use_PrivateKey_file(ctx,security_key,SSL_FILETYPE_PEM); + + if (!SSL_CTX_check_private_key(ctx)) { + ERR_error_string_n(ERR_get_error(),lerror,sizeof(lerror)); + error("SSL cannot check the private key: %s",lerror); + SSL_CTX_free(ctx); + return NULL; + } + + SSL_CTX_set_session_id_context(ctx,(void*)&netdata_id_context,(unsigned int)sizeof(netdata_id_context)); + SSL_CTX_set_info_callback(ctx,security_info_callback); + +#if (OPENSSL_VERSION_NUMBER < 0x00905100L) + SSL_CTX_set_verify_depth(ctx,1); +#endif + debug(D_WEB_CLIENT,"SSL GLOBAL CONTEXT STARTED\n"); + + return ctx; +} + +/** + * Start SSL + * + * Calls the function that starts the requested SSL context.
+ * + * @param selector informs the context that must be initialized, the following list has the valid values: + * NETDATA_SSL_CONTEXT_SERVER - Starts the server context. + * NETDATA_SSL_CONTEXT_STREAMING - Starts the streaming context. + * NETDATA_SSL_CONTEXT_OPENTSDB - Starts the OpenTSDB context. + */ +void security_start_ssl(int selector) { + switch (selector) { + case NETDATA_SSL_CONTEXT_SERVER: { + struct stat statbuf; + if (stat(security_key,&statbuf) || stat(security_cert,&statbuf)) { + info("To use encryption it is necessary to set \"ssl certificate\" and \"ssl key\" in [web] !\n"); + return; + } + + netdata_srv_ctx = security_initialize_openssl_server(); + break; + } + case NETDATA_SSL_CONTEXT_STREAMING: { + netdata_client_ctx = security_initialize_openssl_client(); + break; + } + case NETDATA_SSL_CONTEXT_OPENTSDB: { + netdata_opentsdb_ctx = security_initialize_openssl_client(); + break; + } + } +} + +void security_clean_openssl() { /* frees every SSL context that was created */ + if (netdata_srv_ctx) + { + SSL_CTX_free(netdata_srv_ctx); + } + + if (netdata_client_ctx) + { + SSL_CTX_free(netdata_client_ctx); + } + + if ( netdata_opentsdb_ctx ) + { + SSL_CTX_free(netdata_opentsdb_ctx); + } + +#if OPENSSL_VERSION_NUMBER < 0x10100000L + ERR_free_strings(); +#endif +} + +int security_process_accept(SSL *ssl,int msg) { /* runs SSL_accept(); returns 0 once it succeeds, otherwise a NETDATA_SSL_* code */ + int sock = SSL_get_fd(ssl); + int test; + if (msg > 0x17) + { + return NETDATA_SSL_NO_HANDSHAKE; + } + + ERR_clear_error(); + if ((test = SSL_accept(ssl)) <= 0) { + int sslerrno = SSL_get_error(ssl, test); + switch(sslerrno) { + case SSL_ERROR_WANT_READ: + { + error("SSL handshake did not finish and it wanna read on socket %d!", sock); + return NETDATA_SSL_WANT_READ; + } + case SSL_ERROR_WANT_WRITE: + { + error("SSL handshake did not finish and it wanna write on socket %d!", sock); + return NETDATA_SSL_WANT_WRITE; + } + case SSL_ERROR_NONE: + case SSL_ERROR_SSL: + case SSL_ERROR_SYSCALL: + default: + { + u_long err; + char buf[256]; + int counter = 0; + while ((err = ERR_get_error()) != 0) { +
ERR_error_string_n(err, buf, sizeof(buf)); + info("%d SSL Handshake error (%s) on socket %d ", counter++, buf, sock); /* log the queued error text formatted above */ + } + return NETDATA_SSL_NO_HANDSHAKE; + } + } + } + + if (SSL_is_init_finished(ssl)) + { + debug(D_WEB_CLIENT_ACCESS,"SSL Handshake finished %s errno %d on socket fd %d", ERR_error_string((long)SSL_get_error(ssl, test), NULL), errno, sock); + } + + return 0; +} + +int security_test_certificate(SSL *ssl) { /* returns 0 when the peer sent a certificate and its verification result is X509_V_OK, -1 otherwise */ + X509* cert = SSL_get_peer_certificate(ssl); + int ret; + long status; + if (!cert) { + return -1; + } + X509_free(cert); /* only the presence of the certificate matters; release the reference taken by SSL_get_peer_certificate() */ + status = SSL_get_verify_result(ssl); + if((X509_V_OK != status)) + { + char lerror[512]; + ERR_error_string_n(ERR_get_error(), lerror, sizeof(lerror)); + error("SSL RFC4158 check: We have a invalid certificate, the tests result with %ld and message %s", status, lerror); + ret = -1; + } else { + ret = 0; + } + return ret; +} + +#endif diff --git a/libnetdata/socket/security.h b/libnetdata/socket/security.h new file mode 100644 index 00000000..8beb9672 --- /dev/null +++ b/libnetdata/socket/security.h @@ -0,0 +1,47 @@ +#ifndef NETDATA_SECURITY_H +# define NETDATA_SECURITY_H + +# define NETDATA_SSL_HANDSHAKE_COMPLETE 0 //All the steps were successful +# define NETDATA_SSL_START 1 //Starting handshake, conn variable is NULL +# define NETDATA_SSL_WANT_READ 2 //The connection wanna read from socket +# define NETDATA_SSL_WANT_WRITE 4 //The connection wanna write on socket +# define NETDATA_SSL_NO_HANDSHAKE 8 //Continue without encrypt connection.
+# define NETDATA_SSL_OPTIONAL 16 //Flag to define the HTTP request +# define NETDATA_SSL_FORCE 32 //We only accept HTTPS requests +# define NETDATA_SSL_INVALID_CERTIFICATE 64 //Accepts invalid certificates +# define NETDATA_SSL_VALID_CERTIFICATE 128 //Presumably: accepts only valid certificates (was a copy of the comment above) - TODO confirm + +#define NETDATA_SSL_CONTEXT_SERVER 0 +#define NETDATA_SSL_CONTEXT_STREAMING 1 +#define NETDATA_SSL_CONTEXT_OPENTSDB 2 + +# ifdef ENABLE_HTTPS + +# include <openssl/ssl.h> +# include <openssl/err.h> +# if (SSLEAY_VERSION_NUMBER >= 0x0907000L) && (OPENSSL_VERSION_NUMBER < 0x10100000L) +# include <openssl/conf.h> +# endif + +struct netdata_ssl{ + SSL *conn; //SSL connection + int flags; +}; + +extern SSL_CTX *netdata_opentsdb_ctx; +extern SSL_CTX *netdata_client_ctx; +extern SSL_CTX *netdata_srv_ctx; +extern const char *security_key; +extern const char *security_cert; +extern int netdata_use_ssl_on_stream; +extern int netdata_use_ssl_on_http; +extern int netdata_validate_server; + +void security_openssl_library(); +void security_clean_openssl(); +void security_start_ssl(int selector); +int security_process_accept(SSL *ssl,int msg); +int security_test_certificate(SSL *ssl); + +# endif //ENABLE_HTTPS +#endif //NETDATA_SECURITY_H diff --git a/libnetdata/socket/socket.c b/libnetdata/socket/socket.c index bf9c60ea..28271008 100644 --- a/libnetdata/socket/socket.c +++ b/libnetdata/socket/socket.c @@ -301,14 +301,39 @@ void listen_sockets_close(LISTEN_SOCKETS *sockets) { sockets->failed = 0; } +WEB_CLIENT_ACL socket_ssl_acl(char *ssl) { +#ifdef ENABLE_HTTPS + if (!strcmp(ssl,"optional")) { + netdata_use_ssl_on_http = NETDATA_SSL_OPTIONAL; + return WEB_CLIENT_ACL_DASHBOARD | WEB_CLIENT_ACL_REGISTRY | WEB_CLIENT_ACL_BADGE | WEB_CLIENT_ACL_MGMT | WEB_CLIENT_ACL_NETDATACONF | WEB_CLIENT_ACL_STREAMING; + } + else if (!strcmp(ssl,"force")) { + netdata_use_ssl_on_stream = NETDATA_SSL_FORCE; /* NOTE(review): "optional" changes netdata_use_ssl_on_http but "force" changes netdata_use_ssl_on_stream - confirm the asymmetry is intended */ + return WEB_CLIENT_ACL_DASHBOARD | WEB_CLIENT_ACL_REGISTRY | WEB_CLIENT_ACL_BADGE | WEB_CLIENT_ACL_MGMT |
WEB_CLIENT_ACL_NETDATACONF | WEB_CLIENT_ACL_STREAMING; + } +#endif + + return WEB_CLIENT_ACL_NONE; +} + WEB_CLIENT_ACL read_acl(char *st) { + char *ssl = strchr(st,'^'); + if (ssl) { + *ssl++ = '\0'; /* cut the "^..." suffix off st, so the ACL names compared below can still match */ + if (!strncmp("SSL=",ssl,4)) { + ssl += 4; + } + socket_ssl_acl(ssl); + } + if (!strcmp(st,"dashboard")) return WEB_CLIENT_ACL_DASHBOARD; if (!strcmp(st,"registry")) return WEB_CLIENT_ACL_REGISTRY; if (!strcmp(st,"badges")) return WEB_CLIENT_ACL_BADGE; if (!strcmp(st,"management")) return WEB_CLIENT_ACL_MGMT; if (!strcmp(st,"streaming")) return WEB_CLIENT_ACL_STREAMING; if (!strcmp(st,"netdata.conf")) return WEB_CLIENT_ACL_NETDATACONF; - return WEB_CLIENT_ACL_NONE; + + return socket_ssl_acl(st); } static inline int bind_to_this(LISTEN_SOCKETS *sockets, const char *definition, uint16_t default_port, int listen_backlog) { @@ -794,11 +819,15 @@ int connect_to_one_of(const char *destination, int default_port, struct timeval while(*s) { const char *e = s; + // skip path, moving both s(tart) and e(nd), without running past the end of the string + if(*e == '/') + while(*e && !isspace(*e) && *e != ',') s = ++e; + // skip separators, moving both s(tart) and e(nd) while(isspace(*e) || *e == ',') s = ++e; // move e(nd) to the first separator - while(*e && !isspace(*e) && *e != ',') e++; + while(*e && !isspace(*e) && *e != ',' && *e != '/') e++; // is there anything?
if(!*s || s == e) break; @@ -824,7 +853,12 @@ int connect_to_one_of(const char *destination, int default_port, struct timeval // -------------------------------------------------------------------------------------------------------------------- // helpers to send/receive data in one call, in blocking mode, with a timeout +#ifdef ENABLE_HTTPS +ssize_t recv_timeout(struct netdata_ssl *ssl,int sockfd, void *buf, size_t len, int flags, int timeout) { +#else ssize_t recv_timeout(int sockfd, void *buf, size_t len, int flags, int timeout) { +#endif + for(;;) { struct pollfd fd = { .fd = sockfd, @@ -852,10 +886,22 @@ ssize_t recv_timeout(int sockfd, void *buf, size_t len, int flags, int timeout) if(fd.events & POLLIN) break; } +#ifdef ENABLE_HTTPS + if (ssl->conn) { + if (!ssl->flags) { + return SSL_read(ssl->conn,buf,len); + } + } +#endif return recv(sockfd, buf, len, flags); } +#ifdef ENABLE_HTTPS +ssize_t send_timeout(struct netdata_ssl *ssl,int sockfd, void *buf, size_t len, int flags, int timeout) { +#else ssize_t send_timeout(int sockfd, void *buf, size_t len, int flags, int timeout) { +#endif + for(;;) { struct pollfd fd = { .fd = sockfd, @@ -883,6 +929,13 @@ ssize_t send_timeout(int sockfd, void *buf, size_t len, int flags, int timeout) if(fd.events & POLLOUT) break; } +#ifdef ENABLE_HTTPS + if(ssl->conn) { + if (!ssl->flags) { + return SSL_write(ssl->conn, buf, len); + } + } +#endif return send(sockfd, buf, len, flags); } @@ -1291,6 +1344,8 @@ static void poll_events_process(POLLJOB *p, POLLINFO *pi, struct pollfd *pf, sho do { char client_ip[NI_MAXHOST + 1]; char client_port[NI_MAXSERV + 1]; + client_ip[0] = 0x00; + client_port[0] = 0x00; debug(D_POLLFD, "POLLFD: LISTENER: calling accept4() slot %zu (fd %d)", i, fd); nfd = accept_socket(fd, SOCK_NONBLOCK, client_ip, NI_MAXHOST + 1, client_port, NI_MAXSERV + 1, p->access_list); diff --git a/libnetdata/socket/socket.h b/libnetdata/socket/socket.h index c69d4897..9ea83bcc 100644 --- a/libnetdata/socket/socket.h +++ 
b/libnetdata/socket/socket.h @@ -51,8 +51,13 @@ extern void listen_sockets_close(LISTEN_SOCKETS *sockets); extern int connect_to_this(const char *definition, int default_port, struct timeval *timeout); extern int connect_to_one_of(const char *destination, int default_port, struct timeval *timeout, size_t *reconnects_counter, char *connected_to, size_t connected_to_size); +#ifdef ENABLE_HTTPS +extern ssize_t recv_timeout(struct netdata_ssl *ssl,int sockfd, void *buf, size_t len, int flags, int timeout); +extern ssize_t send_timeout(struct netdata_ssl *ssl,int sockfd, void *buf, size_t len, int flags, int timeout); +#else extern ssize_t recv_timeout(int sockfd, void *buf, size_t len, int flags, int timeout); extern ssize_t send_timeout(int sockfd, void *buf, size_t len, int flags, int timeout); +#endif extern int sock_setnonblock(int fd); extern int sock_delnonblock(int fd); diff --git a/netdata-installer.sh b/netdata-installer.sh index 4ed4050f..a0c3f828 100755 --- a/netdata-installer.sh +++ b/netdata-installer.sh @@ -44,15 +44,21 @@ else source "${NETDATA_SOURCE_DIR}/packaging/installer/functions.sh" || exit 1 fi -download() { +download_go() { url="${1}" dest="${2}" + if command -v curl >/dev/null 2>&1; then - run curl -sSL --connect-timeout 10 --retry 3 "${url}" >"${dest}" || fatal "Cannot download ${url}" + run curl -sSL --connect-timeout 10 --retry 3 "${url}" > "${dest}" elif command -v wget >/dev/null 2>&1; then - run wget -T 15 -O - "${url}" >"${dest}" || fatal "Cannot download ${url}" + run wget -T 15 -O - "${url}" > "${dest}" else - fatal "I need curl or wget to proceed, but neither is available on this system." + echo >&2 + echo >&2 "Downloading go.d plugin from '${url}' failed because of missing mandatory packages." + echo >&2 "Either add packages or disable it by issuing '--disable-go' in the installer" + echo >&2 + + run_failed "I need curl or wget to proceed, but neither is available on this system." 
fi } @@ -159,6 +165,9 @@ USAGE: ${PROGRAM} [options] --enable-backend-kinesis Enable AWS Kinesis backend. Default: enable it when libaws_cpp_sdk_kinesis and libraries it depends on are available. --disable-backend-kinesis + --enable-backend-prometheus-remote-write Enable Prometheus remote write backend. Default: enable it when libprotobuf and + libsnappy are available. + --disable-backend-prometheus-remote-write --enable-lto Enable Link-Time-Optimization. Default: enabled --disable-lto --disable-x86-sse Disable SSE instructions. By default SSE optimizations are enabled. @@ -204,8 +213,10 @@ while [ -n "${1}" ]; do "--disable-plugin-nfacct") NETDATA_CONFIGURE_OPTIONS="${NETDATA_CONFIGURE_OPTIONS//--disable-plugin-nfacct/} --disable-plugin-nfacct";; "--enable-plugin-xenstat") NETDATA_CONFIGURE_OPTIONS="${NETDATA_CONFIGURE_OPTIONS//--enable-plugin-xenstat/} --enable-plugin-xenstat";; "--disable-plugin-xenstat") NETDATA_CONFIGURE_OPTIONS="${NETDATA_CONFIGURE_OPTIONS//--disable-plugin-xenstat/} --disable-plugin-xenstat";; - "--enable-backend-kinesis") NETDATA_CONFIGURE_OPTIONS="${NETDATA_CONFIGURE_OPTIONS//--enable-backend-kinesis/} --enable-backend-kinesis";; - "--disable-backend-kinesis") NETDATA_CONFIGURE_OPTIONS="${NETDATA_CONFIGURE_OPTIONS//--disable-backend-kinesis/} --disable-backend-kinesis";; + "--enable-backend-kinesis") NETDATA_CONFIGURE_OPTIONS="${NETDATA_CONFIGURE_OPTIONS//--enable-backend-kinesis/} --enable-backend-kinesis";; + "--disable-backend-kinesis") NETDATA_CONFIGURE_OPTIONS="${NETDATA_CONFIGURE_OPTIONS//--disable-backend-kinesis/} --disable-backend-kinesis";; + "--enable-backend-prometheus-remote-write") NETDATA_CONFIGURE_OPTIONS="${NETDATA_CONFIGURE_OPTIONS//--enable-backend-prometheus-remote-write/} --enable-backend-prometheus-remote-write";; + "--disable-backend-prometheus-remote-write") NETDATA_CONFIGURE_OPTIONS="${NETDATA_CONFIGURE_OPTIONS//--disable-backend-prometheus-remote-write/} --disable-backend-prometheus-remote-write";; "--enable-lto") 
NETDATA_CONFIGURE_OPTIONS="${NETDATA_CONFIGURE_OPTIONS//--enable-lto/} --enable-lto";; "--disable-lto") NETDATA_CONFIGURE_OPTIONS="${NETDATA_CONFIGURE_OPTIONS//--disable-lto/} --disable-lto";; "--disable-x86-sse") NETDATA_CONFIGURE_OPTIONS="${NETDATA_CONFIGURE_OPTIONS//--disable-x86-sse/} --disable-x86-sse";; @@ -383,6 +394,7 @@ run ./configure \ --prefix="${NETDATA_PREFIX}/usr" \ --sysconfdir="${NETDATA_PREFIX}/etc" \ --localstatedir="${NETDATA_PREFIX}/var" \ + --libexecdir="${NETDATA_PREFIX}/usr/libexec" \ --with-zlib \ --with-math \ --with-user=netdata \ @@ -540,6 +552,7 @@ progress "Install logrotate configuration for netdata" install_netdata_logrotate + # ----------------------------------------------------------------------------- progress "Read installation options from netdata.conf" @@ -628,7 +641,7 @@ fi # --- conf dir ---- -for x in "python.d" "charts.d" "node.d" "health.d" "statsd.d" "go.d"; do +for x in "python.d" "charts.d" "node.d" "health.d" "statsd.d" "go.d" "custom-plugins.d" "ssl"; do if [ ! 
-d "${NETDATA_USER_CONFIG_DIR}/${x}" ]; then echo >&2 "Creating directory '${NETDATA_USER_CONFIG_DIR}/${x}'" run mkdir -p "${NETDATA_USER_CONFIG_DIR}/${x}" || exit 1 @@ -723,15 +736,20 @@ if [ "${UID}" -eq 0 ]; then run chmod 4750 "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/nfacct.plugin" fi - if [ -f "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/xenstat.plugin" ]; then - run chown root:${NETDATA_GROUP} "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/xenstat.plugin" - run chmod 4750 "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/xenstat.plugin" - fi + if [ -f "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/xenstat.plugin" ]; then + run chown root:${NETDATA_GROUP} "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/xenstat.plugin" + run chmod 4750 "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/xenstat.plugin" + fi - if [ -f "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/ioping" ]; then - run chown root:${NETDATA_GROUP} "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/ioping" - run chmod 4750 "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/ioping" - fi + if [ -f "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/perf.plugin" ]; then + run chown root:${NETDATA_GROUP} "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/perf.plugin" + run chmod 4750 "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/perf.plugin" + fi + + if [ -f "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/ioping" ]; then + run chown root:${NETDATA_GROUP} "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/ioping" + run chmod 4750 "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/ioping" + fi if [ -f "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/cgroup-network" ]; then run chown "root:${NETDATA_GROUP}" "${NETDATA_PREFIX}/usr/libexec/netdata/plugins.d/cgroup-network" @@ -755,7 +773,7 @@ fi install_go() { # When updating this value, ensure correct checksums in packaging/go.d.checksums - GO_PACKAGE_VERSION="v0.5.0" + GO_PACKAGE_VERSION="v0.7.0" ARCH_MAP=( 'i386::386' 'i686::386' @@ -775,24 
+793,37 @@ install_go() { for index in "${ARCH_MAP[@]}" ; do KEY="${index%%::*}" VALUE="${index##*::}" - if [ "$KEY" == "$ARCH" ]; then + if [ "$KEY" = "$ARCH" ]; then ARCH="${VALUE}" break fi done tmp=$(mktemp -d /tmp/netdata-go-XXXXXX) - GO_PACKAGE_BASENAME="go.d.plugin-$GO_PACKAGE_VERSION.$OS-$ARCH" + GO_PACKAGE_BASENAME="go.d.plugin-${GO_PACKAGE_VERSION}.${OS}-${ARCH}" + + download_go "https://github.com/netdata/go.d.plugin/releases/download/${GO_PACKAGE_VERSION}/${GO_PACKAGE_BASENAME}" "${tmp}/${GO_PACKAGE_BASENAME}" - download "https://github.com/netdata/go.d.plugin/releases/download/$GO_PACKAGE_VERSION/$GO_PACKAGE_BASENAME" "${tmp}/$GO_PACKAGE_BASENAME" + download_go "https://github.com/netdata/go.d.plugin/releases/download/${GO_PACKAGE_VERSION}/config.tar.gz" "${tmp}/config.tar.gz" + + if [ ! -f "${tmp}/${GO_PACKAGE_BASENAME}" ] || [ ! -f "${tmp}/config.tar.gz" ] || [ ! -s "${tmp}/config.tar.gz" ] || [ ! -s "${tmp}/${GO_PACKAGE_BASENAME}" ]; then + run_failed "go.d plugin download failed, go.d plugin will not be available" + echo >&2 "Either check the error or consider disabling it by issuing '--disable-go' in the installer" + echo >&2 + return 0 + fi - download "https://github.com/netdata/go.d.plugin/releases/download/$GO_PACKAGE_VERSION/config.tar.gz" "${tmp}/config.tar.gz" grep "${GO_PACKAGE_BASENAME}\$" "${INSTALLER_DIR}/packaging/go.d.checksums" > "${tmp}/sha256sums.txt" 2>/dev/null grep "config.tar.gz" "${INSTALLER_DIR}/packaging/go.d.checksums" >> "${tmp}/sha256sums.txt" 2>/dev/null # Checksum validation if ! (cd "${tmp}" && safe_sha256sum -c "sha256sums.txt"); then + + echo >&2 "go.d plugin checksum validation failure." + echo >&2 "Either check the error or consider disabling it by issuing '--disable-go' in the installer" + echo >&2 + run_failed "go.d.plugin package files checksum validation failed." 
- return 1 + return 0 fi # Install new files @@ -812,12 +843,23 @@ install_go() { install_go # ----------------------------------------------------------------------------- +progress "Telemetry configuration" + +# Opt-out from telemetry program +if [ -n "${NETDATA_DISABLE_TELEMETRY+x}" ]; then + run touch "${NETDATA_USER_CONFIG_DIR}/.opt-out-from-anonymous-statistics" +else + printf "You can opt out from anonymous statistics via the --disable-telemetry option, or by creating an empty file ${NETDATA_USER_CONFIG_DIR}/.opt-out-from-anonymous-statistics \n\n" +fi + +# ----------------------------------------------------------------------------- progress "Install netdata at system init" NETDATA_START_CMD="${NETDATA_PREFIX}/usr/sbin/netdata" if grep -q docker /proc/1/cgroup >/dev/null 2>&1; then echo >&2 "We are running within a docker container, will not be installing netdata service" + echo >&2 else install_netdata_service || run_failed "Cannot install netdata init service." fi @@ -839,7 +881,7 @@ else create_netdata_conf "${NETDATA_PREFIX}/etc/netdata/netdata.conf" "http://localhost:${NETDATA_PORT}/netdata.conf" fi if [ "${UID}" -eq 0 ]; then - run chown "${NETDATA_USER}" "${NETDATA_PREFIX}/etc/netdata/netdata.conf" + run chown "${NETDATA_USER}" "${NETDATA_PREFIX}/etc/netdata/netdata.conf" fi run chmod 0664 "${NETDATA_PREFIX}/etc/netdata/netdata.conf" @@ -1035,10 +1077,6 @@ RELEASE_CHANNEL="${RELEASE_CHANNEL}" NETDATA_TARBALL_CHECKSUM="new_installation" EOF -# Opt-out from telemetry program -if [ -n "${NETDATA_DISABLE_TELEMETRY+x}" ]; then - touch "${NETDATA_USER_CONFIG_DIR}/.opt-out-from-anonymous-statistics" -fi # ----------------------------------------------------------------------------- echo >&2 diff --git a/netdata.spec.in b/netdata.spec.in index e201f4a1..25b5f9a4 100644 --- a/netdata.spec.in +++ b/netdata.spec.in @@ -6,11 +6,15 @@ # error. 
%global __os_install_post %{nil} +# Mitigate the cross-distro mayhem by strictly defining the libexec destination +%define _prefix /usr +%define _sysconfdir /etc +%define _localstatedir /var +%define _libexecdir /usr/libexec + # # Conditional build: %bcond_without systemd # systemd -%bcond_with nfacct # build with nfacct plugin -%bcond_with freeipmi # build with freeipmi plugin %bcond_with netns # build with netns support (cgroup-network) %if 0%{?fedora} || 0%{?rhel} >= 7 || 0%{?suse_version} >= 1140 @@ -27,8 +31,12 @@ BuildRequires: systemd-rpm-macros \ %global netdata_initd_requires \ %{?systemd_requires} \ %{nil} -%global netdata_init_post %service_add_post netdata.service -%global netdata_init_preun %service_del_preun netdata.service +%global netdata_init_post %service_add_post netdata.service \ +/sbin/service netdata restart > /dev/null 2>&1 \ +%{nil} +%global netdata_init_preun %service_del_preun netdata.service \ +/sbin/service netdata stop > /dev/null 2>&1 \ +%{nil} %global netdata_init_postun %service_del_postun netdata.service %else %global netdata_initd_buildrequires \ @@ -38,7 +46,11 @@ Requires(preun): systemd-units \ Requires(postun): systemd-units \ Requires(post): systemd-units \ %{nil} -%global netdata_init_post %systemd_post netdata.service +%global netdata_init_post %systemd_post netdata.service \ +/usr/bin/systemctl enable netdata.service \ +/usr/bin/systemctl daemon-reload \ +/usr/bin/systemctl restart netdata.service \ +%{nil} %global netdata_init_preun %systemd_preun netdata.service %global netdata_init_postun %systemd_postun_with_restart netdata.service %endif @@ -49,6 +61,7 @@ Requires(post): chkconfig \ %{nil} %global netdata_init_post \ /sbin/chkconfig --add netdata \ +/sbin/service netdata restart > /dev/null 2>&1 \ %{nil} %global netdata_init_preun %{nil} \ if [ $1 = 0 ]; then \ @@ -63,23 +76,7 @@ fi \ %{nil} %endif -%if 0%{?_fedora} -%global netdata_recommends \ -Recommends: curl \ -Recommends: iproute-tc \ -Recommends: lm_sensors \ 
-Recommends: nmap-ncat \ -Recommends: nodejs \ -Recommends: python \ -Recommends: PyYAML \ -Recommends: python2-PyMySQL \ -Recommends: python2-psycopg2 \ -%{nil} -%else -%global netdata_recommends %{nil} -%endif - -Summary: Real-time performance monitoring, done right +Summary: Real-time performance monitoring, done right! Name: netdata Version: @PACKAGE_VERSION@ Release: 1%{?dist} @@ -87,43 +84,122 @@ License: GPLv3+ Group: Applications/System Source0: https://github.com/netdata/%{name}/releases/download/%{version}/%{name}-%{version}.tar.gz URL: http://my-netdata.io -BuildRequires: pkgconfig -BuildRequires: xz -BuildRequires: zlib-devel -BuildRequires: libuuid-devel -BuildRequires: autoconf -BuildRequires: automake -Requires: zlib -Requires: libuuid - -# Packages can be found in the EPEL repo -%if %{with nfacct} -BuildRequires: libmnl-devel -BuildRequires: libnetfilter_acct-devel -Requires: libmnl -Requires: libnetfilter_acct + +# ##################################################################### +# Core build/install/runtime dependencies +# ##################################################################### + +# Build dependencies +# +BuildRequires: gcc +BuildRequires: make +BuildRequires: git +BuildRequires: autoconf +%if 0%{?fedora} || 0%{?rhel} >= 7 || 0%{?suse_version} >= 1140 +BuildRequires: autoconf-archive +BuildRequires: autogen +%endif +BuildRequires: automake +BuildRequires: pkgconfig +BuildRequires: curl +BuildRequires: findutils +BuildRequires: zlib-devel +BuildRequires: libuuid-devel +BuildRequires: libuv-devel >= 1 +BuildRequires: openssl-devel +%if 0%{?suse_version} +BuildRequires: judy-devel +BuildRequires: liblz4-devel +BuildRequires: netcat-openbsd +BuildRequires: json-glib-devel +%else +BuildRequires: Judy-devel +BuildRequires: lz4-devel +BuildRequires: nc +BuildRequires: json-c-devel %endif -%if %{with freeipmi} -BuildRequires: freeipmi-devel -Requires: freeipmi +# Core build requirements for service install 
+%{netdata_initd_buildrequires} + +# Runtime dependencies +# +Requires: python +Requires: zlib +%if 0%{?suse_version} +# for libuv, Requires version >= 1 +Requires: libuv1 +Requires: libJudy1 +Requires: json-glib +Requires: libuuid1 +%else +# for libuv, Requires version >= 1 +Requires: libuv >= 1 +Requires: Judy +Requires: json-c +Requires: libuuid %endif +Requires: openssl +Requires: lz4 +# Core requirements for the install to succeed Requires(pre): /usr/sbin/groupadd Requires(pre): /usr/sbin/useradd +%if 0%{?suse_version} >= 1140 +Requires(post): libcap1 +%else Requires(post): libcap +%endif -%{netdata_initd_buildrequires} -%{netdata_recommends} %{netdata_initd_requires} +# ##################################################################### +# Functionality-dependent package dependencies +# ##################################################################### +# Note: Some or all of the Packages may be found in the EPEL repo, +# rather than the standard ones + +# nfacct plugin dependencies +BuildRequires: libmnl-devel +%if 0%{?fedora} || 0%{?suse_version} >= 1140 +BuildRequires: libnetfilter_acct-devel +%endif + +%if 0%{?suse_version} +Requires: libmnl0 +%else +Requires: libmnl +%endif + +%if 0%{?fedora} +Requires: libnetfilter_acct +%else +%if 0%{?suse_version} >= 1140 +Requires: libnetfilter_acct1 +%endif +%endif +# end nfacct plugin dependencies + +# freeipmi plugin dependencies +BuildRequires: freeipmi-devel +Requires: freeipmi +# end - freeipmi plugin dependencies + +# CUPS plugin dependencies +BuildRequires: cups-devel +Requires: cups +# end - cups plugin dependencies + +# ##################################################################### +# End of dependency management configuration +# ##################################################################### + %description -netdata is the fastest way to visualize metrics. It is a resource + netdata is the fastest way to visualize metrics. 
It is a resource efficient, highly optimized system for collecting and visualizing any type of realtime timeseries data, from CPU usage, disk activity, SQL queries, API calls, web site visitors, etc. - -netdata tries to visualize the truth of now, in its greatest detail, + netdata tries to visualize the truth of now, in its greatest detail, so that you can get insights of what is happening now and what just happened, on your systems and applications. @@ -131,25 +207,57 @@ happened, on your systems and applications. %setup -q -n %{name}-%{version} %build -autoreconf -i +# Conf step +autoreconf -ivf %configure \ + --prefix="%{_prefix}" \ + --sysconfdir="%{_sysconfdir}" \ + --localstatedir="%{_localstatedir}" \ + --libexecdir="%{_libexecdir}" \ --with-zlib \ --with-math \ - %{?with_nfacct:--enable-plugin-nfacct} \ - %{?with_freeipmi:--enable-plugin-freeipmi} \ - --with-user=netdata + --with-user=netdata \ + +# Build step %{__make} %{?_smp_mflags} %install + +# ########################################################### +# Clear the directory, if already exists and install rm -rf "${RPM_BUILD_ROOT}" %{__make} %{?_smp_mflags} DESTDIR="${RPM_BUILD_ROOT}" install find "${RPM_BUILD_ROOT}" -name .keep -delete install -m 644 -p system/netdata.conf "${RPM_BUILD_ROOT}%{_sysconfdir}/%{name}" + +# ########################################################### +# logrotate settings install -m 755 -d "${RPM_BUILD_ROOT}%{_sysconfdir}/logrotate.d" install -m 644 -p system/netdata.logrotate "${RPM_BUILD_ROOT}%{_sysconfdir}/logrotate.d/%{name}" +# ########################################################### +# Install freeipmi +install -m 4750 -p freeipmi.plugin "${RPM_BUILD_ROOT}%{_libexecdir}/%{name}/plugins.d/freeipmi.plugin" + +# ########################################################### +# Install apps.plugin +install -m 4750 -p apps.plugin "${RPM_BUILD_ROOT}%{_libexecdir}/%{name}/plugins.d/apps.plugin" + +# ########################################################### +# Install 
perf.plugin +install -m 4750 -p perf.plugin "${RPM_BUILD_ROOT}%{_libexecdir}/%{name}/plugins.d/perf.plugin" + +# ########################################################### +# Install registry directory +install -m 755 -d "${RPM_BUILD_ROOT}%{_localstatedir}/lib/%{name}/registry" +install -m 755 -d "${RPM_BUILD_ROOT}%{_sysconfdir}/%{name}/custom-plugins.d" +install -m 755 -d "${RPM_BUILD_ROOT}%{_sysconfdir}/%{name}/go.d" +install -m 755 -d "${RPM_BUILD_ROOT}%{_sysconfdir}/%{name}/ssl" + +# ########################################################### +# Install netdata service %if %{with systemd} install -m 755 -d "${RPM_BUILD_ROOT}%{_unitdir}" install -m 644 -p system/netdata.service "${RPM_BUILD_ROOT}%{_unitdir}/netdata.service" @@ -160,7 +268,100 @@ install -m 755 system/netdata-init-d \ "${RPM_BUILD_ROOT}/etc/rc.d/init.d/netdata" %endif +# ############################################################ +# Package Go within netdata (TBD: Package it separately) +safe_sha256sum() { + # Within the contexct of the installer, we only use -c option that is common between the two commands + # We will have to reconsider if we start non-common options + if command -v sha256sum >/dev/null 2>&1; then + sha256sum $@ + elif command -v shasum >/dev/null 2>&1; then + shasum -a 256 $@ + else + fatal "I could not find a suitable checksum binary to use" + fi +} + +download_go() { + url="${1}" + dest="${2}" + + if command -v curl >/dev/null 2>&1; then + curl -sSL --connect-timeout 10 --retry 3 "${url}" > "${dest}" + elif command -v wget >/dev/null 2>&1; then + wget -T 15 -O - "${url}" > "${dest}" + else + echo >&2 + echo >&2 "Downloading go.d plugin from '${url}' failed because of missing mandatory packages." 
+ echo >&2 "Either add packages or disable it by issuing '--disable-go' in the installer" + echo >&2 + exit 1 + fi +} + +install_go() { + # When updating this value, ensure correct checksums in packaging/go.d.checksums + GO_PACKAGE_VERSION="v0.7.0" + ARCH_MAP=( + 'i386::386' + 'i686::386' + 'x86_64::amd64' + 'aarch64::arm64' + 'armv64::arm64' + 'armv6l::arm' + 'armv7l::arm' + 'armv5tel::arm' + ) + + if [ -z "${NETDATA_DISABLE_GO+x}" ]; then + echo >&2 "Install go.d.plugin" + ARCH=$(uname -m) + OS=$(uname -s | tr '[:upper:]' '[:lower:]') + + for index in "${ARCH_MAP[@]}" ; do + KEY="${index%%::*}" + VALUE="${index##*::}" + if [ "$KEY" = "$ARCH" ]; then + ARCH="${VALUE}" + break + fi + done + tmp=$(mktemp -d /tmp/netdata-go-XXXXXX) + GO_PACKAGE_BASENAME="go.d.plugin-${GO_PACKAGE_VERSION}.${OS}-${ARCH}" + download_go "https://github.com/netdata/go.d.plugin/releases/download/${GO_PACKAGE_VERSION}/${GO_PACKAGE_BASENAME}" "${tmp}/${GO_PACKAGE_BASENAME}" + download_go "https://github.com/netdata/go.d.plugin/releases/download/${GO_PACKAGE_VERSION}/config.tar.gz" "${tmp}/config.tar.gz" + + if [ ! -f "${tmp}/${GO_PACKAGE_BASENAME}" ] || [ ! -f "${tmp}/config.tar.gz" ] || [ ! -s "${tmp}/config.tar.gz" ] || [ ! -s "${tmp}/${GO_PACKAGE_BASENAME}" ]; then + echo >&2 "Either check the error or consider disabling it by issuing '--disable-go' in the installer" + echo >&2 + return 1 + fi + + grep "${GO_PACKAGE_BASENAME}\$" "packaging/go.d.checksums" > "${tmp}/sha256sums.txt" 2>/dev/null + grep "config.tar.gz" "packaging/go.d.checksums" >> "${tmp}/sha256sums.txt" 2>/dev/null + + # Checksum validation + if ! (cd "${tmp}" && safe_sha256sum -c "sha256sums.txt"); then + + echo >&2 "go.d plugin checksum validation failure." + echo >&2 "Either check the error or consider disabling it by issuing '--disable-go' in the installer" + echo >&2 + + echo "go.d.plugin package files checksum validation failed." 
+ exit 1 + fi + + # Install files + tar -xf "${tmp}/config.tar.gz" -C "${RPM_BUILD_ROOT}%{_libdir}/%{name}/conf.d/" + mv "${tmp}/$GO_PACKAGE_BASENAME" "${RPM_BUILD_ROOT}%{_libexecdir}/%{name}/plugins.d/go.d.plugin" + fi + return 0 +} +install_go + %pre + +# User/Group creations, as needed getent group netdata >/dev/null || groupadd -r netdata getent group docker >/dev/null || groupadd -r docker getent passwd netdata >/dev/null || \ @@ -181,24 +382,28 @@ rm -rf "${RPM_BUILD_ROOT}" %files %doc README.md -%defattr(-,root,root) +%defattr(-,root,netdata) %dir %{_sysconfdir}/%{name} %dir %{_libdir}/%{name} -%config %{_sysconfdir}/%{name}/*.conf -#%config %{_sysconfdir}/%{name}/charts.d/*.conf -#%config %{_sysconfdir}/%{name}/health.d/*.conf -#%config %{_sysconfdir}/%{name}/node.d/*.conf -#%config %{_sysconfdir}/%{name}/python.d/*.conf -#%config %{_sysconfdir}/%{name}/statsd.d/*.conf -%config %{_sysconfdir}/logrotate.d/%{name} +%config(noreplace) %{_sysconfdir}/%{name}/*.conf +%config(noreplace) %{_sysconfdir}/logrotate.d/%{name} %{_libdir}/%{name} + +%defattr(0755,netdata,netdata,0755) %{_libexecdir}/%{name} %{_sbindir}/%{name} %{_sysconfdir}/%{name}/edit-config +%defattr(4750,root,netdata,0750) + +%dir %{_libexecdir}/%{name}/python.d +%dir %{_libexecdir}/%{name}/charts.d +%dir %{_libexecdir}/%{name}/plugins.d +%dir %{_libexecdir}/%{name}/node.d + %caps(cap_dac_read_search,cap_sys_ptrace=ep) %attr(0550,root,netdata) %{_libexecdir}/%{name}/plugins.d/apps.plugin %if %{with netns} @@ -209,27 +414,25 @@ rm -rf "${RPM_BUILD_ROOT}" %attr(0550,root,root) %{_libexecdir}/%{name}/plugins.d/cgroup-network-helper.sh %endif -%if %{with freeipmi} -%caps(cap_setuid=ep) %attr(4550,root,netdata) %{_libexecdir}/%{name}/plugins.d/freeipmi.plugin -%endif +# perf plugin +%caps(cap_setuid=ep) %attr(4750,root,netdata) %{_libexecdir}/%{name}/plugins.d/perf.plugin -%attr(0770,netdata,netdata) %dir %{_localstatedir}/cache/%{name} -%attr(0755,netdata,root) %dir %{_localstatedir}/log/%{name} 
-%attr(0770,netdata,netdata) %dir %{_localstatedir}/lib/%{name} +# freeipmi files +%caps(cap_setuid=ep) %attr(4550,root,netdata) %{_libexecdir}/%{name}/plugins.d/freeipmi.plugin %dir %{_datadir}/%{name} +%defattr(0750,netdata,netdata,0755) + %dir %{_sysconfdir}/%{name}/health.d %dir %{_sysconfdir}/%{name}/python.d %dir %{_sysconfdir}/%{name}/charts.d +%dir %{_sysconfdir}/%{name}/custom-plugins.d +%dir %{_sysconfdir}/%{name}/go.d +%dir %{_sysconfdir}/%{name}/ssl %dir %{_sysconfdir}/%{name}/node.d %dir %{_sysconfdir}/%{name}/statsd.d - -%dir %{_libdir}/%{name}/conf.d/health.d -%dir %{_libdir}/%{name}/conf.d/python.d -%dir %{_libdir}/%{name}/conf.d/charts.d -#%dir %{_libdir}/%{name}/conf.d/node.d -%dir %{_libdir}/%{name}/conf.d/statsd.d +%{_libdir}/%{name}/conf.d/ %if %{with systemd} %{_unitdir}/netdata.service @@ -242,7 +445,31 @@ rm -rf "${RPM_BUILD_ROOT}" %defattr(0644,root,netdata,0755) %{_datadir}/%{name}/web +# Enforce 0660 for files and 0770 for directories +# for the netdata lib, cache and log dirs +%defattr(0660,root,netdata,0770) +%attr(0770,netdata,netdata) %dir %{_localstatedir}/cache/%{name} +%attr(0755,netdata,root) %dir %{_localstatedir}/log/%{name} +%attr(0770,netdata,netdata) %dir %{_localstatedir}/lib/%{name} +%attr(0770,netdata,netdata) %dir %{_localstatedir}/lib/%{name}/registry + + %changelog +* Fri Jun 28 2019 Pavlos Emm. Katsoulakis <paul@netdata.cloud> - 0.0.0-7 +- Raise the path overrides to the spec file level, not just the configure. +- Adjust tighter permissions on some folders, based on what we did on our installer +- Introduce go.d plugin download and install, to include it on the package (Temporarily, to become separate package on next iteration) +* Tue Jun 25 2019 Pavlos Emm. Katsoulakis <paul@netdata.cloud> - 0.0.0-6 +- Adjust dependency list: Some packages are missing on some distros, adopt to build successfully +* Mon Jun 24 2019 Pavlos Emm. 
Katsoulakis <paul@netdata.cloud> - 0.0.0-5 +Another pass on cleaning up pre/post installation steps +- Sync permission and ownership on files and directories +* Sun Jun 16 2019 Pavlos Emm. Katsoulakis <paul@netdata.cloud> - 0.0.0-4 +First draft refactor on package dependencies section +- Remove freeipmi/nfacct plugin flags. We auto-detect all plugins by decision +- Start refactor of package dependencies +- Add missing dependencies, with respect to distro peculiarities +- Adjust existing dependencies, so that distro-specific package names is applied * Wed Jan 02 2019 Pawel Krupa <pkrupa@redhat.com> - 0.0.0-3 - Temporary set version statically - Fix changelog ordering diff --git a/package.json b/package.json index 69f74bcf..2bd614ca 100644 --- a/package.json +++ b/package.json @@ -1,23 +1,57 @@ { "devDependencies": { "coffee-script": "^1.12.7", + "dictionary-en-us": "^2.0.0", + "gaze": "^1.1.2", + "grunt": "^1.0.1", + "grunt-exec": "^2.0.0", "jasmine": "^2.6.0", "jasmine-core": "^2.6.4", + "jasmine-growl-reporter": "^1.0.1", + "jasmine-node": "BrainDoctor/jasmine-node", + "jasmine-reporters": "^2.2.1", "karma": "^1.7.0", "karma-chrome-launcher": "^2.2.0", "karma-coverage": "^1.1.1", "karma-firefox-launcher": "^1.0.1", "karma-jasmine": "^1.1.0", - "walkdir": "^0.0.11", - "underscore": "^1.8.3", - "gaze": "^1.1.2", - "mkdirp": "^0.5.1", "minimist": "^1.2.0", - "jasmine-growl-reporter": "^1.0.1", - "xml2js": "^0.4.17", - "grunt": "^1.0.1", - "grunt-exec": "^2.0.0", - "jasmine-reporters": "^2.2.1", - "jasmine-node": "BrainDoctor/jasmine-node" + "mkdirp": "^0.5.1", + "remark-cli": "^6.0.1", + "remark-frontmatter": "^1.3.1", + "remark-lint-heading-whitespace": "^1.0.0", + "remark-lint-no-dead-urls": "^0.4.1", + "remark-lint-unordered-list-marker-style": "^1.0.2", + "remark-lint-write-good": "^1.1.0", + "remark-preset-lint-consistent": "^2.0.2", + "remark-preset-lint-markdown-style-guide": "^2.1.2", + "remark-preset-lint-recommended": "^3.0.2", + "remark-retext": "^3.1.2", 
+ "remark-stringify": "^6.0.4", + "remark-validate-links": "^8.0.2", + "retext-contractions": "^2.1.3", + "retext-diacritics": "^1.2.2", + "retext-english": "^3.0.2", + "retext-equality": "^3.9.1", + "retext-indefinite-article": "^1.1.6", + "retext-overuse": "^1.1.1", + "retext-passive": "^1.3.2", + "retext-profanities": "^4.6.0", + "retext-quotes": "^2.0.3", + "retext-readability": "^4.2.1", + "retext-redundant-acronyms": "^1.2.2", + "retext-repeated-words": "^1.2.2", + "retext-sentence-spacing": "^2.1.0", + "retext-spell": "^2.4.0", + "retext-syntax-urls": "^1.0.1", + "retext-usage": "^0.5.0", + "underscore": "^1.8.3", + "walkdir": "^0.0.11", + "xml2js": "^0.4.17" + }, + "scripts": { + "lint-md": "remark .", + "lint-md-path": "remark", + "fix-md": "remark collectors --output" } } diff --git a/packaging/docker/README.md b/packaging/docker/README.md index 6ae299f1..0bf416cd 100644 --- a/packaging/docker/README.md +++ b/packaging/docker/README.md @@ -54,10 +54,28 @@ services: ### Docker container names resolution -If you want to have your container names resolved by netdata it needs to have access to docker group. To achive that just add environment variable `PGID=999` to netdata container, where `999` is a docker group id from your host. This number can be found by running: -```bash -grep docker /etc/group | cut -d ':' -f 3 -``` +If you want to have your container names resolved by netdata, you need to do two things: +1) Make netdata user be part of the group that owns the socket. + To achieve that just add environment variable `PGID=[GROUP NUMBER]` to the netdata container, + where `[GROUP NUMBER]` is practically the group id of the group assigned to the docker socket, on your host. 
+ This group number can be found by running the following (if socket group ownership is docker): + ```bash + grep docker /etc/group | cut -d ':' -f 3 + ``` + +2) Change docker socket access level to read/write like so: + from + ``` + /var/run/docker.sock:/var/run/docker.sock:ro + ``` + + change to + ``` + /var/run/docker.sock:/var/run/docker.sock:rw + ``` + +**Important Note**: You should seriously consider the necessity of activating this option, +as it grants to the netdata user access to the privileged socket connection of docker service ### Pass command line options to Netdata diff --git a/packaging/docker/publish.sh b/packaging/docker/publish.sh index 948787b0..fd1883af 100755 --- a/packaging/docker/publish.sh +++ b/packaging/docker/publish.sh @@ -21,6 +21,8 @@ ARCH_MAP=(["i386"]="386" ["amd64"]="amd64" ["armhf"]="arm" ["aarch64"]="arm64") DEVEL_ARCHS=(amd64) ARCHS="${!ARCH_MAP[@]}" DOCKER_CMD="docker --config ${WORKDIR}" +GIT_MAIL=${GIT_MAIL:-"bot@netdata.cloud"} +GIT_USER=${GIT_USER:-"netdatabot"} if [ -z ${REPOSITORY} ]; then REPOSITORY="${TRAVIS_REPO_SLUG}" @@ -37,6 +39,10 @@ if [ ! -z ${DEVEL+x} ]; then declare -a ARCHS=(${DEVEL_ARCHS[@]}) fi +echo "Syncing repository with latest changes (We may have updated with package versions)" +git checkout master +git pull + # Ensure there is a version, the most appropriate one if [ "${VERSION}" == "" ]; then VERSION=$(git tag --points-at) diff --git a/packaging/docker/run.sh b/packaging/docker/run.sh index 243cae8a..2b5047cd 100644..100755 --- a/packaging/docker/run.sh +++ b/packaging/docker/run.sh @@ -1,16 +1,51 @@ -#!/bin/sh - -#set -e +#!/usr/bin/env bash +# +# Entry point script for netdata +# +# Copyright: SPDX-License-Identifier: GPL-3.0-or-later +# +# Author : Pavlos Emm. 
Katsoulakis <paul@netdata.cloud> +set -e +echo "Netdata entrypoint script starting" if [ ${RESCRAMBLE+x} ]; then echo "Reinstalling all packages to get the latest Polymorphic Linux scramble" apk upgrade --update-cache --available fi -if [ ${PGID+x} ]; then - echo "Adding user netdata to group with id ${PGID}" - addgroup -g "${PGID}" -S hostgroup 2>/dev/null - sed -i "s/${PGID}:$/${PGID}:netdata/g" /etc/group +create_group_and_assign_to_user() { + local local_DOCKER_GROUP="$1" + local local_DOCKER_GID="$2" + local local_DOCKER_USR="$3" + + echo >&2 "Adding group with ID ${local_DOCKER_GID} and name '${local_DOCKER_GROUP}'" + addgroup -g "${local_DOCKER_GID}" "${local_DOCKER_GROUP}" || echo >&2 "Could not add group ${local_DOCKER_GROUP} with ID ${local_DOCKER_GID}, its already there probably" + + echo >&2 "Adding user '${local_DOCKER_USR}' to group '${local_DOCKER_GROUP}/${local_DOCKER_GID}'" + sed -i "s/:${local_DOCKER_GID}:$/:${local_DOCKER_GID}:${local_DOCKER_USR}/g" /etc/group + + # Make sure we use the right docker group + GRP_TO_ASSIGN="$(grep ":x:${local_DOCKER_GID}:" /etc/group | cut -d':' -f1)" + if [ -z "${GRP_TO_ASSIGN}" ]; then + echo >&2 "Could not find group ID ${local_DOCKER_GID} in /etc/group. 
Check your logs and report it if this is an unrecovereable error" + else + echo >&2 "Group creation and assignment completed, netdata was assigned to group ${GRP_TO_ASSIGN}/${local_DOCKER_GID}" + echo "${GRP_TO_ASSIGN}" + fi +} + +DOCKER_USR="netdata" +DOCKER_SOCKET="/var/run/docker.sock" +DOCKER_GROUP="docker" + +if [ -S "${DOCKER_SOCKET}" ] && [ -n "${PGID}" ]; then + GRP=$(create_group_and_assign_to_user "${DOCKER_GROUP}" "${PGID}" "${DOCKER_USR}") + if [ -n "${GRP}" ]; then + echo "Adjusting ownership of mapped docker socket '${DOCKER_SOCKET}' to root:${GRP}" + chown "root:${GRP}" "${DOCKER_SOCKET}" || echo "Failed to change ownership on docker socket, container name resolution might not work" + fi fi -exec /usr/sbin/netdata -u netdata -D -s /host -p "${NETDATA_PORT}" "$@" +exec /usr/sbin/netdata -u "${DOCKER_USR}" -D -s /host -p "${NETDATA_PORT}" "$@" + +echo "Netdata entrypoint script, completed!" diff --git a/packaging/go.d.checksums b/packaging/go.d.checksums index ae57b3c7..700bad0a 100644 --- a/packaging/go.d.checksums +++ b/packaging/go.d.checksums @@ -1,16 +1,16 @@ -f851c86df8248e52602e39c3198c9b0d858a70c24c5e5c3fb63d691ede5ae9c6 *config.tar.gz -a27dddfc9a783980375aa1f5c54dcfbaf38044311bd16e0371cffd94a2ebe46e *go.d.plugin-v0.5.0.darwin-386 -1d4815d92860089728944f6b893fea16dc51dd6e47a81e5a7599abfdc73ff2de *go.d.plugin-v0.5.0.darwin-amd64 -a3c76f4b806bf930d344a83b0dc2b3fabe16f747ba89b96eac7fcbdb88c4b058 *go.d.plugin-v0.5.0.freebsd-386 -673f61317b8e6f2b226f30d106cff3532d8a3ee3453997d11f984d76c55831ce *go.d.plugin-v0.5.0.freebsd-amd64 -a352b24578d497b505031b8a84e541532d8f4f2543e3ea454b674dece426982c *go.d.plugin-v0.5.0.freebsd-arm -0a3a4249dd94c2cd4bc0f9ac3e49d5f19ff3a52d91fc4540a17688a4c1b71ce8 *go.d.plugin-v0.5.0.linux-386 -40e034ec19952467b85aebda3c57b823c9e75d799318669c4a811b4296382396 *go.d.plugin-v0.5.0.linux-amd64 -74b955b838939a73455403181cf4be67c8f5d0d313f3da0504a6b47605b22ae0 *go.d.plugin-v0.5.0.linux-arm 
-8d564d5bc689fdf46b63fa9b4d152f8ce84bfad102d358f1d3acd390aebf1c2d *go.d.plugin-v0.5.0.linux-arm64 -dd2c9c4e842248f8d7d0588057507e4b683cc9ebef406886c3a839afbcbdee3f *go.d.plugin-v0.5.0.linux-mips -046e315f82b0dd9fa792a0cd07d25564e768d7d44c7c388f3f432e0d3a98da50 *go.d.plugin-v0.5.0.linux-mips64 -6a05c782d5b8200a51eb5334b9c0750a6d511d442078614729592582ab40da05 *go.d.plugin-v0.5.0.linux-mips64le -0f5427fb451aa34cdc71b2c3d0a2d638f63e8bc60f7cffdf62258fc88048d39e *go.d.plugin-v0.5.0.linux-mipsle -a5d21ed9c9858d9fe24ade24825e5449151e5dd114f9715c26d6c03ad6d70919 *go.d.plugin-v0.5.0.linux-ppc64 -c7ec8b4ae2b94f7689f4a6722a5fac7a8302574e9a906e4b76af70bff624557c *go.d.plugin-v0.5.0.linux-ppc64le +133e138307a52a1c3af5abeec4d368c7bcb27f3398f0f380cfacc23db57b9911 *config.tar.gz +7795ff9058852e9e03ceecd432e5c462ef141b3dd2e1f8e7c3cb13a6c4b685ce *go.d.plugin-v0.7.0.darwin-386 +a8db5312e803376bd96ab3c4cfd6f2d8288795fde97a2aefca7916cd8743f2a4 *go.d.plugin-v0.7.0.darwin-amd64 +a130a6aa7a98d37b648d41f8c3f0b939bfb8f343d1a3a6c8267a7fe604aae96f *go.d.plugin-v0.7.0.freebsd-386 +078c8a9607aea92ee8346cb2567a73b2a2ac317ea72c6975de07b47fdba2de80 *go.d.plugin-v0.7.0.freebsd-amd64 +bf7bff1f6fa32055242b627534ec5936fa1b8eb2f42edc736bbe041bee11129e *go.d.plugin-v0.7.0.freebsd-arm +2e15dc67736b29cf736ad7a05271f462467f84e80073fd1a7084dd5e2ac83115 *go.d.plugin-v0.7.0.linux-386 +3b0b5b0faa319201ecac554cb300789546b7f51847d202ff913e29339acca48b *go.d.plugin-v0.7.0.linux-amd64 +1be3860bea67e2ac789a37bf4dae24f8925f93bebe72a57cc2218c9e9a702f19 *go.d.plugin-v0.7.0.linux-arm +cba7cbfeda2e5146c8229d455aaf61f29f196d24291a509f4bf36ae12a2729e7 *go.d.plugin-v0.7.0.linux-arm64 +5f263cd5a032149618483a50486ce69c6e1a32b7e568c498d42b4d94691167f5 *go.d.plugin-v0.7.0.linux-mips +9558e7aa633331afea78c682a15fc9e6cf10ed39fb4c26f03034a7b0cbdfcc1a *go.d.plugin-v0.7.0.linux-mips64 +0f93f4cac9b21cdb28ef88b9f1ba42afcc1e913c0227deb266440c205ff9a224 *go.d.plugin-v0.7.0.linux-mips64le 
+51c0763f07de48e9f9dd9625a647aacecdd4a1bd39f13298b4f7c123436f4327 *go.d.plugin-v0.7.0.linux-mipsle +7e7e53fff1852c9756d6117d35d1f061a8bd97135b231b010ad1461e789b1f66 *go.d.plugin-v0.7.0.linux-ppc64 +6d4203f9c4d5778add09ef2679dc025a72914b68dce5fb816e7cc38f4f36945f *go.d.plugin-v0.7.0.linux-ppc64le diff --git a/packaging/installer/README.md b/packaging/installer/README.md index 6dc084e8..b10ffa05 100644 --- a/packaging/installer/README.md +++ b/packaging/installer/README.md @@ -20,6 +20,8 @@ The best way to install Netdata is directly from source. Our **automatic install See also the list of Netdata [package maintainers](../maintainers) for ASUSTOR NAS, OpenWRT, ReadyNAS, etc. +Note: From Netdata v1.12 and above, anonymous usage information is collected by default and sent to Google Analytics. To read more about the information collected and how to opt-out, check the [anonymous statistics page](../../docs/anonymous-statistics.md). + --- ## One line installation @@ -42,7 +44,7 @@ bash <(curl -Ss https://my-netdata.io/kickstart.sh) Verify the integrity of the script with this: ```bash -[ "fe451cd039c8f99b2ba4ca0feab88033" = "$(curl -Ss https://my-netdata.io/kickstart.sh | md5sum | cut -d ' ' -f 1)" ] && echo "OK, VALID" || echo "FAILED, INVALID" +[ "8a2b054081a108dff915994ce77f2f2d" = "$(curl -Ss https://my-netdata.io/kickstart.sh | md5sum | cut -d ' ' -f 1)" ] && echo "OK, VALID" || echo "FAILED, INVALID" ``` *It should print `OK, VALID` if the script is the one we ship.* @@ -99,7 +101,7 @@ To install Netdata with a binary package on any Linux distro, any kernel version Verify the integrity of the script with this: ```bash -[ "9ff4f5f37d23dff431f80d5349e0a25c" = "$(curl -Ss https://my-netdata.io/kickstart-static64.sh | md5sum | cut -d ' ' -f 1)" ] && echo "OK, VALID" || echo "FAILED, INVALID" +[ "8779d8717ccaa8dac18d599502eef591" = "$(curl -Ss https://my-netdata.io/kickstart-static64.sh | md5sum | cut -d ' ' -f 1)" ] && echo "OK, VALID" || echo "FAILED, INVALID" ``` 
*It should print `OK, VALID` if the script is the one we ship.* @@ -163,13 +165,25 @@ To install the latest git version of Netdata, please follow these 2 steps: Try our experimental automatic requirements installer (no need to be root). This will try to find the packages that should be installed on your system to build and run Netdata. It supports most major Linux distributions released after 2010: -- **Alpine** Linux and its derivatives (you have to install `bash` yourself, before using the installer) -- **Arch** Linux and its derivatives -- **Gentoo** Linux and its derivatives -- **Debian** Linux and its derivatives (including **Ubuntu**, **Mint**) -- **Fedora** and its derivatives (including **Red Hat Enterprise Linux**, **CentOS**, **Amazon Machine Image**) -- **SuSe** Linux and its derivatives (including **openSuSe**) -- **SLE12** Must have your system registered with Suse Customer Center or have the DVD. See [#1162](https://github.com/netdata/netdata/issues/1162) +* **Alpine** Linux and its derivatives + * You have to install `bash` yourself, before using the installer. + +* **Arch** Linux and its derivatives + * You need arch/aur for package Judy. + +* **Gentoo** Linux and its derivatives + +* **Debian** Linux and its derivatives (including **Ubuntu**, **Mint**) + +* **Red Hat Enterprise Linux** and its derivatives (including **Fedora**, **CentOS**, **Amazon Machine Image**) + * Please note that for RHEL/CentOS you need + [EPEL](http://www.tecmint.com/how-to-enable-epel-repository-for-rhel-centos-6-5/). + In addition, RHEL/CentOS version 6 also needs + [OKay](https://okay.com.mx/blog-news/rpm-repositories-for-centos-6-and-7.html) for package libuv version 1. + +* **SuSe** Linux and its derivatives (including **openSuSe**) + +* **SLE12** Must have your system registered with Suse Customer Center or have the DVD.
See [#1162](https://github.com/netdata/netdata/issues/1162) Install the packages for having a **basic Netdata installation** (system monitoring and many applications, without `mysql` / `mariadb`, `postgres`, `named`, hardware sensors and `SNMP`): @@ -199,9 +213,10 @@ dnf install zlib-devel libuuid-devel libuv-devel lz4-devel Judy-devel openssl-de # CentOS / Red Hat Enterprise Linux yum install autoconf automake curl gcc git libmnl-devel libuuid-devel openssl-devel libuv-devel lz4-devel Judy-devel lm_sensors make MySQL-python nc pkgconfig python python-psycopg2 PyYAML zlib-devel -``` +# openSUSE +zypper install zlib-devel libuuid-devel libuv-devel liblz4-devel judy-devel libopenssl-devel libmnl-devel gcc make git autoconf autoconf-archive autogen automake pkgconfig curl findutils -Please note that for RHEL/CentOS you might need [EPEL](http://www.tecmint.com/how-to-enable-epel-repository-for-rhel-centos-6-5/). +``` Once Netdata is compiled, to run it the following packages are required (already installed using the above commands): diff --git a/packaging/installer/functions.sh b/packaging/installer/functions.sh index d1e94487..6f999690 100644 --- a/packaging/installer/functions.sh +++ b/packaging/installer/functions.sh @@ -303,7 +303,7 @@ install_non_systemd_init() { run rc-update add netdata default && return 0 - elif [ "${key}" = "debian-7" ] || [ "${key}" = "ubuntu-12.04" ] || [ "${key}" = "ubuntu-14.04" ]; then + elif [ "${key}" =~ ^devuan* ] || [ "${key}" = "debian-7" ] || [ "${key}" = "ubuntu-12.04" ] || [ "${key}" = "ubuntu-14.04" ]; then echo >&2 "Installing LSB init file..." 
run cp system/netdata-lsb /etc/init.d/netdata && run chmod 755 /etc/init.d/netdata && @@ -332,6 +332,8 @@ install_non_systemd_init() { NETDATA_START_CMD="netdata" NETDATA_STOP_CMD="killall netdata" +NETDATA_INSTALLER_START_CMD="${NETDATA_START_CMD}" +NETDATA_INSTALLER_STOP_CMD="${NETDATA_STOP_CMD}" install_netdata_service() { local uname="$(uname 2>/dev/null)" @@ -351,15 +353,23 @@ install_netdata_service() { elif [ "${uname}" = "FreeBSD" ]; then - run cp system/netdata-freebsd /etc/rc.d/netdata && - NETDATA_START_CMD="service netdata start" && - NETDATA_STOP_CMD="service netdata stop" && - return 0 + run cp system/netdata-freebsd /etc/rc.d/netdata && NETDATA_START_CMD="service netdata start" && + NETDATA_STOP_CMD="service netdata stop" && + NETDATA_INSTALLER_START_CMD="service netdata onestart" && + NETDATA_INSTALLER_STOP_CMD="${NETDATA_STOP_CMD}" + myret=$? + + echo >&2 "Note: To explicitly enable netdata automatic start, set 'netdata_enable' to 'YES' in /etc/rc.conf" + echo >&2 "" + + return ${myret} elif issystemd; then # systemd is running on this system NETDATA_START_CMD="systemctl start netdata" NETDATA_STOP_CMD="systemctl stop netdata" + NETDATA_INSTALLER_START_CMD="${NETDATA_START_CMD}" + NETDATA_INSTALLER_STOP_CMD="${NETDATA_STOP_CMD}" SYSTEMD_DIRECTORY="" @@ -390,6 +400,8 @@ install_netdata_service() { NETDATA_START_CMD="rc-service netdata start" NETDATA_STOP_CMD="rc-service netdata stop" fi + NETDATA_INSTALLER_START_CMD="${NETDATA_START_CMD}" + NETDATA_INSTALLER_STOP_CMD="${NETDATA_STOP_CMD}" fi return ${ret} @@ -429,6 +441,7 @@ stop_netdata_on_pid() { ret=$? test ${ret} -eq 0 && printf >&2 "." && sleep 2 + done echo >&2 @@ -446,8 +459,6 @@ netdata_pids() { myns="$(readlink /proc/self/ns/pid 2>/dev/null)" - # echo >&2 "Stopping a (possibly) running netdata (namespace '${myns}')..." 
- for p in \ $(cat /var/run/netdata.pid 2>/dev/null) \ $(cat /var/run/netdata/netdata.pid 2>/dev/null) \ @@ -477,12 +488,15 @@ restart_netdata() { local started=0 - progress "Start netdata" + progress "Restarting netdata instance" if [ "${UID}" -eq 0 ]; then - service netdata stop - stop_all_netdata - service netdata restart && started=1 + echo >&2 + echo >&2 "Stopping all netdata threads" + run stop_all_netdata + + echo >&2 "Starting netdata using command '${NETDATA_INSTALLER_START_CMD}'" + run ${NETDATA_INSTALLER_START_CMD} && started=1 if [ ${started} -eq 1 ] && [ -z "$(netdata_pids)" ]; then echo >&2 "Ooops! it seems netdata is not started." @@ -490,7 +504,8 @@ restart_netdata() { fi if [ ${started} -eq 0 ]; then - service netdata start && started=1 + echo >&2 "Attempting another netdata start using command '${NETDATA_INSTALLER_START_CMD}'" + run ${NETDATA_INSTALLER_START_CMD} && started=1 fi fi @@ -500,8 +515,8 @@ restart_netdata() { fi if [ ${started} -eq 0 ]; then - # still not started... - + # still not started... another forced attempt, just run the binary + echo >&2 "Netdata service still not started, attempting another forced restart by running '${netdata} ${@}'" run stop_all_netdata run "${netdata}" "${@}" return $? 
diff --git a/packaging/installer/kickstart-static64.sh b/packaging/installer/kickstart-static64.sh index 50517905..a9f11238 100755 --- a/packaging/installer/kickstart-static64.sh +++ b/packaging/installer/kickstart-static64.sh @@ -127,7 +127,7 @@ download() { } set_tarball_urls() { - if [ "$1" == "stable" ]; then + if [ "$1" = "stable" ]; then local latest # Simple version # latest="$(curl -sSL https://api.github.com/repos/netdata/netdata/releases/latest | grep tag_name | cut -d'"' -f4)" diff --git a/packaging/installer/kickstart.sh b/packaging/installer/kickstart.sh index 2db95f21..d396f139 100755 --- a/packaging/installer/kickstart.sh +++ b/packaging/installer/kickstart.sh @@ -141,7 +141,7 @@ warning() { create_tmp_directory() { # Check if tmp is mounted as noexec - if grep -Eq '^[^ ]+ /tmp [^ ]+ ([^ ]*,)?noexec[, ]' /proc/mounts; then + if grep -Eq '^[^ ]+ /tmp [^ ]+ ([^ ]*,)?noexec[, ]' /proc/mounts > /dev/null 2>&1; then pattern="$(pwd)/netdata-kickstart-XXXXXX" else pattern="/tmp/netdata-kickstart-XXXXXX" @@ -163,7 +163,7 @@ download() { } set_tarball_urls() { - if [ "$1" == "stable" ]; then + if [ "$1" = "stable" ]; then local latest # Simple version # latest="$(curl -sSL https://api.github.com/repos/netdata/netdata/releases/latest | grep tag_name | cut -d'"' -f4)" @@ -200,9 +200,9 @@ detect_bash4() { } dependencies() { - SYSTEM="$(uname -s)" - OS="$(uname -o)" - MACHINE="$(uname -m)" + SYSTEM="$(uname -s 2> /dev/null || uname -v)" + OS="$(uname -o 2> /dev/null || uname -rs)" + MACHINE="$(uname -m 2> /dev/null)" echo "System : ${SYSTEM}" echo "Operating System : ${OS}" diff --git a/packaging/installer/netdata-uninstaller.sh b/packaging/installer/netdata-uninstaller.sh index cfd858c0..0bbdaac2 100755 --- a/packaging/installer/netdata-uninstaller.sh +++ b/packaging/installer/netdata-uninstaller.sh @@ -232,7 +232,7 @@ quit_msg() { user_input() { TEXT="$1" - if [ "${INTERACTIVITY}" == "-i" ]; then + if [ "${INTERACTIVITY}" = "-i" ]; then read -r -p "$TEXT" >&2 
fi } diff --git a/packaging/installer/netdata-updater.sh b/packaging/installer/netdata-updater.sh index 21a769ba..83031f3a 100755 --- a/packaging/installer/netdata-updater.sh +++ b/packaging/installer/netdata-updater.sh @@ -73,7 +73,7 @@ set_tarball_urls() { return fi - if [ "$1" == "stable" ]; then + if [ "$1" = "stable" ]; then local latest # Simple version # latest="$(curl -sSL https://api.github.com/repos/netdata/netdata/releases/latest | grep tag_name | cut -d'"' -f4)" @@ -95,7 +95,7 @@ update() { if [ -z "${NETDATA_LOCAL_TARBAL_OVERRIDE}" ]; then download "${NETDATA_TARBALL_CHECKSUM_URL}" "${tmpdir}/sha256sum.txt" >&3 2>&3 - if grep "${NETDATA_TARBALL_CHECKSUM}" sha256sum.txt >&3 2>&3; then + if [[ -n "${NETDATA_TARBALL_CHECKSUM}" ]] && grep "${NETDATA_TARBALL_CHECKSUM}" sha256sum.txt >&3 2>&3; then info "Newest version is already installed" else download "${NETDATA_TARBALL_URL}" "${tmpdir}/netdata-latest.tar.gz" diff --git a/packaging/makeself/install-or-update.sh b/packaging/makeself/install-or-update.sh index fc4e6d07..165e7920 100755 --- a/packaging/makeself/install-or-update.sh +++ b/packaging/makeself/install-or-update.sh @@ -175,7 +175,7 @@ fi progress "create user config directories" -for x in "python.d" "charts.d" "node.d" "health.d" "statsd.d" +for x in "python.d" "charts.d" "node.d" "health.d" "statsd.d" "custom-plugins.d" "ssl" do if [ ! -d "etc/netdata/${x}" ] then diff --git a/packaging/version b/packaging/version index 440ddd8f..a406138e 100644 --- a/packaging/version +++ b/packaging/version @@ -1 +1 @@ -v1.15.0 +v1.16.0 diff --git a/registry/README.md b/registry/README.md index 5a9a2b3b..73890807 100644 --- a/registry/README.md +++ b/registry/README.md @@ -1,7 +1,11 @@ # Registry -Netdata registry implements the `my-netdata` menu on netdata dashboards. -The `my-netdata` menu lists the netdata servers you have visited. 
+The Netdata registry implements the node menu on the top left corner of the netdata dashboards and enables the Netdata cloud features, such as the node view. +The node menu lists the netdata servers you have visited. The node view offers a lot of additional features on top of the menu, +[with many more to come](https://blog.netdata.cloud/posts/netdata-cloud-announcement/). +To enable the global Netdata registry and the cloud features, you need to Sign In to Netdata cloud. By signing in, you opt in to let the registry receive and store +the information described [here](#what-data-does-the-registry-store). +You can still get the node menu, but not the cloud features, if you [run your own registry](#run-your-own-registry). ## Why? @@ -26,11 +30,13 @@ Using netdata, your monitoring infrastructure is embedded on each server, limiti However, the netdata approach introduces a few new issues that need to be addressed, one being **the list of netdata we have installed**, i.e. the URLs our netdata servers are listening. -To solve this, netdata utilizes a **central registry**. This registry, together with certain browser features, allow netdata to provide unified cross-server dashboards. For example, when you jump from server to server using the `my-netdata` menu, several session settings (like the currently viewed charts, the current zoom and pan operations on the charts, etc.) are propagated to the new server, so that the new dashboard will come with exactly the same view. +To solve this, netdata utilizes a **central registry**. This registry, together with certain browser features, allows netdata to provide unified cross-server dashboards. +For example, when you jump from server to server using the node menu, several session settings (like the currently viewed charts, the current zoom and pan operations on the charts, etc.) are propagated to the new server, so that the new dashboard will come with exactly the same view.
+Netdata cloud has a roadmap to [offer many more features](https://blog.netdata.cloud/posts/netdata-cloud-announcement/) over and above the simple node menu. -## What is the registry? +## What data does the registry store? -The registry keeps track of 3 entities: +The registry keeps track of 4 entities: 1. **machines**: i.e. the netdata installations (a random GUID generated by each netdata the first time it starts; we call this **machine_guid**) @@ -38,12 +44,17 @@ The registry keeps track of 3 entities: 2. **persons**: i.e. the web browsers accessing the netdata installations (a random GUID generated by the registry the first time it sees a new web browser; we call this **person_guid**) - For each person, the registry keeps track of the netdata installations it has accessed and their URLs. + For each person, the registry keeps track of the netdata installations it has accessed and their URLs. 3. **URLs** of netdata installations (as seen by the web browsers) For each URL, the registry keeps the URL and nothing more. Each URL is linked to *persons* and *machines*. The only way to find a URL is to know its **machine_guid** or have a **person_guid** it is linked to it. +4. **accounts**: i.e. the information used to sign-in via one of the available sign-in methods. Depending on the method, this may include an email, an email and a profile picture. + +For *persons*/*accounts* and *machines*, the registry keeps links to *URLs*, each link with 2 timestamps (first time seen, last time seen) and a counter (number of times it has been seen). +*machines*, *persons* and timestamps are stored in the netdata registry regardless of whether you sign in or not. + ## Who talks to the registry? Your web browser **only**! If sending this information is against your policies, you can [run your own registry](#run-your-own-registry) @@ -52,19 +63,11 @@ Your netdata servers do not talk to the registry. 
This is a UML diagram of its o ![registry](https://cloud.githubusercontent.com/assets/2662304/19448565/11a70632-94ab-11e6-9d80-f410b4acb797.png) -## What data does the registry store? - -Its database contains: - -- **random person GUIDs** (generated by the registry as a browser cookie) -- **random machine GUIDs** (generated by each netdata server on its first run), including the hostname of the server netdata is running (without the domain) -- **URLs** (the base URL for accessing a netdata server, as seen by the web browser) - -For *persons* and *machines*, the registry keeps links to *URLs*, each link with 2 timestamps (first time seen, last time seen) and a counter (number of times it has been seen). ## Which is the default registry? `https://registry.my-netdata.io`, which is currently served by `https://london.my-netdata.io`. This registry listens to both HTTP and HTTPS requests but the default is HTTPS. +`https://netdata.cloud` is the additional registry endpoint, that enables [the cloud features](https://blog.netdata.cloud/posts/netdata-cloud-announcement/). It only accepts HTTPS. ### Can this registry handle the global load of netdata installations? @@ -98,14 +101,14 @@ Note that we have not enabled the registry on the other servers. Only one netdat This is it. You have your registry now. -You may also want to give your server different names under the **my-netdata** menu (i.e. to have them sorted / grouped). You can change its registry name, by setting on each netdata server: +You may also want to give your server different names under the node menu (i.e. to have them sorted / grouped). You can change its registry name, by setting on each netdata server: ``` [registry] registry hostname = Group1 - Master DB ``` -So this server will appear in **my-netdata** as `Group1 - Master DB`. The max name length is 50 characters. +So this server will appear in the node menu as `Group1 - Master DB`. The max name length is 50 characters. 
### Limiting access to the registry diff --git a/registry/registry_init.c b/registry/registry_init.c index 3cf140de..e5e66682 100644 --- a/registry/registry_init.c +++ b/registry/registry_init.c @@ -43,6 +43,7 @@ int registry_init(void) { // netdata.cloud configuration, if cloud_base_url == "", cloud functionality is disabled. registry.cloud_base_url = config_get(CONFIG_SECTION_CLOUD, "cloud base url", "https://netdata.cloud"); + setenv("NETDATA_REGISTRY_CLOUD_BASE_URL", registry.cloud_base_url, 1); setenv("NETDATA_REGISTRY_HOSTNAME", registry.hostname, 1); setenv("NETDATA_REGISTRY_URL", registry.registry_to_announce, 1); diff --git a/streaming/README.md b/streaming/README.md index 0ad9d7e2..3e58f1f0 100644 --- a/streaming/README.md +++ b/streaming/README.md @@ -18,7 +18,7 @@ a netdata performs: Local netdata (`slave`), **without any database or alarms**, collects metrics and sends them to another netdata (`master`). -The `my-netdata` menu shows a list of all "databases streamed to" the master. Clicking one of those links allows the user to view the full dashboard of the `slave` netdata. The URL has the form http://master-host:master-port/host/slave-host/. +The node menu shows a list of all "databases streamed to" the master. Clicking one of those links allows the user to view the full dashboard of the `slave` netdata. The URL has the form http://master-host:master-port/host/slave-host/. Alarms for the `slave` are served by the `master`. @@ -41,6 +41,8 @@ The `slave` and the `master` may have different data retention policies for the Alarms for the `slave` are triggered by **both** the `slave` and the `master` (and actually each can have different alarms configurations or have alarms disabled). +Note that custom chart names, configured on the `slave`, should be in the form `type.name` to work correctly. The `master` will truncate the `type` part and substitute the original chart `type` to store the name in the database. 
+ ### netdata proxies Local netdata (`slave`), with or without a database, collects metrics and sends them to another @@ -81,14 +83,14 @@ monitoring (there cannot be health monitoring without a database). ``` [web] - mode = none | static-threaded - accept a streaming request every seconds = 0 + mode = none | static-threaded + accept a streaming request every seconds = 0 ``` `[web].mode = none` disables the API (netdata will not listen to any ports). This also disables the registry (there cannot be a registry without an API). -`accept a streaming request every seconds` can be used to set a limit on how often a master Netdata server will accept streaming requests from the slaves. 0 sets no limit, 1 means maximum once every second. If this is set, you may see error log entries "... too busy to accept new streaming request. Will be allowed in X secs". +`accept a streaming request every seconds` can be used to set a limit on how often a master Netdata server will accept streaming requests from the slaves. 0 sets no limit, 1 means maximum once every second. If this is set, you may see error log entries "... too busy to accept new streaming request. Will be allowed in X secs". ``` [backend] @@ -123,7 +125,7 @@ a `proxy`). ``` [stream] enabled = yes | no - destination = IP:PORT ... + destination = IP:PORT[:SSL] ... api key = XXXXXXXXXXX ``` @@ -136,6 +138,8 @@ headless proxy|`none`|not `none`|`yes`|only for `data source = as collected`|not proxy with db|not `none`|not `none`|`yes`|possible|possible|yes central netdata|not `none`|not `none`|`no`|possible|possible|yes +For the options to encrypt the data stream between the slave and the master, refer to [securing the communication](#securing-the-communication) + ##### options for the receiving node `stream.conf` looks like this: @@ -209,11 +213,46 @@ The receiving end (`proxy` or `master`) logs entries like these: For netdata v1.9+, streaming can also be monitored via `access.log`. 
+### Securing the communication + +Netdata does not activate TLS encryption by default. To encrypt the connection, you first need to [enable TLS support](../web/server/#enabling-tls-support) on the master. With encryption enabled on the receiving side, we need to instruct the slave to use SSL as well. On the slave's `stream.conf`, configure the destination as follows: + +``` +[stream] + destination = host:port:SSL +``` + +The word SSL appended to the end of the destination tells the slave that the connection must be encrypted. + +#### Certificate verification + +When SSL is enabled on the slave, the default behavior is to not connect to the master unless the server's certificate can be verified via the default chain. In case you want to avoid this check, add to the slave's `stream.conf` the following: + +``` +[stream] + ssl skip certificate verification = yes +``` + +#### Expected behaviors + +With the introduction of SSL, the master-slave communication behaves as shown in the table below, depending on the following configurations: +- Master TLS (Yes/No): Whether the `[web]` section in `netdata.conf` has `ssl key` and `ssl certificate`. +- Master port SSL (-/force/optional): Depends on whether the `[web]` section `bind to` contains a `^SSL=force` or `^SSL=optional` directive on the port(s) used for streaming. +- Slave TLS (Yes/No): Whether the destination in the slave's `stream.conf` has `:SSL` at the end. +- Slave SSL Verification (yes/no): Value of the slave's `stream.conf` `ssl skip certificate verification` parameter (default is no). + + Master TLS enabled | Master port SSL | Slave TLS | Slave SSL Ver. | Behavior +:------:|:-----:|:-----:|:-----:|:-------- +No | - | No | no | Legacy behavior. The master-slave stream is unencrypted. +Yes | force | No | no | The master rejects the slave connection. 
+Yes | -/optional | No | no | The master-slave stream is unencrypted (expected situation for legacy slaves and newer masters) +Yes | -/force/optional | Yes | no | The master-slave stream is encrypted, provided that the master has a valid SSL certificate. Otherwise, the slave refuses to connect. +Yes | -/force/optional | Yes | yes | The master-slave stream is encrypted. ## Viewing remote host dashboards, using mirrored databases On any receiving netdata, that maintains remote databases and has its web server enabled, -`my-netdata` menu will include a list of the mirrored databases. +The node menu will include a list of the mirrored databases. ![image](https://cloud.githubusercontent.com/assets/2662304/24080824/24cd2d3c-0caf-11e7-909d-a8dd1dbb95d7.png) @@ -289,13 +328,13 @@ On the master, edit `/etc/netdata/stream.conf` (to edit it on your system run `/ [11111111-2222-3333-4444-555555555555] # enable/disable this API key enabled = yes - + # one hour of data for each of the slaves default history = 3600 - + # do not save slave metrics on disk default memory = ram - + # alarms checks, only while the slave is connected health enabled by default = auto ``` @@ -305,6 +344,10 @@ If you used many API keys, you can add one such section for each API key. When done, restart netdata on the `master` node. It is now ready to receive metrics. +Note that `health enabled by default = auto` will still trigger `last_collected` alarms, if a connected slave does not exit gracefully. If the netdata running on the slave is +stopped, it will close the connection to the master, ensuring that no `last_collected` alarms are triggered. For example, a proper container restart would first terminate +the netdata process, but a system power issue would leave the connection open on the master side. In the second case, you will still receive alarms. 
+ #### Configuring the `slaves` On each of the slaves, edit `/etc/netdata/stream.conf` (to edit it on your system run `/etc/netdata/edit-config stream.conf`) and set these: @@ -313,10 +356,10 @@ On each of the slaves, edit `/etc/netdata/stream.conf` (to edit it on your syste [stream] # stream metrics to another netdata enabled = yes - + # the IP and PORT of the master destination = 10.11.12.13:19999 - + # the API key to use api key = 11111111-2222-3333-4444-555555555555 ``` diff --git a/streaming/rrdpush.c b/streaming/rrdpush.c index 2e9050ff..954b1d7d 100644 --- a/streaming/rrdpush.c +++ b/streaming/rrdpush.c @@ -79,6 +79,25 @@ int rrdpush_init() { default_rrdpush_enabled = 0; } +#ifdef ENABLE_HTTPS + if (netdata_use_ssl_on_stream == NETDATA_SSL_OPTIONAL) { + if (default_rrdpush_destination){ + char *test = strstr(default_rrdpush_destination,":SSL"); + if(test){ + *test = 0X00; + netdata_use_ssl_on_stream = NETDATA_SSL_FORCE; + } + } + } + char *invalid_certificate = appconfig_get(&stream_config, CONFIG_SECTION_STREAM, "ssl skip certificate verification", "no"); + if ( !strcmp(invalid_certificate,"yes")){ + if (netdata_validate_server == NETDATA_SSL_VALID_CERTIFICATE){ + info("The Netdata is configured to accept invalid certificate."); + netdata_validate_server = NETDATA_SSL_INVALID_CERTIFICATE; + } + } +#endif + return default_rrdpush_enabled; } @@ -414,6 +433,7 @@ static inline void rrdpush_sender_thread_close_socket(RRDHOST *host) { } } +//called from client side static int rrdpush_sender_thread_connect_to_master(RRDHOST *host, int default_port, int timeout, size_t *reconnects_counter, char *connected_to, size_t connected_to_size) { struct timeval tv = { .tv_sec = timeout, @@ -442,9 +462,38 @@ static int rrdpush_sender_thread_connect_to_master(RRDHOST *host, int default_po info("STREAM %s [send to %s]: initializing communication...", host->hostname, connected_to); +#ifdef ENABLE_HTTPS + if( netdata_client_ctx ){ + host->ssl.flags = NETDATA_SSL_START; + if 
(!host->ssl.conn){ + host->ssl.conn = SSL_new(netdata_client_ctx); + if(!host->ssl.conn){ + error("Failed to allocate SSL structure."); + host->ssl.flags = NETDATA_SSL_NO_HANDSHAKE; + } + } + else{ + SSL_clear(host->ssl.conn); + } + + if (host->ssl.conn) + { + if (SSL_set_fd(host->ssl.conn, host->rrdpush_sender_socket) != 1) { + error("Failed to set the socket to the SSL on socket fd %d.", host->rrdpush_sender_socket); + host->ssl.flags = NETDATA_SSL_NO_HANDSHAKE; + } else{ + host->ssl.flags = NETDATA_SSL_HANDSHAKE_COMPLETE; + } + } + } + else { + host->ssl.flags = NETDATA_SSL_NO_HANDSHAKE; + } +#endif + #define HTTP_HEADER_SIZE 8192 char http[HTTP_HEADER_SIZE + 1]; - snprintfz(http, HTTP_HEADER_SIZE, + int eol = snprintfz(http, HTTP_HEADER_SIZE, "STREAM key=%s&hostname=%s®istry_hostname=%s&machine_guid=%s&update_every=%d&os=%s&timezone=%s&tags=%s" "&NETDATA_SYSTEM_OS_NAME=%s" "&NETDATA_SYSTEM_OS_ID=%s" @@ -486,8 +535,39 @@ static int rrdpush_sender_thread_connect_to_master(RRDHOST *host, int default_po , host->program_name , host->program_version ); - + http[eol] = 0x00; + +#ifdef ENABLE_HTTPS + if (!host->ssl.flags) { + ERR_clear_error(); + SSL_set_connect_state(host->ssl.conn); + int err = SSL_connect(host->ssl.conn); + if (err != 1){ + err = SSL_get_error(host->ssl.conn, err); + error("SSL cannot connect with the server: %s ",ERR_error_string((long)SSL_get_error(host->ssl.conn,err),NULL)); + if (netdata_use_ssl_on_stream == NETDATA_SSL_FORCE) { + rrdpush_sender_thread_close_socket(host); + return 0; + }else { + host->ssl.flags = NETDATA_SSL_NO_HANDSHAKE; + } + } + else { + if (netdata_use_ssl_on_stream == NETDATA_SSL_FORCE) { + if (netdata_validate_server == NETDATA_SSL_VALID_CERTIFICATE) { + if ( security_test_certificate(host->ssl.conn)) { + error("Closing the stream connection, because the server SSL certificate is not valid."); + rrdpush_sender_thread_close_socket(host); + return 0; + } + } + } + } + } + 
if(send_timeout(&host->ssl,host->rrdpush_sender_socket, http, strlen(http), 0, timeout) == -1) { +#else if(send_timeout(host->rrdpush_sender_socket, http, strlen(http), 0, timeout) == -1) { +#endif error("STREAM %s [send to %s]: failed to send HTTP header to remote netdata.", host->hostname, connected_to); rrdpush_sender_thread_close_socket(host); return 0; @@ -495,7 +575,11 @@ static int rrdpush_sender_thread_connect_to_master(RRDHOST *host, int default_po info("STREAM %s [send to %s]: waiting response from remote netdata...", host->hostname, connected_to); +#ifdef ENABLE_HTTPS + if(recv_timeout(&host->ssl,host->rrdpush_sender_socket, http, HTTP_HEADER_SIZE, 0, timeout) == -1) { +#else if(recv_timeout(host->rrdpush_sender_socket, http, HTTP_HEADER_SIZE, 0, timeout) == -1) { +#endif error("STREAM %s [send to %s]: remote netdata does not respond.", host->hostname, connected_to); rrdpush_sender_thread_close_socket(host); return 0; @@ -565,6 +649,12 @@ void *rrdpush_sender_thread(void *ptr) { return NULL; } +#ifdef ENABLE_HTTPS + if (netdata_use_ssl_on_stream & NETDATA_SSL_FORCE ){ + security_start_ssl(NETDATA_SSL_CONTEXT_STREAMING); + } +#endif + info("STREAM %s [send]: thread created (task id %d)", host->hostname, gettid()); int timeout = (int)appconfig_get_number(&stream_config, CONFIG_SECTION_STREAM, "timeout seconds", 60); @@ -852,6 +942,9 @@ static int rrdpush_receive(int fd , int update_every , char *client_ip , char *client_port +#ifdef ENABLE_HTTPS + , struct netdata_ssl *ssl +#endif ) { RRDHOST *host; int history = default_rrd_history_entries; @@ -965,7 +1058,11 @@ static int rrdpush_receive(int fd snprintfz(cd.cmd, PLUGINSD_CMD_MAX, "%s:%s", client_ip, client_port); info("STREAM %s [receive from [%s]:%s]: initializing communication...", host->hostname, client_ip, client_port); +#ifdef ENABLE_HTTPS + if(send_timeout(ssl,fd, START_STREAMING_PROMPT, strlen(START_STREAMING_PROMPT), 0, 60) != strlen(START_STREAMING_PROMPT)) { +#else if(send_timeout(fd, 
START_STREAMING_PROMPT, strlen(START_STREAMING_PROMPT), 0, 60) != strlen(START_STREAMING_PROMPT)) { +#endif log_stream_connection(client_ip, client_port, key, host->machine_guid, host->hostname, "FAILED - CANNOT REPLY"); error("STREAM %s [receive from [%s]:%s]: cannot send ready command.", host->hostname, client_ip, client_port); close(fd); @@ -1058,6 +1155,9 @@ struct rrdpush_thread { char *program_version; struct rrdhost_system_info *system_info; int update_every; +#ifdef ENABLE_HTTPS + struct netdata_ssl ssl; +#endif }; static void rrdpush_receiver_thread_cleanup(void *ptr) { @@ -1079,8 +1179,13 @@ static void rrdpush_receiver_thread_cleanup(void *ptr) { freez(rpt->client_port); freez(rpt->program_name); freez(rpt->program_version); - rrdhost_system_info_free(rpt->system_info); +#ifdef ENABLE_HTTPS + if(rpt->ssl.conn){ + SSL_free(rpt->ssl.conn); + } +#endif freez(rpt); + } } @@ -1105,6 +1210,9 @@ static void *rrdpush_receiver_thread(void *ptr) { , rpt->update_every , rpt->client_ip , rpt->client_port +#ifdef ENABLE_HTTPS + , &rpt->ssl +#endif ); netdata_thread_cleanup_pop(1); @@ -1295,6 +1403,13 @@ int rrdpush_receiver_thread_spawn(RRDHOST *host, struct web_client *w, char *url rpt->client_port = strdupz(w->client_port); rpt->update_every = update_every; rpt->system_info = system_info; +#ifdef ENABLE_HTTPS + rpt->ssl.conn = w->ssl.conn; + rpt->ssl.flags = w->ssl.flags; + + w->ssl.conn = NULL; + w->ssl.flags = NETDATA_SSL_START; +#endif if(w->user_agent && w->user_agent[0]) { char *t = strchr(w->user_agent, '/'); diff --git a/streaming/stream.conf b/streaming/stream.conf index d0d02a7c..0d360cc2 100644 --- a/streaming/stream.conf +++ b/streaming/stream.conf @@ -17,7 +17,7 @@ # Where is the receiving netdata? # A space separated list of: # - # [PROTOCOL:]HOST[%INTERFACE][:PORT] + # [PROTOCOL:]HOST[%INTERFACE][:PORT][:SSL] # # If many are given, the first available will get the metrics. 
# @@ -26,10 +26,21 @@ # IPv6 IPs should be given with brackets [ip:address] # INTERFACE = the network interface to use (only for IPv6) # PORT = the port number or service name (/etc/services) + # SSL = when this word appears at the end of the destination string, + # Netdata will encrypt the connection with the master. # # This communication is not HTTP (it cannot be proxied by web proxies). destination = + # Skip Certificate verification? + # + # The netdata slave is configured to reject invalid SSL/TLS certificates, + # so certificates that are self-signed or expired will stop the streaming. + # If the server certificate is not valid, you can allow the use of + # 'bad' certificates by setting the next option to 'yes'. + # + #ssl skip certificate verification = yes + # The API_KEY to use (as the sender) api key = @@ -114,7 +125,8 @@ # 3 possible values: # yes enable alarms # no do not enable alarms - # auto enable alarms, only when the sending netdata is connected + # auto enable alarms, only when the sending netdata is connected. For ephemeral slaves or slave system restarts, + # ensure that the netdata process on the slave is gracefully stopped, to prevent invalid last_collected alarms # You can also set it per host, below. # The default is taken from [health].enabled of netdata.conf health enabled by default = auto diff --git a/system/netdata-lsb.in b/system/netdata-lsb.in index e623f1e0..ca197a52 100644 --- a/system/netdata-lsb.in +++ b/system/netdata-lsb.in @@ -1,10 +1,18 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-3.0-or-later +#!/usr/bin/env bash # +# Netdata LSB start script +# +# Copyright: +# SPDX-License-Identifier: GPL-3.0-or-later +# +# Author: +# Costa Tsaousis <costa@netdata.cloud> +# Pavlos Emm. 
Katsoulakis <paul@netdata.cloud> + ### BEGIN INIT INFO # Provides: netdata -# Required-Start: $local_fs $remote_fs $network $named $time apache2 httpd squid nginx mysql named opensips upsd hostapd postfix lm_sensors -# Required-Stop: $local_fs $remote_fs $network $named $time apache2 httpd squid nginx mysql named opensips upsd hostapd postfix lm_sensors +# Required-Start: $local_fs $remote_fs $network $named $time +# Required-Stop: $local_fs $remote_fs $network $named $time # Should-Start: $local_fs $network $named $remote_fs $time $all # Should-Stop: $local_fs $network $named $remote_fs $time $all # Default-Start: 2 3 4 5 diff --git a/tests/backends/prometheus-avg-oldunits.txt b/tests/backends/prometheus-avg-oldunits.txt index b89c924d..53ee8ffa 100644 --- a/tests/backends/prometheus-avg-oldunits.txt +++ b/tests/backends/prometheus-avg-oldunits.txt @@ -1,3 +1,20 @@ +nd_apps_cpu_percent_average +nd_apps_cpu_system_percent_average +nd_apps_cpu_user_percent_average +nd_apps_files_open_files_average +nd_apps_lreads_kilobytes_persec_average +nd_apps_lwrites_kilobytes_persec_average +nd_apps_major_faults_page_faults_persec_average +nd_apps_mem_MB_average +nd_apps_minor_faults_page_faults_persec_average +nd_apps_pipes_open_pipes_average +nd_apps_preads_kilobytes_persec_average +nd_apps_processes_processes_average +nd_apps_pwrites_kilobytes_persec_average +nd_apps_sockets_open_sockets_average +nd_apps_swap_MB_average +nd_apps_threads_threads_average +nd_apps_vmem_MB_average nd_cpu_core_throttling_events_persec_average nd_cpu_cpu_percent_average nd_cpu_interrupts_interrupts_persec_average @@ -20,6 +37,7 @@ nd_ip_ecnpkts_packets_persec_average nd_ip_inerrors_packets_persec_average nd_ip_mcast_kilobits_persec_average nd_ip_mcastpkts_packets_persec_average +nd_ip_tcp_accept_queue_packets_persec_average nd_ip_tcpconnaborts_connections_persec_average nd_ip_tcpofo_packets_persec_average nd_ip_tcpreorders_packets_persec_average @@ -42,7 +60,6 @@ 
nd_ipv4_udperrors_events_persec_average nd_ipv4_udppackets_packets_persec_average nd_ipv6_ect_packets_persec_average nd_ipv6_errors_packets_persec_average -nd_ipv6_icmpechos_messages_persec_average nd_ipv6_icmperrors_errors_persec_average nd_ipv6_icmp_messages_persec_average nd_ipv6_icmpmldv2_reports_persec_average @@ -64,9 +81,46 @@ nd_mem_pgfaults_page_faults_persec_average nd_mem_slab_MB_average nd_mem_transparent_hugepages_MB_average nd_mem_writeback_MB_average +nd_netdata_apps_children_fix_percent_average +nd_netdata_apps_cpu_milliseconds_persec_average +nd_netdata_apps_fix_percent_average +nd_netdata_apps_sizes_files_persec_average +nd_netdata_clients_connected_clients_average +nd_netdata_compression_ratio_percent_average +nd_netdata_go_plugin_execution_time_ms_average +nd_netdata_net_kilobits_persec_average +nd_netdata_plugin_cgroups_cpu_milliseconds_persec_average +nd_netdata_plugin_diskspace_dt_milliseconds_run_average +nd_netdata_plugin_diskspace_milliseconds_persec_average +nd_netdata_plugin_proc_cpu_milliseconds_persec_average +nd_netdata_plugin_proc_modules_milliseconds_run_average +nd_netdata_plugin_tc_cpu_milliseconds_persec_average +nd_netdata_plugin_tc_time_milliseconds_run_average +nd_netdata_private_charts_charts_average +nd_netdata_pythond_runtime_ms_average +nd_netdata_requests_requests_persec_average +nd_netdata_response_time_milliseconds_request_average +nd_netdata_server_cpu_milliseconds_persec_average +nd_netdata_statsd_bytes_kilobits_persec_average +nd_netdata_statsd_cpu_milliseconds_persec_average +nd_netdata_statsd_events_events_persec_average +nd_netdata_statsd_metrics_metrics_average +nd_netdata_statsd_packets_packets_persec_average +nd_netdata_statsd_reads_reads_persec_average +nd_netdata_statsd_useful_metrics_metrics_average +nd_netdata_tcp_connected_sockets_average +nd_netdata_tcp_connects_events_average +nd_netdata_web_cpu_milliseconds_persec_average nd_net_drops_drops_persec_average nd_net_net_kilobits_persec_average 
nd_net_packets_packets_persec_average +nd_services_cpu_percent_average +nd_services_mem_usage_MB_average +nd_services_swap_usage_MB_average +nd_services_throttle_io_ops_read_operations_persec_average +nd_services_throttle_io_ops_write_operations_persec_average +nd_services_throttle_io_read_kilobytes_persec_average +nd_services_throttle_io_write_kilobytes_persec_average nd_system_active_processes_processes_average nd_system_cpu_percent_average nd_system_ctxt_context_switches_persec_average @@ -85,6 +139,8 @@ nd_system_net_kilobits_persec_average nd_system_pgpgio_kilobytes_persec_average nd_system_processes_processes_average nd_system_ram_MB_average +nd_system_shared_memory_bytes_bytes_average +nd_system_shared_memory_segments_segments_average nd_system_softirqs_softirqs_persec_average nd_system_softnet_stat_events_persec_average nd_system_swapio_kilobytes_persec_average diff --git a/tests/backends/prometheus-avg.txt b/tests/backends/prometheus-avg.txt index eaed4fb7..1aedff2b 100644 --- a/tests/backends/prometheus-avg.txt +++ b/tests/backends/prometheus-avg.txt @@ -1,3 +1,20 @@ +nd_apps_cpu_percentage_average +nd_apps_cpu_system_percentage_average +nd_apps_cpu_user_percentage_average +nd_apps_files_open_files_average +nd_apps_lreads_KiB_persec_average +nd_apps_lwrites_KiB_persec_average +nd_apps_major_faults_page_faults_persec_average +nd_apps_mem_MiB_average +nd_apps_minor_faults_page_faults_persec_average +nd_apps_pipes_open_pipes_average +nd_apps_preads_KiB_persec_average +nd_apps_processes_processes_average +nd_apps_pwrites_KiB_persec_average +nd_apps_sockets_open_sockets_average +nd_apps_swap_MiB_average +nd_apps_threads_threads_average +nd_apps_vmem_MiB_average nd_cpu_core_throttling_events_persec_average nd_cpu_cpu_percentage_average nd_cpu_interrupts_interrupts_persec_average @@ -20,6 +37,7 @@ nd_ip_ecnpkts_packets_persec_average nd_ip_inerrors_packets_persec_average nd_ip_mcast_kilobits_persec_average nd_ip_mcastpkts_packets_persec_average 
+nd_ip_tcp_accept_queue_packets_persec_average nd_ip_tcpconnaborts_connections_persec_average nd_ip_tcpofo_packets_persec_average nd_ip_tcpreorders_packets_persec_average @@ -42,7 +60,6 @@ nd_ipv4_udperrors_events_persec_average nd_ipv4_udppackets_packets_persec_average nd_ipv6_ect_packets_persec_average nd_ipv6_errors_packets_persec_average -nd_ipv6_icmpechos_messages_persec_average nd_ipv6_icmperrors_errors_persec_average nd_ipv6_icmp_messages_persec_average nd_ipv6_icmpmldv2_reports_persec_average @@ -64,9 +81,46 @@ nd_mem_pgfaults_faults_persec_average nd_mem_slab_MiB_average nd_mem_transparent_hugepages_MiB_average nd_mem_writeback_MiB_average +nd_netdata_apps_children_fix_percentage_average +nd_netdata_apps_cpu_milliseconds_persec_average +nd_netdata_apps_fix_percentage_average +nd_netdata_apps_sizes_files_persec_average +nd_netdata_clients_connected_clients_average +nd_netdata_compression_ratio_percentage_average +nd_netdata_go_plugin_execution_time_ms_average +nd_netdata_net_kilobits_persec_average +nd_netdata_plugin_cgroups_cpu_milliseconds_persec_average +nd_netdata_plugin_diskspace_dt_milliseconds_run_average +nd_netdata_plugin_diskspace_milliseconds_persec_average +nd_netdata_plugin_proc_cpu_milliseconds_persec_average +nd_netdata_plugin_proc_modules_milliseconds_run_average +nd_netdata_plugin_tc_cpu_milliseconds_persec_average +nd_netdata_plugin_tc_time_milliseconds_run_average +nd_netdata_private_charts_charts_average +nd_netdata_pythond_runtime_ms_average +nd_netdata_requests_requests_persec_average +nd_netdata_response_time_milliseconds_request_average +nd_netdata_server_cpu_milliseconds_persec_average +nd_netdata_statsd_bytes_kilobits_persec_average +nd_netdata_statsd_cpu_milliseconds_persec_average +nd_netdata_statsd_events_events_persec_average +nd_netdata_statsd_metrics_metrics_average +nd_netdata_statsd_packets_packets_persec_average +nd_netdata_statsd_reads_reads_persec_average +nd_netdata_statsd_useful_metrics_metrics_average 
+nd_netdata_tcp_connected_sockets_average +nd_netdata_tcp_connects_events_average +nd_netdata_web_cpu_milliseconds_persec_average nd_net_drops_drops_persec_average nd_net_net_kilobits_persec_average nd_net_packets_packets_persec_average +nd_services_cpu_percentage_average +nd_services_mem_usage_MiB_average +nd_services_swap_usage_MiB_average +nd_services_throttle_io_ops_read_operations_persec_average +nd_services_throttle_io_ops_write_operations_persec_average +nd_services_throttle_io_read_KiB_persec_average +nd_services_throttle_io_write_KiB_persec_average nd_system_active_processes_processes_average nd_system_cpu_percentage_average nd_system_ctxt_context_switches_persec_average @@ -85,6 +139,8 @@ nd_system_net_kilobits_persec_average nd_system_pgpgio_KiB_persec_average nd_system_processes_processes_average nd_system_ram_MiB_average +nd_system_shared_memory_bytes_bytes_average +nd_system_shared_memory_segments_segments_average nd_system_softirqs_softirqs_persec_average nd_system_softnet_stat_events_persec_average nd_system_swapio_KiB_persec_average diff --git a/tests/backends/prometheus-raw.txt b/tests/backends/prometheus-raw.txt index 7caffc87..2ac4c2c7 100644 --- a/tests/backends/prometheus-raw.txt +++ b/tests/backends/prometheus-raw.txt @@ -1,3 +1,20 @@ +nd_apps_cpu +nd_apps_cpu_system +nd_apps_cpu_user +nd_apps_files +nd_apps_lreads +nd_apps_lwrites +nd_apps_major_faults +nd_apps_mem +nd_apps_minor_faults +nd_apps_pipes +nd_apps_preads +nd_apps_processes +nd_apps_pwrites +nd_apps_sockets +nd_apps_swap +nd_apps_threads +nd_apps_vmem nd_cpu_core_throttling_total nd_cpu_cpu_total nd_cpu_interrupts_total @@ -20,6 +37,7 @@ nd_ip_ecnpkts_total nd_ip_inerrors_total nd_ip_mcastpkts_total nd_ip_mcast_total +nd_ip_tcp_accept_queue_total nd_ip_tcpconnaborts_total nd_ip_tcpofo_total nd_ip_tcpreorders_total @@ -42,7 +60,6 @@ nd_ipv4_udperrors_total nd_ipv4_udppackets_total nd_ipv6_ect_total nd_ipv6_errors_total -nd_ipv6_icmpechos_total nd_ipv6_icmperrors_total 
nd_ipv6_icmpmldv2_total nd_ipv6_icmpneighbor_total @@ -64,9 +81,54 @@ nd_mem_pgfaults_total nd_mem_slab nd_mem_transparent_hugepages nd_mem_writeback +nd_netdata_apps_children_fix +nd_netdata_apps_cpu_total +nd_netdata_apps_fix +nd_netdata_apps_sizes_calls_total +nd_netdata_apps_sizes_fds +nd_netdata_apps_sizes_filenames_total +nd_netdata_apps_sizes_files_total +nd_netdata_apps_sizes_inode_changes_total +nd_netdata_apps_sizes_link_changes_total +nd_netdata_apps_sizes_new_pids_total +nd_netdata_apps_sizes_pids +nd_netdata_apps_sizes_targets +nd_netdata_clients +nd_netdata_compression_ratio +nd_netdata_go_plugin_execution_time +nd_netdata_net_total +nd_netdata_plugin_cgroups_cpu_total +nd_netdata_plugin_diskspace_dt +nd_netdata_plugin_diskspace_total +nd_netdata_plugin_proc_cpu_total +nd_netdata_plugin_proc_modules +nd_netdata_plugin_tc_cpu_total +nd_netdata_plugin_tc_time +nd_netdata_private_charts +nd_netdata_pythond_runtime +nd_netdata_requests_total +nd_netdata_response_time +nd_netdata_server_cpu_total +nd_netdata_statsd_bytes_total +nd_netdata_statsd_cpu_total +nd_netdata_statsd_events_total +nd_netdata_statsd_metrics +nd_netdata_statsd_packets_total +nd_netdata_statsd_reads_total +nd_netdata_statsd_useful_metrics +nd_netdata_tcp_connected +nd_netdata_tcp_connects_total +nd_netdata_web_cpu_total nd_net_drops_total nd_net_net_total nd_net_packets_total +nd_services_cpu_total +nd_services_mem_usage +nd_services_swap_usage +nd_services_throttle_io_ops_read_total +nd_services_throttle_io_ops_write_total +nd_services_throttle_io_read_total +nd_services_throttle_io_write_total nd_system_active_processes nd_system_cpu_total nd_system_ctxt_total @@ -85,6 +147,8 @@ nd_system_net_total nd_system_pgpgio_total nd_system_processes nd_system_ram +nd_system_shared_memory_bytes +nd_system_shared_memory_segments nd_system_softirqs_total nd_system_softnet_stat_total nd_system_swap diff --git a/tests/backends/prometheus.bats b/tests/backends/prometheus.bats index 
d6ffa8d7..d52f39d5 100755 --- a/tests/backends/prometheus.bats +++ b/tests/backends/prometheus.bats @@ -5,7 +5,7 @@ validate_metrics() { params="${2}" curl -sS "http://localhost:19999/api/v1/allmetrics?format=prometheus&prefix=nd&timestamps=no${params}" | - grep -E 'nd_system_|nd_cpu_|nd_system_|nd_net_|nd_disk_|nd_ip_|nd_ipv4_|nd_ipv6_|nd_mem_' | + grep -E 'nd_system_|nd_cpu_|nd_system_|nd_net_|nd_disk_|nd_ip_|nd_ipv4_|nd_ipv6_|nd_mem_|nd_netdata_|nd_apps_|nd_services_' | sed -ne 's/{.*//p' | sort | uniq > tests/backends/new-${fname} diff tests/backends/${fname} tests/backends/new-${fname} rm tests/backends/new-${fname} diff --git a/tests/health_mgmtapi/README.md b/tests/health_mgmtapi/README.md index 278c72dc..8473b35e 100644 --- a/tests/health_mgmtapi/README.md +++ b/tests/health_mgmtapi/README.md @@ -4,9 +4,8 @@ The directory `tests/health_cmdapi` contains the test script `health-cmdapi-test The script can be executed with options to prepare the system for the tests, run them and restore the system to its previous state. -It depends on the management API being accessible and on the responses to the api/v1/alarms?all requests being functional. - -Run it with `tests/health_mgmtapi/health-cmdapi-test.sh -h` to see the options. +It depends on the management API being accessible on localhost:19999 and on the responses to the api/v1/alarms?all requests being functional. +It also requires read access to the management API key that is usually under `/var/lib/netdata/netdata.api.key` (`@varlibdir_POST@/netdata.api.key`). 
[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Ftests%2Fhealth_mgmtapi%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() diff --git a/tests/health_mgmtapi/expected_list/ALARM_CPU_IOWAIT-list.json b/tests/health_mgmtapi/expected_list/ALARM_CPU_IOWAIT-list.json new file mode 100644 index 00000000..9f05efe7 --- /dev/null +++ b/tests/health_mgmtapi/expected_list/ALARM_CPU_IOWAIT-list.json @@ -0,0 +1 @@ +{ "all": false, "type": "SILENCE", "silencers": [ { "alarm": "*10min_cpu_iowait" }, { "alarm": "*10min_cpu_usage *load_trigger" } ] } diff --git a/tests/health_mgmtapi/expected_list/ALARM_CPU_USAGE-list.json b/tests/health_mgmtapi/expected_list/ALARM_CPU_USAGE-list.json new file mode 100644 index 00000000..dbf87992 --- /dev/null +++ b/tests/health_mgmtapi/expected_list/ALARM_CPU_USAGE-list.json @@ -0,0 +1 @@ +{ "all": false, "type": "SILENCE", "silencers": [ { "alarm": "*10min_cpu_usage *load_trigger", "context": "system.cpu" }, { "alarm": "*10min_cpu_usage *load_trigger", "chart": "system.load" } ] } diff --git a/tests/health_mgmtapi/expected_list/CONTEXT_SYSTEM_CPU-list.json b/tests/health_mgmtapi/expected_list/CONTEXT_SYSTEM_CPU-list.json new file mode 100644 index 00000000..a267cfd6 --- /dev/null +++ b/tests/health_mgmtapi/expected_list/CONTEXT_SYSTEM_CPU-list.json @@ -0,0 +1 @@ +{ "all": false, "type": "DISABLE", "silencers": [ { "context": "system.cpu" }, { "chart": "system.load" } ] } diff --git a/tests/health_mgmtapi/expected_list/DISABLE-list.json b/tests/health_mgmtapi/expected_list/DISABLE-list.json new file mode 100644 index 00000000..c2c77810 --- /dev/null +++ b/tests/health_mgmtapi/expected_list/DISABLE-list.json @@ -0,0 +1 @@ +{ "all": false, "type": "DISABLE", "silencers": [ { "alarm": "*10min_cpu_usage *load_trigger" } ] } diff --git a/tests/health_mgmtapi/expected_list/DISABLE_ALL-list.json 
b/tests/health_mgmtapi/expected_list/DISABLE_ALL-list.json new file mode 100644 index 00000000..bbc3f4f0 --- /dev/null +++ b/tests/health_mgmtapi/expected_list/DISABLE_ALL-list.json @@ -0,0 +1 @@ +{ "all": true, "type": "DISABLE", "silencers": [] } diff --git a/tests/health_mgmtapi/expected_list/DISABLE_ALL_ERROR-list.json b/tests/health_mgmtapi/expected_list/DISABLE_ALL_ERROR-list.json new file mode 100644 index 00000000..e8aee179 --- /dev/null +++ b/tests/health_mgmtapi/expected_list/DISABLE_ALL_ERROR-list.json @@ -0,0 +1 @@ +Auth Error diff --git a/tests/health_mgmtapi/expected_list/DISABLE_SYSTEM_LOAD-list.json b/tests/health_mgmtapi/expected_list/DISABLE_SYSTEM_LOAD-list.json new file mode 100644 index 00000000..a7fc1cb8 --- /dev/null +++ b/tests/health_mgmtapi/expected_list/DISABLE_SYSTEM_LOAD-list.json @@ -0,0 +1 @@ +{ "all": false, "type": "DISABLE", "silencers": [ { "chart": "system.load" } ] } diff --git a/tests/health_mgmtapi/expected_list/FAMILIES_LOAD-list.json b/tests/health_mgmtapi/expected_list/FAMILIES_LOAD-list.json new file mode 100644 index 00000000..50119f79 --- /dev/null +++ b/tests/health_mgmtapi/expected_list/FAMILIES_LOAD-list.json @@ -0,0 +1 @@ +{ "all": false, "type": "None", "silencers": [ { "families": "load" } ] } diff --git a/tests/health_mgmtapi/expected_list/HOSTS-list.json b/tests/health_mgmtapi/expected_list/HOSTS-list.json new file mode 100644 index 00000000..9db21b6c --- /dev/null +++ b/tests/health_mgmtapi/expected_list/HOSTS-list.json @@ -0,0 +1 @@ +{ "all": false, "type": "SILENCE", "silencers": [ { "hosts": "*" } ] } diff --git a/tests/health_mgmtapi/expected_list/RESET-list.json b/tests/health_mgmtapi/expected_list/RESET-list.json new file mode 100644 index 00000000..2d3f09d6 --- /dev/null +++ b/tests/health_mgmtapi/expected_list/RESET-list.json @@ -0,0 +1 @@ +{ "all": false, "type": "None", "silencers": [] } diff --git a/tests/health_mgmtapi/expected_list/SILENCE-list.json 
b/tests/health_mgmtapi/expected_list/SILENCE-list.json new file mode 100644 index 00000000..d157f2d3 --- /dev/null +++ b/tests/health_mgmtapi/expected_list/SILENCE-list.json @@ -0,0 +1 @@ +{ "all": false, "type": "SILENCE", "silencers": [ { "alarm": "*10min_cpu_usage *load_trigger" } ] } diff --git a/tests/health_mgmtapi/expected_list/SILENCE_2-list.json b/tests/health_mgmtapi/expected_list/SILENCE_2-list.json new file mode 100644 index 00000000..d5e6fa2d --- /dev/null +++ b/tests/health_mgmtapi/expected_list/SILENCE_2-list.json @@ -0,0 +1 @@ +{ "all": false, "type": "SILENCE", "silencers": [ { "families": "load" } ] } diff --git a/tests/health_mgmtapi/expected_list/SILENCE_3-list.json b/tests/health_mgmtapi/expected_list/SILENCE_3-list.json new file mode 100644 index 00000000..69e98cc1 --- /dev/null +++ b/tests/health_mgmtapi/expected_list/SILENCE_3-list.json @@ -0,0 +1 @@ +{ "all": false, "type": "SILENCE", "silencers": [] } WARNING: SILENCE or DISABLE command is ineffective without defining any alarm selectors. 
diff --git a/tests/health_mgmtapi/expected_list/SILENCE_ALARM_CPU_USAGE-list.json b/tests/health_mgmtapi/expected_list/SILENCE_ALARM_CPU_USAGE-list.json new file mode 100644 index 00000000..dd789cd3 --- /dev/null +++ b/tests/health_mgmtapi/expected_list/SILENCE_ALARM_CPU_USAGE-list.json @@ -0,0 +1 @@ +{ "all": false, "type": "SILENCE", "silencers": [ { "alarm": "*10min_cpu_usage *load_trigger", "chart": "system.load" } ] } diff --git a/tests/health_mgmtapi/expected_list/SILENCE_ALARM_CPU_USAGE_LOAD_TRIGGER-list.json b/tests/health_mgmtapi/expected_list/SILENCE_ALARM_CPU_USAGE_LOAD_TRIGGER-list.json new file mode 100644 index 00000000..d157f2d3 --- /dev/null +++ b/tests/health_mgmtapi/expected_list/SILENCE_ALARM_CPU_USAGE_LOAD_TRIGGER-list.json @@ -0,0 +1 @@ +{ "all": false, "type": "SILENCE", "silencers": [ { "alarm": "*10min_cpu_usage *load_trigger" } ] } diff --git a/tests/health_mgmtapi/expected_list/SILENCE_ALL-list.json b/tests/health_mgmtapi/expected_list/SILENCE_ALL-list.json new file mode 100644 index 00000000..c88ef9fd --- /dev/null +++ b/tests/health_mgmtapi/expected_list/SILENCE_ALL-list.json @@ -0,0 +1 @@ +{ "all": true, "type": "SILENCE", "silencers": [] } diff --git a/tests/health_mgmtapi/health-cmdapi-test.sh.in b/tests/health_mgmtapi/health-cmdapi-test.sh.in index 1bbe269d..5abf2b17 100755 --- a/tests/health_mgmtapi/health-cmdapi-test.sh.in +++ b/tests/health_mgmtapi/health-cmdapi-test.sh.in @@ -1,56 +1,72 @@ #!/usr/bin/env bash # shellcheck disable=SC1117,SC2034,SC2059,SC2086,SC2181 -NETDATA_USER_CONFIG_DIR="@configdir_POST@" -NETDATA_STOCK_CONFIG_DIR="@libconfigdir_POST@" NETDATA_VARLIB_DIR="@varlibdir_POST@" -printhelp () { - echo "Usage: health-cmdapi-test.sh [OPTIONS] - -s SETUP config files for python example tests - -c CLEANUP config files from python example tests - -r RESTART netdata after SETUP and CLEANUP, using systemctl restart netdata. 
- -t TEST scenarios execution - -u <URL> changes the host:port from localhost:19999 to <URL> - " -} - check () { - echo -e "${GRAY}Check: '${1}' in 2 sec" - sleep 2 - resp=$(curl -s "http://$URL/api/v1/alarms?all") + sec=1 + echo -e " ${GRAY}Check: '${1}' in $sec sec" + sleep $sec + number=$RANDOM + resp=$(curl -s "http://$URL/api/v1/alarms?all&$number") r=$(echo "${resp}" | \ python3 -c "import sys, json; d=json.load(sys.stdin); \ print(\ - d['alarms']['example.random.example_alarm1']['disabled'], \ - d['alarms']['example.random.example_alarm1']['silenced'] , \ - d['alarms']['example.random.example_alarm2']['disabled'], \ - d['alarms']['example.random.example_alarm2']['silenced'], \ + d['alarms']['system.cpu.10min_cpu_usage']['disabled'], \ + d['alarms']['system.cpu.10min_cpu_usage']['silenced'] , \ + d['alarms']['system.cpu.10min_cpu_iowait']['disabled'], \ + d['alarms']['system.cpu.10min_cpu_iowait']['silenced'], \ d['alarms']['system.load.load_trigger']['disabled'], \ d['alarms']['system.load.load_trigger']['silenced'], \ );" 2>&1) if [ $? -ne 0 ] ; then - echo -e "${RED}ERROR: Unexpected response '$resp'" + echo -e " ${RED}ERROR: Unexpected response stored in /tmp/resp-$number.json" + echo "$resp" > /tmp/resp-$number.json err=$((err+1)) + iter=0 elif [ "${r}" != "${2}" ] ; then - echo -e "${RED}ERROR: 'Got ${r}'. Expected '${2}'" - err=$((err+1)) + echo -e " ${GRAY}WARNING: 'Got ${r}'. Expected '${2}'" + iter=$((iter+1)) + if [ $iter -lt 10 ] ; then + echo -e " ${GRAY}Repeating test " + check "$1" "$2" + else + echo -e " ${RED}ERROR: 'Got ${r}'. 
Expected '${2}'" + iter=0 + err=$((err+1)) + fi else - echo -e "${GREEN}Success" + echo -e " ${GREEN}Success" + iter=0 fi } cmd () { - echo -e "${WHITE}Cmd '${1}', expecting '${2}'" + echo -e "${WHITE}Cmd '${1}'" + echo -en " ${GRAY}Expecting '${2}' : " RESPONSE=$(curl -s "http://$URL/api/v1/manage/health?${1}" -H "X-Auth-Token: $TOKEN" 2>&1) if [ "${RESPONSE}" != "${2}" ] ; then - echo -e "${RED}ERROR: Response '${RESPONSE}' != '${2}'" + echo -e "${RED}ERROR: Response '${RESPONSE}'" err=$((err+1)) else echo -e "${GREEN}Success" fi } +check_list() { + RESPONSE=$(curl -s "http://$URL/api/v1/manage/health?cmd=LIST" -H "X-Auth-Token: $TOKEN" 2>&1) + + NAME="$1-list.json" + echo $RESPONSE > $NAME + diff $NAME expected_list/$NAME 1>/dev/null 2>&1 + if [ $? -eq 0 ]; then + echo -e "${GREEN}Success: The list command got the correct answer for $NAME!" + else + echo -e "${RED}ERROR: the files $NAME and expected_list/$NAME does not match." + exit 1 + fi +} + WHITE='\033[0;37m' RED='\033[0;31m' GREEN='\033[0;32m' @@ -62,63 +78,8 @@ CLEANUP=0 TEST=0 URL="localhost:19999" -while getopts :srctu: option -do - case "$option" in - s) - SETUP=1 - ;; - r) - RESTART=1 - ;; - c) - CLEANUP=1 - ;; - t) - TEST=1 - ;; - u) - URL=$OPTARG - ;; - *) - printhelp - exit 1 - ;; - esac -done - -if [ $SETUP -eq 1 ] ; then - echo "Preparing netdata configuration for testing" - # Prep netdata for tests - if [ -f "${NETDATA_USER_CONFIG_DIR}/python.d.conf" ] ; then - cp -f "${NETDATA_USER_CONFIG_DIR}/python.d.conf" /tmp/python.d.conf - else - cp "${NETDATA_STOCK_CONFIG_DIR}/python.d.conf" "${NETDATA_USER_CONFIG_DIR}/" - fi - sed -i -e "s/example: no/example: yes/g" "${NETDATA_USER_CONFIG_DIR}/python.d.conf" - - mypath=$(cd ${0%/*} && echo $PWD) - - cp -f "${mypath}/python-example.conf" "${NETDATA_USER_CONFIG_DIR}/health.d/" - - # netdata.conf - if [ -f "${NETDATA_USER_CONFIG_DIR}/netdata.conf" ] ; then - cp -f "${NETDATA_USER_CONFIG_DIR}/netdata.conf" /tmp/netdata.conf - fi - printf "[health]\nrun at 
least every seconds = 1\n" > "${NETDATA_USER_CONFIG_DIR}/netdata.conf" - - chmod +r "${NETDATA_USER_CONFIG_DIR}/python.d.conf" "${NETDATA_USER_CONFIG_DIR}/netdata.conf" "${NETDATA_USER_CONFIG_DIR}/health.d/python-example.conf" "${NETDATA_STOCK_CONFIG_DIR}/health.d/load.conf" - # Restart netdata - if [ $RESTART -eq 1 ] ; then - echo "Restarting netdata" - systemctl restart netdata - fi -fi - err=0 -# Execute tests -if [ $TEST -eq 1 ] ; then HEALTH_CMDAPI_MSG_AUTHERROR="Auth Error" HEALTH_CMDAPI_MSG_SILENCEALL="All alarm notifications are silenced" @@ -143,11 +104,13 @@ if [ $TEST -eq 1 ] ; then # Test default state cmd "cmd=RESET" "$HEALTH_CMDAPI_MSG_RESET" check "Default State" "False False False False False False" + check_list "RESET" # Test auth failure TOKEN="Wrong token" cmd "cmd=DISABLE ALL" "$HEALTH_CMDAPI_MSG_AUTHERROR" check "Default State" "False False False False False False" + check_list "DISABLE_ALL_ERROR" # Set correct token TOKEN="${CORRECT_TOKEN}" @@ -155,108 +118,107 @@ if [ $TEST -eq 1 ] ; then # Test disable cmd "cmd=DISABLE ALL" "$HEALTH_CMDAPI_MSG_DISABLEALL" check "All disabled" "True False True False True False" + check_list "DISABLE_ALL" # Reset cmd "cmd=RESET" "$HEALTH_CMDAPI_MSG_RESET" check "Default State" "False False False False False False" + check_list "RESET" # Test silence cmd "cmd=SILENCE ALL" "$HEALTH_CMDAPI_MSG_SILENCEALL" check "All silenced" "False True False True False True" + check_list "SILENCE_ALL" # Reset cmd "cmd=RESET" "$HEALTH_CMDAPI_MSG_RESET" check "Default State" "False False False False False False" + check_list "RESET" # Add silencer by name printf -v resp "$HEALTH_CMDAPI_MSG_SILENCE\n$HEALTH_CMDAPI_MSG_ADDED" - cmd "cmd=SILENCE&alarm=*example_alarm1 *load_trigger" "${resp}" + cmd "cmd=SILENCE&alarm=*10min_cpu_usage *load_trigger" "${resp}" check "Silence notifications for alarm1 and load_trigger" "False True False False False True" + check_list "SILENCE_ALARM_CPU_USAGE_LOAD_TRIGGER" # Convert to disable health 
checks cmd "cmd=DISABLE" "$HEALTH_CMDAPI_MSG_DISABLE" check "Disable notifications for alarm1 and load_trigger" "True False False False True False" + check_list "DISABLE" # Convert back to silence notifications cmd "cmd=SILENCE" "$HEALTH_CMDAPI_MSG_SILENCE" check "Silence notifications for alarm1 and load_trigger" "False True False False False True" + check_list "SILENCE" # Add second silencer by name - cmd "alarm=*example_alarm2" "$HEALTH_CMDAPI_MSG_ADDED" + cmd "alarm=*10min_cpu_iowait" "$HEALTH_CMDAPI_MSG_ADDED" check "Silence notifications for alarm1,alarm2 and load_trigger" "False True False True False True" + check_list "ALARM_CPU_IOWAIT" # Reset cmd "cmd=RESET" "$HEALTH_CMDAPI_MSG_RESET" + check_list "RESET" # Add silencer by chart printf -v resp "$HEALTH_CMDAPI_MSG_DISABLE\n$HEALTH_CMDAPI_MSG_ADDED" cmd "cmd=DISABLE&chart=system.load" "${resp}" check "Default State" "False False False False True False" + check_list "DISABLE_SYSTEM_LOAD" # Add silencer by context - cmd "context=random" "$HEALTH_CMDAPI_MSG_ADDED" + cmd "context=system.cpu" "$HEALTH_CMDAPI_MSG_ADDED" check "Default State" "True False True False True False" + check_list "CONTEXT_SYSTEM_CPU" # Reset cmd "cmd=RESET" "$HEALTH_CMDAPI_MSG_RESET" + check_list "RESET" # Add second condition to a selector (AND) printf -v resp "$HEALTH_CMDAPI_MSG_SILENCE\n$HEALTH_CMDAPI_MSG_ADDED" - cmd "cmd=SILENCE&alarm=*example_alarm1 *load_trigger&chart=system.load" "${resp}" + cmd "cmd=SILENCE&alarm=*10min_cpu_usage *load_trigger&chart=system.load" "${resp}" check "Silence notifications load_trigger" "False False False False False True" + check_list "SILENCE_ALARM_CPU_USAGE" # Add second selector with two conditions - cmd "alarm=*example_alarm1 *load_trigger&context=random" "$HEALTH_CMDAPI_MSG_ADDED" + cmd "alarm=*10min_cpu_usage *load_trigger&context=system.cpu" "$HEALTH_CMDAPI_MSG_ADDED" check "Silence notifications load_trigger" "False True False False False True" + check_list "ALARM_CPU_USAGE" # Reset cmd 
"cmd=RESET" "$HEALTH_CMDAPI_MSG_RESET" + check_list "RESET" # Add silencer without a command to disable or silence alarms printf -v resp "$HEALTH_CMDAPI_MSG_ADDED\n$HEALTH_CMDAPI_MSG_STYPEWARNING" cmd "families=load" "${resp}" check "Family selector with no command" "False False False False False False" + check_list "FAMILIES_LOAD" # Add silence command cmd "cmd=SILENCE" "$HEALTH_CMDAPI_MSG_SILENCE" check "Silence family load" "False False False False False True" + check_list "SILENCE_2" # Reset cmd "cmd=RESET" "$HEALTH_CMDAPI_MSG_RESET" + check_list "RESET" # Add command without silencers printf -v resp "$HEALTH_CMDAPI_MSG_SILENCE\n$HEALTH_CMDAPI_MSG_NOSELECTORWARNING" cmd "cmd=SILENCE" "${resp}" check "Command with no selector" "False False False False False False" + check_list "SILENCE_3" # Add hosts silencer cmd "hosts=*" "$HEALTH_CMDAPI_MSG_ADDED" check "Silence all hosts" "False True False True False True" + check_list "HOSTS" # Reset cmd "cmd=RESET" "$HEALTH_CMDAPI_MSG_RESET" - -fi - -# Cleanup -if [ $CLEANUP -eq 1 ] ; then - echo -e "${WHITE}Restoring netdata configuration" - for f in "python.d.conf" "netdata.conf" ; do - if [ -f "/tmp/$f" ] ; then - mv -f "/tmp/$f" "${NETDATA_USER_CONFIG_DIR}/" - else - rm -f "${NETDATA_USER_CONFIG_DIR}/$f" - fi - done - - rm -f "${NETDATA_USER_CONFIG_DIR}/health.d/python-example.conf" - - # Restart netdata - if [ $RESTART -eq 1 ] ; then - echo "Restarting netdata" - systemctl restart netdata - fi -fi + check_list "RESET" if [ $err -gt 0 ] ; then echo "$err error(s) found" diff --git a/tests/health_mgmtapi/python-example.conf b/tests/health_mgmtapi/python-example.conf deleted file mode 100644 index 66713208..00000000 --- a/tests/health_mgmtapi/python-example.conf +++ /dev/null @@ -1,16 +0,0 @@ -alarm: example_alarm1 - on: example.random - every: 2s - warn: $random1 > (($status >= $WARNING) ? (55) : (75)) - crit: $random1 > (($status == $CRITICAL) ? 
(75) : (95)) - info: random - to: sysadmin - -alarm: example_alarm2 - on: example.random - every: 2s - warn: $random2 > (($status >= $WARNING) ? (55) : (75)) - crit: $random2 > (($status == $CRITICAL) ? (75) : (95)) - info: random - to: sysadmin - diff --git a/tests/installer/slack.sh b/tests/installer/slack.sh index 3c5f94a4..83cb5fa7 100755 --- a/tests/installer/slack.sh +++ b/tests/installer/slack.sh @@ -13,6 +13,7 @@ post_message() { TYPE="$1" MESSAGE="$2" + CUSTOM_CHANNEL="$3" case "$TYPE" in "PLAIN_MESSAGE") @@ -24,7 +25,13 @@ post_message() { EVENT_LINE="${TRAVIS_JOB_NUMBER}: Event type '${TRAVIS_EVENT_TYPE}' #${TRAVIS_PULL_REQUEST}, on '${TRAVIS_OS_NAME}' " fi + if [ -n "${CUSTOM_CHANNEL}" ]; then + echo "Sending travis message to custom channel ${CUSTOM_CHANNEL}" + OPTIONAL_CHANNEL_INFO="\"channel\": \"${CUSTOM_CHANNEL}\"," + fi + POST_MESSAGE="{ + ${OPTIONAL_CHANNEL_INFO} \"text\": \"${TRAVIS_REPO_SLUG}, ${MESSAGE}\", \"attachments\": [{ \"text\": \"${TRAVIS_JOB_NUMBER}: Event type '${TRAVIS_EVENT_TYPE}', on '${TRAVIS_OS_NAME}' \", diff --git a/tests/updater_checks.bats b/tests/updater_checks.bats index e177fe4e..1a7eeb70 100755 --- a/tests/updater_checks.bats +++ b/tests/updater_checks.bats @@ -25,7 +25,6 @@ DIRS="usr/sbin/netdata var/log/netdata" setup() { - # If we are not in netdata git repo, at the top level directory, fail TOP_LEVEL=$(basename "$(git rev-parse --show-toplevel)") CWD=$(git rev-parse --show-cdup || echo "") diff --git a/tests/updater_checks.sh b/tests/updater_checks.sh index dce13685..9c8b6fa4 100755 --- a/tests/updater_checks.sh +++ b/tests/updater_checks.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/usr/bin/env sh # # Wrapper script that installs the required dependencies # for the BATS script to run successfully @@ -8,9 +8,71 @@ # Author : Pavlos Emm. Katsoulakis <paul@netdata.cloud) # -echo "Installing extra dependencies.." -yum install -y epel-release -yum install -y git bats +echo "Syncing/updating repository.." 
+ +blind_arch_grep_install() { + # There is a peculiar docker case with arch, where grep is not available + # This method will have to be triggered blindly, to inject grep so that we can process + # It starts to become a chicken-egg situation with all the distros.. + echo "* * Workaround hack * *" + echo "Attempting blind install for archlinux case" + + if command -v pacman > /dev/null 2>&1; then + echo "Executing grep installation" + pacman -Sy + pacman --noconfirm --needed -S grep + fi +} +blind_arch_grep_install || echo "Workaround failed, proceed as usual" + +running_os="$(cat /etc/os-release |grep '^ID=' | cut -d'=' -f2 | sed -e 's/"//g')" + +case "${running_os}" in +"centos"|"fedora") + echo "Running on CentOS, updating YUM repository.." + yum clean all + yum update -y + + echo "Installing extra dependencies.." + yum install -y epel-release + yum install -y bats curl + ;; +"debian"|"ubuntu") + echo "Running ${running_os}, updating APT repository" + apt-get update -y + apt-get install -y bats curl + ;; +"opensuse-leap"|"opensuse-tumbleweed") + zypper update -y + zypper install -y bats curl + ;; +"arch") + pacman -Sy + pacman --noconfirm --needed -S bash-bats curl + ;; +"alpine") + apk update + apk add bash curl bats + ;; +*) + echo "Running on ${running_os}, no repository preparation done" + ;; +esac + +# Download and run dependency scriptlet, before anything else +# +deps_tool="/tmp/deps_tool.$$.sh" +curl -Ss -o ${deps_tool} https://raw.githubusercontent.com/netdata/netdata-demo-site/master/install-required-packages.sh +if [ -f "${deps_tool}" ]; then + echo "Running dependency handling script.." + chmod +x "${deps_tool}" + ${deps_tool} --non-interactive netdata + rm -f "${deps_tool}" + echo "Done!" +else + echo "Failed to fetch dependency script, aborting the test" + exit 1 +fi echo "Running BATS file.." 
bats --tap tests/updater_checks.bats diff --git a/web/README.md b/web/README.md index c110ef65..5c1a06f5 100644 --- a/web/README.md +++ b/web/README.md @@ -14,7 +14,7 @@ For our convenience, Netdata provides 2 more layers: Charts information is stored at /usr/share/netdata/web/[dashboard_info.js](gui/dashboard_info.js). This file includes information that is rendered on the dashboard, controls chart colors, section and subsection heading, titles, etc. -If you change that file, your changes will be overwritten when Netdata is updated. You can preserve your settings by creating a new such file (there is /usr/share/netdata/web/[dashboard_info_custom.example.js](gui/dashboard_info_custom_example.js) you can use to start with). +If you change that file, your changes will be overwritten when Netdata is updated. You can preserve your settings by creating a new such file (there is /usr/share/netdata/web/[dashboard_info_custom_example.js](gui/dashboard_info_custom_example.js) you can use to start with). You have to copy the example file under a new name, so that it will not be overwritten with Netdata updates. diff --git a/web/api/health/README.md b/web/api/health/README.md index 2003a61e..66a80d5f 100644 --- a/web/api/health/README.md +++ b/web/api/health/README.md @@ -45,6 +45,7 @@ The following will return an SVG badge of the alarm named `NAME`, attached to th ## Health Management API Netdata v1.12 and beyond provides a command API to control health checks and notifications at runtime. The feature is especially useful for maintenance periods, during which you receive meaningless alarms. +From Netdata v1.16.0 and beyond, the configuration controlled via the API commands is [persisted across netdata restarts](#persistence). Specifically, the API allows you to: - Disable health checks completely. Alarm conditions will not be evaluated at all and no entries will be added to the alarm log. 
@@ -142,6 +143,43 @@ Example 2.2: Add one more selector, to also silence alarms for cpu1 and cpu2 http://localhost/api/v1/manage/health?families=cpu1 cpu2 ``` +### List silencers + +The command `LIST` was added in netdata v1.16.0 and returns a JSON with the current status of the silencers. + +``` + curl "http://myserver/api/v1/manage/health?cmd=LIST" -H "X-Auth-Token: Mytoken" +``` + +As an example, the following response shows that we have two silencers configured, one for an alarm called `samplealarm` and one for alarms with context `random` on host `myhost` +``` +json +{ + "all": false, + "type": "SILENCE", + "silencers": [ + { + "alarm": "samplealarm" + }, + { + "context": "random", + "hosts": "myhost" + } + ] +} +``` + +The response below shows that we have disabled all health checks. + +``` +json +{ + "all": true, + "type": "DISABLE", + "silencers": [] +} + + ### Responses - "Auth Error" : Token authentication failed @@ -155,6 +193,17 @@ http://localhost/api/v1/manage/health?families=cpu1 cpu2 - "WARNING: Added alarm selector to silence/disable alarms without a SILENCE or DISABLE command." : Added to the response if a selector is added without a selector-specific command. - "WARNING: SILENCE or DISABLE command is ineffective without defining any alarm selectors." : Added to the response if a selector-specific command is issued without a selector. +### Persistence + +From netdata v1.16.0 and beyond, the silencers configuration is persisted to disk and loaded when netdata starts. +The JSON string returned by the [LIST command](#list-silencers) is automatically saved to the `silencers file`, every time a command alters the silencers configuration. +The file's location is configurable in `netdata.conf`. 
The default is shown below: + +``` +[health] + # silencers file = /var/lib/netdata/health.silencers.json +``` + ### Further reading The test script under [tests/health_mgmtapi](../../../tests/health_mgmtapi) contains a series of tests that you can either run or read through to understand the various calls and responses better. diff --git a/web/api/health/health_cmdapi.c b/web/api/health/health_cmdapi.c index ec177751..468054c6 100644 --- a/web/api/health/health_cmdapi.c +++ b/web/api/health/health_cmdapi.c @@ -1,17 +1,16 @@ // -// Created by christopher on 11/12/18. +// Created by Christopher on 11/12/18. // #include "health_cmdapi.h" - -static SILENCER *create_silencer(void) { - SILENCER *t = callocz(1, sizeof(SILENCER)); - debug(D_HEALTH, "HEALTH command API: Created empty silencer"); - - return t; -} - +/** + * Free Silencers + * + * Clean the silencer structure + * + * @param t is the structure that will be cleaned. + */ void free_silencers(SILENCER *t) { if (!t) return; if (t->next) free_silencers(t->next); @@ -31,38 +30,104 @@ void free_silencers(SILENCER *t) { return; } +/** + * Silencers to JSON Entry + * + * Fill the buffer with the other values given. + * + * @param wb a pointer to the output buffer + * @param var the json variable + * @param val the json value + * @param hasprev has it a previous value? + * + * @return + */ +int health_silencers2json_entry(BUFFER *wb, char* var, char* val, int hasprev) { + if (val) { + buffer_sprintf(wb, "%s\n\t\t\t\"%s\": \"%s\"", (hasprev)?",":"", var, val); + return 1; + } else { + return hasprev; + } +} +/** + * Silencer to JSON + * + * Write the silencer values using JSON format inside a buffer. + * + * @param wb is the buffer to write the silencers. 
+ */ +void health_silencers2json(BUFFER *wb) { + buffer_sprintf(wb, "{\n\t\"all\": %s," + "\n\t\"type\": \"%s\"," + "\n\t\"silencers\": [", + (silencers->all_alarms)?"true":"false", + (silencers->stype == STYPE_NONE)?"None":((silencers->stype == STYPE_DISABLE_ALARMS)?"DISABLE":"SILENCE")); + + SILENCER *silencer; + int i = 0, j = 0; + for(silencer = silencers->silencers; silencer ; silencer = silencer->next) { + if(likely(i)) buffer_strcat(wb, ","); + buffer_strcat(wb, "\n\t\t{"); + j=health_silencers2json_entry(wb, HEALTH_ALARM_KEY, silencer->alarms, j); + j=health_silencers2json_entry(wb, HEALTH_CHART_KEY, silencer->charts, j); + j=health_silencers2json_entry(wb, HEALTH_CONTEXT_KEY, silencer->contexts, j); + j=health_silencers2json_entry(wb, HEALTH_HOST_KEY, silencer->hosts, j); + health_silencers2json_entry(wb, HEALTH_FAMILIES_KEY, silencer->families, j); + j=0; + buffer_strcat(wb, "\n\t\t}"); + i++; + } + if(likely(i)) buffer_strcat(wb, "\n\t"); + buffer_strcat(wb, "]\n}\n"); +} +/** + * Silencer to FILE + * + * Write the silencer buffer to a file. + * @param wb + */ +void health_silencers2file(BUFFER *wb) { + if (wb->len == 0) return; + + FILE *fd = fopen(silencers_filename, "wb"); + if(fd) { + size_t written = (size_t)fprintf(fd, "%s", wb->buffer) ; + if (written == wb->len ) { + info("Silencer changes written to %s", silencers_filename); + } + fclose(fd); + return; + } + error("Silencer changes could not be written to %s. Error %s", silencers_filename, strerror(errno)); +} + +/** + * Request V1 MGMT Health + * + * Function called by the API to manage health. + * + * @param host main structure with client information! + * @param w is the structure with all information of the client request. + * @param url is the url that netdata is working + * + * @return It returns 200 on success and another code otherwise. 
+ */ int web_client_api_request_v1_mgmt_health(RRDHOST *host, struct web_client *w, char *url) { int ret = 400; (void) host; - - BUFFER *wb = w->response.data; buffer_flush(wb); wb->contenttype = CT_TEXT_PLAIN; buffer_flush(w->response.data); - static uint32_t - hash_alarm = 0, - hash_template = 0, - hash_chart = 0, - hash_context = 0, - hash_host = 0, - hash_families = 0; - - if (unlikely(!hash_alarm)) { - hash_alarm = simple_uhash(HEALTH_ALARM_KEY); - hash_template = simple_uhash(HEALTH_TEMPLATE_KEY); - hash_chart = simple_uhash(HEALTH_CHART_KEY); - hash_context = simple_uhash(HEALTH_CONTEXT_KEY); - hash_host = simple_uhash(HEALTH_HOST_KEY); - hash_families = simple_uhash(HEALTH_FAMILIES_KEY); - } - + //Local instance of the silencer SILENCER *silencer = NULL; + int config_changed = 1; if (!w->auth_bearer_token) { buffer_strcat(wb, HEALTH_CMDAPI_MSG_AUTHERROR); @@ -105,50 +170,17 @@ int web_client_api_request_v1_mgmt_health(RRDHOST *host, struct web_client *w, c free_silencers(silencers->silencers); silencers->silencers = NULL; buffer_strcat(wb, HEALTH_CMDAPI_MSG_RESET); + } else if (!strcmp(value, HEALTH_CMDAPI_CMD_LIST)) { + w->response.data->contenttype = CT_APPLICATION_JSON; + health_silencers2json(wb); + config_changed=0; } } else { - uint32_t hash = simple_uhash(key); - if (unlikely(silencer == NULL)) { - if ( - (hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) || - (hash == hash_template && !strcasecmp(key, HEALTH_TEMPLATE_KEY)) || - (hash == hash_chart && !strcasecmp(key, HEALTH_CHART_KEY)) || - (hash == hash_context && !strcasecmp(key, HEALTH_CONTEXT_KEY)) || - (hash == hash_host && !strcasecmp(key, HEALTH_HOST_KEY)) || - (hash == hash_families && !strcasecmp(key, HEALTH_FAMILIES_KEY)) - ) { - silencer = create_silencer(); - } - } - - if (hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) { - silencer->alarms = strdupz(value); - silencer->alarms_pattern = simple_pattern_create(silencer->alarms, NULL, SIMPLE_PATTERN_EXACT); - } else if 
(hash == hash_chart && !strcasecmp(key, HEALTH_CHART_KEY)) { - silencer->charts = strdupz(value); - silencer->charts_pattern = simple_pattern_create(silencer->charts, NULL, SIMPLE_PATTERN_EXACT); - } else if (hash == hash_context && !strcasecmp(key, HEALTH_CONTEXT_KEY)) { - silencer->contexts = strdupz(value); - silencer->contexts_pattern = simple_pattern_create(silencer->contexts, NULL, SIMPLE_PATTERN_EXACT); - } else if (hash == hash_host && !strcasecmp(key, HEALTH_HOST_KEY)) { - silencer->hosts = strdupz(value); - silencer->hosts_pattern = simple_pattern_create(silencer->hosts, NULL, SIMPLE_PATTERN_EXACT); - } else if (hash == hash_families && !strcasecmp(key, HEALTH_FAMILIES_KEY)) { - silencer->families = strdupz(value); - silencer->families_pattern = simple_pattern_create(silencer->families, NULL, SIMPLE_PATTERN_EXACT); - } else { - buffer_strcat(wb, HEALTH_CMDAPI_MSG_INVALID_KEY); - } + silencer = health_silencers_addparam(silencer, key, value); } - } if (likely(silencer)) { - // Add the created instance to the linked list in silencers - silencer->next = silencers->silencers; - silencers->silencers = silencer; - debug(D_HEALTH, "HEALTH command API: Added silencer %s:%s:%s:%s:%s", silencer->alarms, - silencer->charts, silencer->contexts, silencer->hosts, silencer->families - ); + health_silencers_add(silencer); buffer_strcat(wb, HEALTH_CMDAPI_MSG_ADDED); if (silencers->stype == STYPE_NONE) { buffer_strcat(wb, HEALTH_CMDAPI_MSG_STYPEWARNING); @@ -162,5 +194,11 @@ int web_client_api_request_v1_mgmt_health(RRDHOST *host, struct web_client *w, c } w->response.data = wb; buffer_no_cacheable(w->response.data); + if (ret == 200 && config_changed) { + BUFFER *jsonb = buffer_create(200); + health_silencers2json(jsonb); + health_silencers2file(jsonb); + } + return ret; } diff --git a/web/api/health/health_cmdapi.h b/web/api/health/health_cmdapi.h index d0f30401..d8ec6aaa 100644 --- a/web/api/health/health_cmdapi.h +++ b/web/api/health/health_cmdapi.h @@ -12,6 +12,7 @@ 
#define HEALTH_CMDAPI_CMD_SILENCE "SILENCE" #define HEALTH_CMDAPI_CMD_DISABLE "DISABLE" #define HEALTH_CMDAPI_CMD_RESET "RESET" +#define HEALTH_CMDAPI_CMD_LIST "LIST" #define HEALTH_CMDAPI_MSG_AUTHERROR "Auth Error\n" #define HEALTH_CMDAPI_MSG_SILENCEALL "All alarm notifications are silenced\n" @@ -20,7 +21,6 @@ #define HEALTH_CMDAPI_MSG_DISABLE "Health checks disabled for alarms matching the selectors\n" #define HEALTH_CMDAPI_MSG_SILENCE "Alarm notifications silenced for alarms matching the selectors\n" #define HEALTH_CMDAPI_MSG_ADDED "Alarm selector added\n" -#define HEALTH_CMDAPI_MSG_INVALID_KEY "Invalid key. Ignoring it.\n" #define HEALTH_CMDAPI_MSG_STYPEWARNING "WARNING: Added alarm selector to silence/disable alarms without a SILENCE or DISABLE command.\n" #define HEALTH_CMDAPI_MSG_NOSELECTORWARNING "WARNING: SILENCE or DISABLE command is ineffective without defining any alarm selectors.\n" diff --git a/web/api/netdata-swagger.json b/web/api/netdata-swagger.json index 2fa55c4f..63bc5638 100644 --- a/web/api/netdata-swagger.json +++ b/web/api/netdata-swagger.json @@ -77,6 +77,39 @@ } } }, + "/alarm_variables": { + "get": { + "summary": "List variables available to configure alarms for a chart", + "description": "Returns the basic information of a chart and all the variables that can be used in alarm and template health configurations for the particular chart or family", + "parameters": [ + { + "name": "chart", + "in": "query", + "description": "The id of the chart as returned by the /charts call.", + "required": true, + "type": "string", + "format": "as returned by /charts" + } + ], + "responses": { + "200": { + "description": "A javascript object with information about the chart and the available variables", + "schema": { + "$ref": "#/definitions/alarm_variables" + } + }, + "400": { + "description": "Bad request - the body will include a message stating what is wrong." + }, + "404": { + "description": "No chart with the given id is found." 
+ }, + "500": { + "description": "Internal server error. This usually means the server is out of memory." + } + } + } + }, "/data": { "get": { "summary": "Get collected data for a specific chart", @@ -631,7 +664,7 @@ { "name": "cmd", "in": "query", - "description": "DISABLE ALL: No alarm criteria are evaluated, nothing is written in the alarm log. SILENCE ALL: No notifications are sent. RESET: Return to the default state. DISABLE/SILENCE: Set the mode to be used for the alarms matching the criteria of the alarm selectors.", + "description": "DISABLE ALL: No alarm criteria are evaluated, nothing is written in the alarm log. SILENCE ALL: No notifications are sent. RESET: Return to the default state. DISABLE/SILENCE: Set the mode to be used for the alarms matching the criteria of the alarm selectors. LIST: Show active configuration.", "required": false, "type": "string", "enum": [ @@ -639,7 +672,8 @@ "SILENCE ALL", "DISABLE", "SILENCE", - "RESET" + "RESET", + "LIST" ] }, { @@ -951,6 +985,70 @@ } } }, + "alarm_variables": { + "type": "object", + "properties": { + "chart": { + "type": "string", + "description": "The unique id of the chart" + }, + "chart_name": { + "type": "string", + "description": "The name of the chart" + }, + "cnart_context": { + "type": "string", + "description": "The context of the chart. It is shared across multiple monitored software or hardware instances and used in alarm templates" + }, + "family": { + "type": "string", + "description": "The family of the chart." + }, + "host": { + "type": "string", + "description": "The host containing the chart." 
+ }, + "chart_variables": { + "type": "object", + "properties": { + "varname1": { + "type": "number", + "format": "float" + }, + "varname2": { + "type": "number", + "format": "float" + } + } + }, + "family_variables": { + "type": "object", + "properties": { + "varname1": { + "type": "number", + "format": "float" + }, + "varname2": { + "type": "number", + "format": "float" + } + } + }, + "host_variables": { + "type": "object", + "properties": { + "varname1": { + "type": "number", + "format": "float" + }, + "varname2": { + "type": "number", + "format": "float" + } + } + } + } + }, "dimension": { "type": "object", "properties": { @@ -1208,6 +1306,14 @@ "crit_parsed": { "type": "string" }, + "warn_repeat_every": { + "type": "integer", + "format": "int32" + }, + "crit_repeat_every": { + "type": "integer", + "format": "int32" + }, "green": { "type": "string", "format": "nullable" diff --git a/web/api/netdata-swagger.yaml b/web/api/netdata-swagger.yaml index c021efef..3386e01a 100644 --- a/web/api/netdata-swagger.yaml +++ b/web/api/netdata-swagger.yaml @@ -63,6 +63,28 @@ paths: $ref: '#/definitions/chart' '404': description: 'No chart with the given id is found.' + /alarm_variables: + get: + summary: 'List variables available to configure alarms for a chart' + description: 'Returns the basic information of a chart and all the variables that can be used in alarm and template health configurations for the particular chart or family' + parameters: + - name: chart + in: query + description: 'The id of the chart as returned by the /charts call.' + required: true + type: string + format: 'as returned by /charts' + responses: + '200': + description: 'A javascript object with information about the chart and the available variables' + schema: + $ref: '#/definitions/alarm_variables' + '400': + description: 'Bad request - the body will include a message stating what is wrong.' + '404': + description: 'No chart with the given id is found.' 
+ '500': + description: 'Internal server error. This usually means the server is out of memory.' /data: get: summary: 'Get collected data for a specific chart' @@ -415,10 +437,10 @@ paths: parameters: - name: cmd in: query - description: 'DISABLE ALL: No alarm criteria are evaluated, nothing is written in the alarm log. SILENCE ALL: No notifications are sent. RESET: Return to the default state. DISABLE/SILENCE: Set the mode to be used for the alarms matching the criteria of the alarm selectors.' + description: 'DISABLE ALL: No alarm criteria are evaluated, nothing is written in the alarm log. SILENCE ALL: No notifications are sent. RESET: Return to the default state. DISABLE/SILENCE: Set the mode to be used for the alarms matching the criteria of the alarm selectors. LIST: Show active configuration.' required: false type: string - enum: ['DISABLE ALL', 'SILENCE ALL', 'DISABLE', 'SILENCE', 'RESET'] + enum: ['DISABLE ALL', 'SILENCE ALL', 'DISABLE', 'SILENCE', 'RESET', 'LIST'] - name: alarm in: query description: 'The expression provided will match both `alarm` and `template` names.' @@ -638,6 +660,51 @@ definitions: red: type: number description: 'Chart health red trheshold' + alarm_variables: + type: object + properties: + chart: + type: string + description: 'The unique id of the chart' + chart_name: + type: string + description: 'The name of the chart' + cnart_context: + type: string + description: 'The context of the chart. It is shared across multiple monitored software or hardware instances and used in alarm templates' + family: + type: string + description: 'The family of the chart.' + host: + type: string + description: 'The host containing the chart.' 
+ chart_variables: + type: object + properties: + varname1: + type: number + format: float + varname2: + type: number + format: float + family_variables: + type: object + properties: + varname1: + type: number + format: float + varname2: + type: number + format: float + host_variables: + type: object + properties: + varname1: + type: number + format: float + varname2: + type: number + format: float dimension: type: object properties: @@ -825,6 +892,12 @@ definitions: type: string crit_parsed: type: string + warn_repeat_every: + type: integer + format: int32 + crit_repeat_every: + type: integer + format: int32 green: type: string format: nullable diff --git a/web/gui/Makefile.am b/web/gui/Makefile.am index 7d1ceef9..ef8aa05f 100644 --- a/web/gui/Makefile.am +++ b/web/gui/Makefile.am @@ -61,6 +61,7 @@ dist_web_DATA = \ index.html \ main.css \ main.js \ + console.html \ infographic.html \ robots.txt \ refresh-badges.js \ @@ -69,12 +70,6 @@ dist_web_DATA = \ version.txt \ $(NULL) - -webconsoledir=$(webdir)/console -dist_webconsole_DATA = \ - console/index.html \ - $(NULL) - webstaticdir=$(webdir)/static/img dist_webstatic_DATA = \ static/img/netdata-logomark.svg \ diff --git a/web/gui/console/index.html b/web/gui/console.html index 72320191..942c8c3c 100644 --- a/web/gui/console/index.html +++ b/web/gui/console.html @@ -10,7 +10,7 @@ <meta name="viewport" content="width=device-width, initial-scale=1" /> <meta name="apple-mobile-web-app-capable" content="yes" /> <meta name="apple-mobile-web-app-status-bar-style" content="black-translucent" /> - <link rel="icon" href="/favicon.ico" /> + <link rel="icon" href="favicon.ico" /> <link rel="stylesheet" href="https://fonts.googleapis.com/icon?family=Material+Icons" /> <!-- Google Tag Manager --> <script>(function (w, d, s, l, i) { @@ -66,7 +66,7 @@ var netdataTheme = "white" var netdataNoBootstrap = true </script> - <script type="text/javascript" src="/dashboard.js?v20190515"></script> + <script type="text/javascript" 
src="dashboard.js?v20190523"></script> </body> </html>
\ No newline at end of file diff --git a/web/gui/dashboard_info.js b/web/gui/dashboard_info.js index aab11ba3..0013311e 100644 --- a/web/gui/dashboard_info.js +++ b/web/gui/dashboard_info.js @@ -267,6 +267,12 @@ netdataDashboard.menu = { info: 'Performance metrics for <b>RetroShare</b>. RetroShare is open source software for encrypted filesharing, serverless email, instant messaging, online chat, and BBS, based on a friend-to-friend network built on GNU Privacy Guard (GPG).' }, + 'riakkv': { + title: 'Riak KV', + icon: '<i class="fas fa-database"></i>', + info: 'Metrics for <b>Riak KV</b>, the distributed key-value store.' + }, + 'ipfs': { title: 'IPFS', icon: '<i class="fas fa-folder-open"></i>', @@ -279,6 +285,13 @@ netdataDashboard.menu = { info: 'Performance metrics for <b>PHP-FPM</b>, an alternative FastCGI implementation for PHP.' }, + 'pihole': { + title: 'Pi-hole', + icon: '<i class="fas fa-ban"></i>', + info: 'Metrics for <a href="https://pi-hole.net/" target="_blank">Pi-hole</a>, a black hole for Internet advertisements.' + + ' The metrics returned by Pi-Hole API is all from the last 24 hours.' + }, + 'portcheck': { title: 'Port Check', icon: '<i class="fas fa-heartbeat"></i>', @@ -324,7 +337,7 @@ netdataDashboard.menu = { 'web_log': { title: undefined, icon: '<i class="fas fa-file-alt"></i>', - info: 'Information extracted from a server log file. <code>web_log</code> plugin incrementally parses the server log file to provide, in real-time, a break down of key server performance metrics. For web servers, an extended log file format may optionally be used (for <code>nginx</code> and <code>apache</code>) offering timing information and bandwidth for both requests and responses. <code>web_log</code> plugin may also be configured to provide a break down of requests per URL pattern (check <a href="https://github.com/netdata/netdata/blob/master/conf.d/python.d/web_log.conf" target="_blank"><code>/etc/netdata/python.d/web_log.conf</code></a>).' 
+ info: 'Information extracted from a server log file. <code>web_log</code> plugin incrementally parses the server log file to provide, in real-time, a break down of key server performance metrics. For web servers, an extended log file format may optionally be used (for <code>nginx</code> and <code>apache</code>) offering timing information and bandwidth for both requests and responses. <code>web_log</code> plugin may also be configured to provide a break down of requests per URL pattern (check <a href="https://github.com/netdata/netdata/blob/master/collectors/python.d.plugin/web_log/web_log.conf" target="_blank"><code>/etc/netdata/python.d/web_log.conf</code></a>).' }, 'named': { @@ -461,6 +474,18 @@ netdataDashboard.menu = { title: '', icon: '<i class="fas fa-th-large"></i>', info: 'Xen domain resource utilization metrics. Netdata reads this information using <b>xenstat</b> library which gives access to the resource usage information (CPU, memory, disk I/O, network) for a virtual machine.' + }, + + 'wmi': { + title: 'wmi', + icon: '<i class="fas fa-server"></i>', + info: undefined + }, + + 'perf': { + title: 'Perf Counters', + icon: '<i class="fas fa-tachometer-alt"></i>', + info: 'Performance Monitoring Counters (PMC). Data collected using <b>perf_event_open()</b> system call which utilises Hardware Performance Monitoring Units (PMU).' } }; @@ -515,7 +540,7 @@ netdataDashboard.submenu = { }, 'web_log.urls': { - info: 'Number of requests for each <code>URL pattern</code> defined in <a href="https://github.com/netdata/netdata/blob/master/conf.d/python.d/web_log.conf" target="_blank"><code>/etc/netdata/python.d/web_log.conf</code></a>. This chart counts all requests matching the URL patterns defined, independently of the web server response codes (i.e. both successful and unsuccessful).' 
+ info: 'Number of requests for each <code>URL pattern</code> defined in <a href="https://github.com/netdata/netdata/blob/master/collectors/python.d.plugin/web_log/web_log.conf" target="_blank"><code>/etc/netdata/python.d/web_log.conf</code></a>. This chart counts all requests matching the URL patterns defined, independently of the web server response codes (i.e. both successful and unsuccessful).' }, 'web_log.clients': { @@ -1163,6 +1188,10 @@ netdataDashboard.context = { '</ul>' }, + 'mysql.innodb_deadlocks': { + info: 'A deadlock happens when two or more transactions mutually hold and request for locks, creating a cycle of dependencies. For more information about <a href="https://dev.mysql.com/doc/refman/5.7/en/innodb-deadlocks-handling.html" target="_blank">how to minimize and handle deadlocks</a>.' + }, + // ------------------------------------------------------------------------ // POSTGRESQL @@ -1806,7 +1835,7 @@ netdataDashboard.context = { }, 'web_log.clients_all': { - info: 'Unique client IPs accessing the web server since the last restart of netdata. This plugin keeps in memory all the unique IPs that have accessed the web server. On very busy web servers (several millions of unique IPs) you may want to disable this chart (check <a href="https://github.com/netdata/netdata/blob/master/conf.d/python.d/web_log.conf" target="_blank"><code>/etc/netdata/python.d/web_log.conf</code></a>).' + info: 'Unique client IPs accessing the web server since the last restart of netdata. This plugin keeps in memory all the unique IPs that have accessed the web server. On very busy web servers (several millions of unique IPs) you may want to disable this chart (check <a href="https://github.com/netdata/netdata/blob/master/collectors/python.d.plugin/web_log/web_log.conf" target="_blank"><code>/etc/netdata/python.d/web_log.conf</code></a>).' 
}, // ------------------------------------------------------------------------ @@ -1937,7 +1966,7 @@ netdataDashboard.context = { }, 'web_log.squid_clients_all': { - info: 'Unique client IPs accessing squid since the last restart of netdata. This plugin keeps in memory all the unique IPs that have accessed the server. On very busy squid servers (several millions of unique IPs) you may want to disable this chart (check <a href="https://github.com/netdata/netdata/blob/master/conf.d/python.d/web_log.conf" target="_blank"><code>/etc/netdata/python.d/web_log.conf</code></a>).' + info: 'Unique client IPs accessing squid since the last restart of netdata. This plugin keeps in memory all the unique IPs that have accessed the server. On very busy squid servers (several millions of unique IPs) you may want to disable this chart (check <a href="https://github.com/netdata/netdata/blob/master/collectors/python.d.plugin/web_log/web_log.conf" target="_blank"><code>/etc/netdata/python.d/web_log.conf</code></a>).' }, 'web_log.squid_transport_methods': { diff --git a/web/gui/demosites.html b/web/gui/demosites.html index f908e0b4..e00fbbfd 100644 --- a/web/gui/demosites.html +++ b/web/gui/demosites.html @@ -2,6 +2,7 @@ <!-- SPDX-License-Identifier: GPL-3.0-or-later --> <html lang=en-us xmlns="http://www.w3.org/1999/html"> <head> + <meta http-equiv="Refresh" content="0; url=https://www.netdata.cloud"> <meta charset=utf-8> <title>NetData: Get control of your Linux Servers. Simple. Effective. 
Awesome.</title> <meta name=author content="Costa Tsaousis"> diff --git a/web/gui/favicon.ico b/web/gui/favicon.ico Binary files differindex 857c582d..064032ae 100644 --- a/web/gui/favicon.ico +++ b/web/gui/favicon.ico diff --git a/web/gui/images/android-icon-144x144.png b/web/gui/images/android-icon-144x144.png Binary files differindex c3013cc9..69efa5a2 100644 --- a/web/gui/images/android-icon-144x144.png +++ b/web/gui/images/android-icon-144x144.png diff --git a/web/gui/images/android-icon-192x192.png b/web/gui/images/android-icon-192x192.png Binary files differindex 77d18d9c..e5744357 100644 --- a/web/gui/images/android-icon-192x192.png +++ b/web/gui/images/android-icon-192x192.png diff --git a/web/gui/images/android-icon-36x36.png b/web/gui/images/android-icon-36x36.png Binary files differindex 74576f6b..4ba804d9 100644 --- a/web/gui/images/android-icon-36x36.png +++ b/web/gui/images/android-icon-36x36.png diff --git a/web/gui/images/android-icon-48x48.png b/web/gui/images/android-icon-48x48.png Binary files differindex 5666fa10..04970d4b 100644 --- a/web/gui/images/android-icon-48x48.png +++ b/web/gui/images/android-icon-48x48.png diff --git a/web/gui/images/android-icon-72x72.png b/web/gui/images/android-icon-72x72.png Binary files differindex 7f7043f1..5cbc701e 100644 --- a/web/gui/images/android-icon-72x72.png +++ b/web/gui/images/android-icon-72x72.png diff --git a/web/gui/images/android-icon-96x96.png b/web/gui/images/android-icon-96x96.png Binary files differindex 1bbf594d..21f27cea 100644 --- a/web/gui/images/android-icon-96x96.png +++ b/web/gui/images/android-icon-96x96.png diff --git a/web/gui/images/apple-icon-114x114.png b/web/gui/images/apple-icon-114x114.png Binary files differindex 7d093e85..7993e055 100644 --- a/web/gui/images/apple-icon-114x114.png +++ b/web/gui/images/apple-icon-114x114.png diff --git a/web/gui/images/apple-icon-120x120.png b/web/gui/images/apple-icon-120x120.png Binary files differindex d4c38e7b..3fbe8fda 100644 --- 
a/web/gui/images/apple-icon-120x120.png +++ b/web/gui/images/apple-icon-120x120.png diff --git a/web/gui/images/apple-icon-144x144.png b/web/gui/images/apple-icon-144x144.png Binary files differindex c3013cc9..8d465692 100644 --- a/web/gui/images/apple-icon-144x144.png +++ b/web/gui/images/apple-icon-144x144.png diff --git a/web/gui/images/apple-icon-152x152.png b/web/gui/images/apple-icon-152x152.png Binary files differindex c92f3817..11a10723 100644 --- a/web/gui/images/apple-icon-152x152.png +++ b/web/gui/images/apple-icon-152x152.png diff --git a/web/gui/images/apple-icon-180x180.png b/web/gui/images/apple-icon-180x180.png Binary files differindex 1a58fdbb..314efb12 100644 --- a/web/gui/images/apple-icon-180x180.png +++ b/web/gui/images/apple-icon-180x180.png diff --git a/web/gui/images/apple-icon-57x57.png b/web/gui/images/apple-icon-57x57.png Binary files differindex 36c273ce..85283616 100644 --- a/web/gui/images/apple-icon-57x57.png +++ b/web/gui/images/apple-icon-57x57.png diff --git a/web/gui/images/apple-icon-60x60.png b/web/gui/images/apple-icon-60x60.png Binary files differindex c3c48c8b..2662e85d 100644 --- a/web/gui/images/apple-icon-60x60.png +++ b/web/gui/images/apple-icon-60x60.png diff --git a/web/gui/images/apple-icon-72x72.png b/web/gui/images/apple-icon-72x72.png Binary files differindex 7f7043f1..4a6b056e 100644 --- a/web/gui/images/apple-icon-72x72.png +++ b/web/gui/images/apple-icon-72x72.png diff --git a/web/gui/images/apple-icon-76x76.png b/web/gui/images/apple-icon-76x76.png Binary files differindex b5e73cd4..c2bf6c9f 100644 --- a/web/gui/images/apple-icon-76x76.png +++ b/web/gui/images/apple-icon-76x76.png diff --git a/web/gui/images/apple-icon-precomposed.png b/web/gui/images/apple-icon-precomposed.png Binary files differindex f69945bf..9c3e73ef 100644 --- a/web/gui/images/apple-icon-precomposed.png +++ b/web/gui/images/apple-icon-precomposed.png diff --git a/web/gui/images/apple-icon.png b/web/gui/images/apple-icon.png Binary files 
differindex f69945bf..9c3e73ef 100644 --- a/web/gui/images/apple-icon.png +++ b/web/gui/images/apple-icon.png diff --git a/web/gui/images/banner-icon-144x144.png b/web/gui/images/banner-icon-144x144.png Binary files differindex c3013cc9..fef3dca1 100644 --- a/web/gui/images/banner-icon-144x144.png +++ b/web/gui/images/banner-icon-144x144.png diff --git a/web/gui/images/favicon-128.png b/web/gui/images/favicon-128.png Binary files differnew file mode 100644 index 00000000..5371f920 --- /dev/null +++ b/web/gui/images/favicon-128.png diff --git a/web/gui/images/favicon-16x16.png b/web/gui/images/favicon-16x16.png Binary files differindex 43eb188f..5729f5a2 100644 --- a/web/gui/images/favicon-16x16.png +++ b/web/gui/images/favicon-16x16.png diff --git a/web/gui/images/favicon-196x196.png b/web/gui/images/favicon-196x196.png Binary files differnew file mode 100644 index 00000000..a208c27f --- /dev/null +++ b/web/gui/images/favicon-196x196.png diff --git a/web/gui/images/favicon-32x32.png b/web/gui/images/favicon-32x32.png Binary files differindex e657e921..cdb0a480 100644 --- a/web/gui/images/favicon-32x32.png +++ b/web/gui/images/favicon-32x32.png diff --git a/web/gui/images/favicon-96x96.png b/web/gui/images/favicon-96x96.png Binary files differindex 1bbf594d..dbe7dea2 100644 --- a/web/gui/images/favicon-96x96.png +++ b/web/gui/images/favicon-96x96.png diff --git a/web/gui/images/favicon.ico b/web/gui/images/favicon.ico Binary files differindex 7ed95725..064032ae 100644 --- a/web/gui/images/favicon.ico +++ b/web/gui/images/favicon.ico diff --git a/web/gui/images/ms-icon-144x144.png b/web/gui/images/ms-icon-144x144.png Binary files differindex c3013cc9..8d465692 100644 --- a/web/gui/images/ms-icon-144x144.png +++ b/web/gui/images/ms-icon-144x144.png diff --git a/web/gui/images/ms-icon-150x150.png b/web/gui/images/ms-icon-150x150.png Binary files differindex f0cf4128..4683d56a 100644 --- a/web/gui/images/ms-icon-150x150.png +++ b/web/gui/images/ms-icon-150x150.png diff 
--git a/web/gui/images/ms-icon-310x150.png b/web/gui/images/ms-icon-310x150.png Binary files differnew file mode 100644 index 00000000..5d4ac57b --- /dev/null +++ b/web/gui/images/ms-icon-310x150.png diff --git a/web/gui/images/ms-icon-310x310.png b/web/gui/images/ms-icon-310x310.png Binary files differindex 4f5f7e62..bdb591b2 100644 --- a/web/gui/images/ms-icon-310x310.png +++ b/web/gui/images/ms-icon-310x310.png diff --git a/web/gui/images/ms-icon-36x36.png b/web/gui/images/ms-icon-36x36.png Binary files differnew file mode 100644 index 00000000..e251302e --- /dev/null +++ b/web/gui/images/ms-icon-36x36.png diff --git a/web/gui/images/ms-icon-70x70.png b/web/gui/images/ms-icon-70x70.png Binary files differindex 70012c61..5371f920 100644 --- a/web/gui/images/ms-icon-70x70.png +++ b/web/gui/images/ms-icon-70x70.png diff --git a/web/gui/images/netdata-logomark.svg b/web/gui/images/netdata-logomark.svg index 87fb2bda..18152fb7 100644 --- a/web/gui/images/netdata-logomark.svg +++ b/web/gui/images/netdata-logomark.svg @@ -1,3 +1,8 @@ -<svg width="1723" height="1723" viewBox="0 0 1723 1723" fill="none" xmlns="http://www.w3.org/2000/svg"> -<path fill-rule="evenodd" clip-rule="evenodd" d="M0.628784 849.678C0.628784 473.909 235.042 153.621 563.766 30.7914C701.438 19.0613 843.892 50.2449 970.557 129.297C1052.47 180.42 1119.71 246.528 1170.96 321.982C1161.21 207.568 1122.97 96.4678 1058.94 0.187012C1220.56 38.587 1364.64 123.126 1476.91 239.343C1518.34 297.634 1548.55 365.545 1563.67 440.489C1578.54 514.244 1577.35 587.545 1562.5 656.661C1601.04 613.105 1632.22 563.24 1654.63 509.251C1698.41 613.852 1722.63 728.899 1722.63 849.678C1722.63 1331.55 1337.15 1722.19 861.629 1722.19C386.112 1722.19 0.628784 1331.55 0.628784 849.678ZM1178.87 1369.04C1286.71 1369.04 1374.13 1280.45 1374.13 1171.17C1374.13 1061.88 1286.71 973.293 1178.87 973.293C1071.03 973.293 983.603 1061.88 983.603 1171.17C983.603 1280.45 1071.03 1369.04 1178.87 1369.04Z" fill="#00C853"/> -</svg> +<?xml 
version="1.0" encoding="UTF-8"?> +<svg width="500px" height="500px" viewBox="0 0 500 500" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"> + <g id="Artboard" stroke="none" stroke-width="1" fill="none" fill-rule="evenodd"> + <g id="logo_green_fill" transform="translate(0.000000, 49.000000)" fill="#00FF00" fill-rule="nonzero"> + <path d="M307.477876,400.442478 L206.814159,400.442478 L0.486725664,2.21238938 L293.318584,2.21238938 C407.146221,2.33432589 499.391338,94.5794427 499.513274,208.40708 C499.391356,314.41476 413.485557,400.32056 307.477876,400.442478 L307.477876,400.442478 Z" id="Path"></path> + </g> + </g> +</svg>
\ No newline at end of file diff --git a/web/gui/index.html b/web/gui/index.html index c9dd89b2..4a8647dd 100644 --- a/web/gui/index.html +++ b/web/gui/index.html @@ -15,8 +15,7 @@ <link rel="stylesheet" type="text/css" href="main.css?v=5"> - <link rel="icon" href=""> - + <link rel="icon" href="" /> <!-- <link rel="apple-touch-icon" sizes="57x57" href="images/apple-icon-57x57.png"> <link rel="apple-touch-icon" sizes="60x60" href="images/apple-icon-60x60.png"> <link rel="apple-touch-icon" sizes="72x72" href="images/apple-icon-72x72.png"> @@ -124,7 +123,7 @@ </div> <nav class="collapse navbar-collapse navbar-right" role="navigation"> <ul class="nav navbar-nav"> - <li title="Nodes view" data-toggle="tooltip" data-placement="bottom"><a onclick="openAuthenticatedUrl('console/index.html');" class="btn" target="_blank"><i class="fas fa-tv"></i> <span class="hidden-sm hidden-md">Nodes<sup class="beta"> beta</sup></span></a></li> + <li title="Nodes view" data-toggle="tooltip" data-placement="bottom"><a onclick="openAuthenticatedUrl('console.html');" class="btn" target="_blank"><i class="fas fa-tv"></i> <span class="hidden-sm hidden-md">Nodes<sup class="beta"> beta</sup></span></a></li> <li id="alarmsButton" title="check the health monitoring alarms and their log" data-toggle="tooltip" data-placement="bottom"><a href="#" class="btn" data-toggle="modal" data-target="#alarmsModal"><i class="fas fa-bell"></i> <span class="hidden-sm hidden-md">Alarms </span><span id="alarms_count_badge" class="badge"></span></a></li> <li title="change dashboard settings" data-toggle="tooltip" data-placement="bottom"><a href="#" class="btn" data-toggle="modal" data-target="#optionsModal"><i class="fas fa-cog"></i> <span class="hidden-sm hidden-md">Settings</span></a></li> <li title="check for netdata updates<br/>you should keep your netdata updated" data-toggle="tooltip" data-placement="bottom" class="hidden-sm" id="updateButton"><a href="#" class="btn" data-toggle="modal" 
data-target="#updateModal"><i class="fas fa-cloud-download-alt"></i> <span class="hidden-sm hidden-md">Update </span><span id="update_badge" class="badge"></span></a></li> @@ -1371,6 +1370,6 @@ </div> <iframe id="ssoifrm" width="0" height="0"></iframe> <div id="hiddenDownloadLinks" style="display: none;" hidden></div> - <script type="text/javascript" src="dashboard.js?v20190130-1"></script> + <script type="text/javascript" src="dashboard.js?v20190621-1"></script> </body> </html> diff --git a/web/gui/main.css b/web/gui/main.css index 57115414..2ddb776e 100644 --- a/web/gui/main.css +++ b/web/gui/main.css @@ -322,6 +322,7 @@ body.modal-open { .sidebar-body { position: absolute; display: none; + height: 100vh; } .dashboard-section-container { diff --git a/web/gui/main.js b/web/gui/main.js index 277ae840..65c4d4a8 100644 --- a/web/gui/main.js +++ b/web/gui/main.js @@ -775,7 +775,7 @@ function renderMyNetdataMenu(machinesArray) { html += ( `<div class="agent-item"> <i class="fas fa-tv"></i> - <a onClick="openAuthenticatedUrl('console/index.html');" target="_blank">Nodes<sup class="beta"> beta</sup></a> + <a onClick="openAuthenticatedUrl('console.html');" target="_blank">Nodes<sup class="beta"> beta</sup></a> <div></div> </div> <div class="agent-item"> @@ -793,7 +793,7 @@ function renderMyNetdataMenu(machinesArray) { html += ( `<div class="agent-item"> <i class="fas fa-tv"></i> - <a onclick="openAuthenticatedUrl('console/index.html');" target="_blank">Nodes<sup class="beta"> beta</sup></a> + <a onclick="openAuthenticatedUrl('console.html');" target="_blank">Nodes<sup class="beta"> beta</sup></a> <div></div> </div> <div class="agent-item"> @@ -1993,7 +1993,7 @@ function clipboardCopyBadgeEmbed(url) { function alarmsUpdateModal() { var active = '<h3>Raised Alarms</h3><table class="table">'; var all = '<h3>All Running Alarms</h3><div class="panel-group" id="alarms_all_accordion" role="tablist" aria-multiselectable="true">'; - var footer = '<hr/><a 
href="https://github.com/netdata/netdata/tree/master/web/api/badges#netdata-badges" target="_blank">netdata badges</a> refresh automatically. Their color indicates the state of the alarm: <span style="color: #e05d44"><b> red </b></span> is critical, <span style="color:#fe7d37"><b> orange </b></span> is warning, <span style="color: #4c1"><b> bright green </b></span> is ok, <span style="color: #9f9f9f"><b> light grey </b></span> is undefined (i.e. no data or no status), <span style="color: #000"><b> black </b></span> is not initialized. You can copy and paste their URLs to embed them in any web page.<br/>netdata can send notifications for these alarms. Check <a href="https://github.com/netdata/netdata/blob/master/health/notifications/health_alarm_notify.conf">this configuration file</a> for more information.'; + var footer = '<hr/><a href="https://github.com/netdata/netdata/tree/master/web/api/badges#netdata-badges" target="_blank">netdata badges</a> refresh automatically. Their color indicates the state of the alarm: <span style="color: #e05d44"><b> red </b></span> is critical, <span style="color:#fe7d37"><b> orange </b></span> is warning, <span style="color: #4c1"><b> bright green </b></span> is ok, <span style="color: #9f9f9f"><b> light grey </b></span> is undefined (i.e. no data or no status), <span style="color: #000"><b> black </b></span> is not initialized. You can copy and paste their URLs to embed them in any web page.<br/>netdata can send notifications for these alarms. Check <a href="https://github.com/netdata/netdata/blob/master/health/notifications/health_alarm_notify.conf" target="_blank">this configuration file</a> for more information.'; loadClipboard(function () { }); @@ -2100,6 +2100,14 @@ function alarmsUpdateModal() { + ((chart.red !== null) ? 
('<tr><td width="10%" style="text-align:right">red threshold</td><td><code>' + chart.red + ' ' + units + '</code></td></tr>') : ''); } + if (alarm.warn_repeat_every > 0) { + html += '<tr><td width="10%" style="text-align:right">repeat warning</td><td>' + NETDATA.seconds4human(alarm.warn_repeat_every) + '</td></tr>'; + } + + if (alarm.crit_repeat_every > 0) { + html += '<tr><td width="10%" style="text-align:right">repeat critical</td><td>' + NETDATA.seconds4human(alarm.crit_repeat_every) + '</td></tr>'; + } + var delay = ''; if ((alarm.delay_up_duration > 0 || alarm.delay_down_duration > 0) && alarm.delay_multiplier !== 0 && alarm.delay_max_duration > 0) { if (alarm.delay_up_duration === alarm.delay_down_duration) { @@ -4840,12 +4848,11 @@ function renderAccountUI() { container.removeAttribute("title"); container.removeAttribute("data-original-title"); container.removeAttribute("data-placement"); - // <a href="/console/index.html#/charts/${NETDATA.registry.machine_guid}" target="_blank" class="btn"> container.innerHTML = ( `<a href="#" class="dropdown-toggle" data-toggle="dropdown"><span id="amc-account-name"></span> <strong class="caret"></strong></a> <ul id="cloud-menu" class="dropdown-menu scrollable-menu inpagemenu" role="menu"> <li> - <a onclick="openAuthenticatedUrl('console/index.html');" target="_blank" class="btn"> + <a onclick="openAuthenticatedUrl('console.html');" target="_blank" class="btn"> <i class="fas fa-tv"></i> <span class="hidden-sm hidden-md">Nodes<sup class="beta"> beta</sup></span> </a> </li> diff --git a/web/gui/static/static/img/netdata-logomark.svg b/web/gui/static/static/img/netdata-logomark.svg deleted file mode 100644 index 87fb2bda..00000000 --- a/web/gui/static/static/img/netdata-logomark.svg +++ /dev/null @@ -1,3 +0,0 @@ -<svg width="1723" height="1723" viewBox="0 0 1723 1723" fill="none" xmlns="http://www.w3.org/2000/svg"> -<path fill-rule="evenodd" clip-rule="evenodd" d="M0.628784 849.678C0.628784 473.909 235.042 153.621 563.766 
30.7914C701.438 19.0613 843.892 50.2449 970.557 129.297C1052.47 180.42 1119.71 246.528 1170.96 321.982C1161.21 207.568 1122.97 96.4678 1058.94 0.187012C1220.56 38.587 1364.64 123.126 1476.91 239.343C1518.34 297.634 1548.55 365.545 1563.67 440.489C1578.54 514.244 1577.35 587.545 1562.5 656.661C1601.04 613.105 1632.22 563.24 1654.63 509.251C1698.41 613.852 1722.63 728.899 1722.63 849.678C1722.63 1331.55 1337.15 1722.19 861.629 1722.19C386.112 1722.19 0.628784 1331.55 0.628784 849.678ZM1178.87 1369.04C1286.71 1369.04 1374.13 1280.45 1374.13 1171.17C1374.13 1061.88 1286.71 973.293 1178.87 973.293C1071.03 973.293 983.603 1061.88 983.603 1171.17C983.603 1280.45 1071.03 1369.04 1178.87 1369.04Z" fill="#00C853"/> -</svg> diff --git a/web/gui/tv.html b/web/gui/tv.html index bd549be9..58485b26 100644 --- a/web/gui/tv.html +++ b/web/gui/tv.html @@ -94,7 +94,7 @@ setTimeout(function(){ <div style="width: 100%; height: calc(100% - 15px); text-align: center; display: inline-block;"> <br/> <div data-netdata="system.cpu" - data-host="http://registry.my-netdata.io" + data-host="https://registry.my-netdata.io" data-title="CPU usage of registry.my-netdata.io" data-chart-library="dygraph" data-width="49%" @@ -120,7 +120,7 @@ setTimeout(function(){ </div> <div style="width: 100%; height: calc(100% - 15px); text-align: center; display: inline-block;"> <div data-netdata="system.io" - data-host="http://registry.my-netdata.io" + data-host="https://registry.my-netdata.io" data-common-max="io" data-common-min="io" data-title="I/O on registry.my-netdata.io" @@ -148,7 +148,7 @@ setTimeout(function(){ </div> <div style="width: 100%; height: calc(100% - 15px); text-align: center; display: inline-block;"> <div data-netdata="system.net" - data-host="http://registry.my-netdata.io" + data-host="https://registry.my-netdata.io" data-common-max="traffic" data-common-min="traffic" data-title="Network traffic on registry.my-netdata.io" @@ -178,7 +178,7 @@ setTimeout(function(){ registry.my-netdata.io 
<br/> <div data-netdata="netdata.requests" - data-host="http://registry.my-netdata.io" + data-host="https://registry.my-netdata.io" data-common-max="netdata-requests" data-decimal-digits="0" data-title="Chart Refreshes/s" @@ -189,7 +189,7 @@ setTimeout(function(){ data-points="300" ></div> <div data-netdata="netdata.clients" - data-host="http://registry.my-netdata.io" + data-host="https://registry.my-netdata.io" data-common-max="netdata-clients" data-decimal-digits="0" data-title="Sockets" @@ -204,7 +204,7 @@ setTimeout(function(){ data-dimensions="in" data-common-max="netdata-net-in" data-decimal-digits="0" - data-host="http://registry.my-netdata.io" + data-host="https://registry.my-netdata.io" data-title="Requests Traffic" data-chart-library="easypiechart" data-width="15%" @@ -216,7 +216,7 @@ setTimeout(function(){ data-dimensions="out" data-common-max="netdata-net-out" data-decimal-digits="0" - data-host="http://registry.my-netdata.io" + data-host="https://registry.my-netdata.io" data-title="Chart Data Traffic" data-chart-library="easypiechart" data-width="15%" diff --git a/web/server/README.md b/web/server/README.md index 7d74c181..df29f331 100644 --- a/web/server/README.md +++ b/web/server/README.md @@ -33,15 +33,15 @@ The ports to bind are controlled via `[web].bind to`, like this: ``` [web] default port = 19999 - bind to = 127.0.0.1=dashboard 10.1.1.1:19998=management|netdata.conf hostname:19997=badges [::]:19996=streaming localhost:19995=registry *:http=dashboard unix:/tmp/netdata.sock + bind to = 127.0.0.1=dashboard^SSL=optional 10.1.1.1:19998=management|netdata.conf hostname:19997=badges [::]:19996=streaming^SSL=force localhost:19995=registry *:http=dashboard unix:/tmp/netdata.sock ``` Using the above, netdata will bind to: -- IPv4 127.0.0.1 at port 19999 (port was used from `default port`). Only the UI (dashboard) and the read API will be accessible on this port. +- IPv4 127.0.0.1 at port 19999 (port was used from `default port`). 
Only the UI (dashboard) and the read API will be accessible on this port. Both HTTP and HTTPS requests will be accepted. - IPv4 10.1.1.1 at port 19998. The management API and netdata.conf will be accessible on this port. - All the IPs `hostname` resolves to (both IPv4 and IPv6 depending on the resolved IPs) at port 19997. Only badges will be accessible on this port. -- All IPv6 IPs at port 19996. Only metric streaming requests from other netdata agents will be accepted on this port. +- All IPv6 IPs at port 19996. Only metric streaming requests from other netdata agents will be accepted on this port. Only encrypted streams will be allowed (i.e. slaves also need to be [configured for TLS](../../streaming)). - All the IPs `localhost` resolves to (both IPv4 and IPv6 depending the resolved IPs) at port 19996. This port will only accept registry API requests. - All IPv4 and IPv6 IPs at port `http` as set in `/etc/services`. Only the UI (dashboard) and the read API will be accessible on this port. - Unix domain socket `/tmp/netdata.sock`. All requests are serviceable on this socket. @@ -57,6 +57,65 @@ The API requests are serviced as follows: - `badges` gives access only to the badges API calls. - `management` gives access only to the management API calls. +### Enabling TLS support + + +Netdata since version 1.16 supports encrypted HTTP connections to the web server and encryption of the data stream between a slave and a master. +Inbound unix socket connections are unaffected, regardless of the SSL settings. +To enable SSL, provide the path to your certificate and private key in the `[web]` section of `netdata.conf`: + +``` +[web] + ssl key = /etc/netdata/ssl/key.pem + ssl certificate = /etc/netdata/ssl/cert.pem +``` + +Both files must be readable by the netdata user. If either of the two files does not exist or is unreadable, Netdata falls back to HTTP. + +For a master/slave connection, only the master needs these settings. 
+ +For test purposes, you can generate self-signed certificates with the following command: + +``` +$ openssl req -newkey rsa:2048 -nodes -sha512 -x509 -days 365 -keyout key.pem -out cert.pem +``` + +TIP: If you use 4096 bits for the key and the certificate, netdata will need more CPU to process the whole communication. +rsa4096 can be up to 4 times slower than rsa2048, so we recommend using 2048 bits. You can verify the difference by running: + +``` +$ openssl speed rsa2048 rsa4096 +``` + +#### SSL enforcement + +When the certificates are defined and unless any other options are provided, a Netdata server will: +- Redirect all incoming HTTP web server requests to HTTPS. Applies to the dashboard, the API, netdata.conf and badges. +- Allow incoming slave connections to use both unencrypted and encrypted communications for streaming. + +To change this behavior, you need to modify the `bind to` setting in the `[web]` section of `netdata.conf`. +At the end of each port definition, you can append `^SSL=force` or `^SSL=optional`. What happens with these settings differs, depending on whether the port is used for HTTP/S requests, or for streaming. + +SSL setting | HTTP requests | HTTPS requests | Unencrypted Streams | Encrypted Streams +:------:|:-----:|:-----:|:-----:|:-------- +none | Redirected to HTTPS | Accepted | Accepted | Accepted +`force` | Redirected to HTTPS | Accepted | Denied | Accepted +`optional` | Accepted | Accepted | Accepted | Accepted + +Example: + +``` +[web] + bind to = *=dashboard|registry|badges|management|streaming|netdata.conf^SSL=force +``` + +For information on how to configure the slaves to use TLS, check [securing the communication](../../streaming#securing-the-communication) in the streaming documentation. +You will find there additional details on the expected behavior for client and server nodes, when their respective SSL options are enabled. 
+ +#### SSL error + +When you start using Netdata with SSL, some errors may be registered in the logs. This happens due to possible incompatibilities between the browser's SSL options (such as ciphers and the TLS/SSL version) and Netdata's internal configuration. The most common error is `error:00000006:lib(0):func(0):EVP lib`. In the near future Netdata will allow users to change the internal configuration to avoid errors like this; until then, we set the most common and safest options for the communication. + ### Access lists Netdata supports access lists in `netdata.conf`: @@ -96,10 +155,10 @@ setting | default | info :------:|:-------:|:---- ses max window | `15` | See [single exponential smoothing](../api/queries/des/) des max window | `15` | See [double exponential smoothing](../api/queries/des/) -listen backlog | `4096` | The port backlog. Check `man 2 listen`. -web files owner | `netdata` | The user that owns the web static files. Netdata will refuse to serve a file that is not owned by this user, even if it has read access to that file. If the user given is not found, netdata will only serve files owned by user given in `run as user`. +listen backlog | `4096` | The port backlog. Check `man 2 listen`. +web files owner | `netdata` | The user that owns the web static files. Netdata will refuse to serve a file that is not owned by this user, even if it has read access to that file. If the user given is not found, netdata will only serve files owned by user given in `run as user`. web files group | `netdata` | If this is set, Netdata will check if the file is owned by this group and refuse to serve the file if it's not. -disconnect idle clients after seconds | `60` | The time in seconds to disconnect web clients after being totally idle. +disconnect idle clients after seconds | `60` | The time in seconds to disconnect web clients after being totally idle. 
timeout for first request | `60` | How long to wait for a client to send a request before closing the socket. Prevents slow request attacks. accept a streaming request every seconds | `0` | Can be used to set a limit on how often a master Netdata server will accept streaming requests from the slaves in a [streaming and replication setup](../../streaming) respect do not track policy | `no` | If set to `yes`, will respect the client's browser preferences on storing cookies. diff --git a/web/server/static/static-threaded.c b/web/server/static/static-threaded.c index 1945b8a3..5dda2700 100644 --- a/web/server/static/static-threaded.c +++ b/web/server/static/static-threaded.c @@ -152,10 +152,67 @@ static void *web_server_add_callback(POLLINFO *pi, short int *events, void *data struct web_client *w = web_client_create_on_fd(pi->fd, pi->client_ip, pi->client_port, pi->port_acl); w->pollinfo_slot = pi->slot; - if(unlikely(pi->socktype == AF_UNIX)) + if ( !strncmp(pi->client_port,"UNIX",4)){ web_client_set_unix(w); - else + } else { web_client_set_tcp(w); + } + +#ifdef ENABLE_HTTPS + if ((!web_client_check_unix(w)) && ( netdata_srv_ctx )) { + if( sock_delnonblock(w->ifd) < 0 ){ + error("Web server cannot remove the non-blocking flag from socket %d",w->ifd); + } + + //Read the first 7 bytes from the message, but the message + //is not removed from the queue, because we are using MSG_PEEK + char test[8]; + if ( recv(w->ifd,test, 7,MSG_PEEK) == 7 ) { + test[7] = 0x00; + } + else { + //Case I do not have success to read 7 bytes, + //this means that the mensage was not completely read, so + //I cannot identify it yet. 
+ sock_setnonblock(w->ifd); + return w; + } + + //The next two ifs are not together because I am reusing SSL structure + if (!w->ssl.conn) + { + w->ssl.conn = SSL_new(netdata_srv_ctx); + if ( w->ssl.conn ) { + SSL_set_accept_state(w->ssl.conn); + } else { + error("Failed to create SSL context on socket fd %d.", w->ifd); + if (test[0] < 0x18){ + WEB_CLIENT_IS_DEAD(w); + sock_setnonblock(w->ifd); + return w; + } + } + } + + if (w->ssl.conn) { + if (SSL_set_fd(w->ssl.conn, w->ifd) != 1) { + error("Failed to set the socket to the SSL on socket fd %d.", w->ifd); + //The client is not set dead, because I received a normal HTTP request + //instead a Client Hello(HTTPS). + if ( test[0] < 0x18 ){ + WEB_CLIENT_IS_DEAD(w); + } + } + else{ + w->ssl.flags = security_process_accept(w->ssl.conn, (int)test[0]); + } + } + + sock_setnonblock(w->ifd); + } else{ + w->ssl.flags = NETDATA_SSL_NO_HANDSHAKE; + } +#endif debug(D_WEB_CLIENT, "%llu: ADDED CLIENT FD %d", w->id, pi->fd); return w; @@ -189,6 +246,8 @@ static int web_server_rcv_callback(POLLINFO *pi, short int *events) { struct web_client *w = (struct web_client *)pi->data; int fd = pi->fd; + //BRING IT TO HERE + if(unlikely(web_client_receive(w) < 0)) return -1; @@ -398,6 +457,9 @@ void *socket_listen_main_static_threaded(void *ptr) { if(!api_sockets.opened) fatal("LISTENER: no listen sockets available."); +#ifdef ENABLE_HTTPS + security_start_ssl(NETDATA_SSL_CONTEXT_SERVER); +#endif // 6 threads is the optimal value // since 6 are the parallel connections browsers will do // so, if the machine has more CPUs, avoid using resources unnecessarily @@ -412,7 +474,7 @@ void *socket_listen_main_static_threaded(void *ptr) { if(static_threaded_workers_count < 1) static_threaded_workers_count = 1; - size_t max_sockets = (size_t)config_get_number(CONFIG_SECTION_WEB, "web server max sockets", (long long int)(rlimit_nofile.rlim_cur / 2)); + size_t max_sockets = (size_t)config_get_number(CONFIG_SECTION_WEB, "web server max sockets", (long 
long int)(rlimit_nofile.rlim_cur / 4)); static_workers_private_data = callocz((size_t)static_threaded_workers_count, sizeof(struct web_server_static_threaded_worker)); diff --git a/web/server/web_client.c b/web/server/web_client.c index 3dc6ec82..bd275f5e 100644 --- a/web/server/web_client.c +++ b/web/server/web_client.c @@ -143,7 +143,9 @@ void web_client_request_done(struct web_client *w) { debug(D_WEB_CLIENT, "%llu: Closing filecopy input file descriptor %d.", w->id, w->ifd); if(web_server_mode != WEB_SERVER_MODE_STATIC_THREADED) { - if (w->ifd != -1) close(w->ifd); + if (w->ifd != -1){ + close(w->ifd); + } } w->ifd = w->ofd; @@ -688,6 +690,9 @@ const char *web_response_code_to_string(int code) { case 200: return "OK"; + case 301: + return "Moved Permanently"; + case 307: return "Temporary Redirect"; @@ -724,15 +729,21 @@ const char *web_response_code_to_string(int code) { } static inline char *http_header_parse(struct web_client *w, char *s, int parse_useragent) { - static uint32_t hash_origin = 0, hash_connection = 0, hash_accept_encoding = 0, hash_donottrack = 0, hash_useragent = 0, hash_authorization = 0; + static uint32_t hash_origin = 0, hash_connection = 0, hash_donottrack = 0, hash_useragent = 0, hash_authorization = 0, hash_host = 0; +#ifdef NETDATA_WITH_ZLIB + static uint32_t hash_accept_encoding = 0; +#endif if(unlikely(!hash_origin)) { hash_origin = simple_uhash("Origin"); hash_connection = simple_uhash("Connection"); +#ifdef NETDATA_WITH_ZLIB hash_accept_encoding = simple_uhash("Accept-Encoding"); +#endif hash_donottrack = simple_uhash("DNT"); hash_useragent = simple_uhash("User-Agent"); hash_authorization = simple_uhash("X-Auth-Token"); + hash_host = simple_uhash("Host"); } char *e = s; @@ -780,6 +791,9 @@ static inline char *http_header_parse(struct web_client *w, char *s, int parse_u } else if(hash == hash_authorization&& !strcasecmp(s, "X-Auth-Token")) { w->auth_bearer_token = strdupz(v); } + else if(hash == hash_host && !strcasecmp(s, "Host")){ 
+ strncpyz(w->host, v, (ve - v)); + } #ifdef NETDATA_WITH_ZLIB else if(hash == hash_accept_encoding && !strcasecmp(s, "Accept-Encoding")) { if(web_enable_gzip) { @@ -807,7 +821,12 @@ static inline char *http_header_parse(struct web_client *w, char *s, int parse_u typedef enum { HTTP_VALIDATION_OK, HTTP_VALIDATION_NOT_SUPPORTED, +#ifdef ENABLE_HTTPS + HTTP_VALIDATION_INCOMPLETE, + HTTP_VALIDATION_REDIRECT +#else HTTP_VALIDATION_INCOMPLETE +#endif } HTTP_VALIDATION; static inline HTTP_VALIDATION http_request_validate(struct web_client *w) { @@ -847,6 +866,35 @@ static inline HTTP_VALIDATION http_request_validate(struct web_client *w) { w->mode = WEB_CLIENT_MODE_OPTIONS; } else if(!strncmp(s, "STREAM ", 7)) { +#ifdef ENABLE_HTTPS + if ( (w->ssl.flags) && (netdata_use_ssl_on_stream & NETDATA_SSL_FORCE)){ + w->header_parse_tries = 0; + w->header_parse_last_size = 0; + web_client_disable_wait_receive(w); + char hostname[256]; + char *copyme = strstr(s,"hostname="); + if ( copyme ){ + copyme += 9; + char *end = strchr(copyme,'&'); + if(end){ + size_t length = end - copyme; + memcpy(hostname,copyme,length); + hostname[length] = 0X00; + } + else{ + memcpy(hostname,"not available",13); + hostname[13] = 0x00; + } + } + else{ + memcpy(hostname,"not available",13); + hostname[13] = 0x00; + } + error("The server is configured to always use encrypt connection, please enable the SSL on slave with hostname '%s'.",hostname); + return HTTP_VALIDATION_NOT_SUPPORTED; + } +#endif + encoded_url = s = &s[7]; w->mode = WEB_CLIENT_MODE_STREAM; } @@ -899,6 +947,16 @@ static inline HTTP_VALIDATION http_request_validate(struct web_client *w) { // copy the URL - we are going to overwrite parts of it // TODO -- ideally we we should avoid copying buffers around strncpyz(w->last_url, w->decoded_url, NETDATA_WEB_REQUEST_URL_SIZE); +#ifdef ENABLE_HTTPS + if ( (!web_client_check_unix(w)) && (netdata_srv_ctx) ) { + if ((w->ssl.conn) && ((w->ssl.flags & NETDATA_SSL_NO_HANDSHAKE) && 
(netdata_use_ssl_on_http & NETDATA_SSL_FORCE) && (w->mode != WEB_CLIENT_MODE_STREAM)) ) { + w->header_parse_tries = 0; + w->header_parse_last_size = 0; + web_client_disable_wait_receive(w); + return HTTP_VALIDATION_REDIRECT; + } + } +#endif w->header_parse_tries = 0; w->header_parse_last_size = 0; @@ -918,6 +976,26 @@ static inline HTTP_VALIDATION http_request_validate(struct web_client *w) { return HTTP_VALIDATION_INCOMPLETE; } +static inline ssize_t web_client_send_data(struct web_client *w,const void *buf,size_t len, int flags) +{ + ssize_t bytes; +#ifdef ENABLE_HTTPS + if ( (!web_client_check_unix(w)) && (netdata_srv_ctx) ) { + if ( ( w->ssl.conn ) && ( !w->ssl.flags ) ){ + bytes = SSL_write(w->ssl.conn,buf, len) ; + } else { + bytes = send(w->ofd,buf, len , flags); + } + } else { + bytes = send(w->ofd,buf, len , flags); + } +#else + bytes = send(w->ofd, buf, len, flags); +#endif + + return bytes; +} + static inline void web_client_send_http_header(struct web_client *w) { if(unlikely(w->response.code != 200)) buffer_no_cacheable(w->response.data); @@ -948,6 +1026,23 @@ static inline void web_client_send_http_header(struct web_client *w) { strftime(edate, sizeof(edate), "%a, %d %b %Y %H:%M:%S %Z", tm); } + char headerbegin[8328]; + if (w->response.code == 301) { + memcpy(headerbegin,"\r\nLocation: https://",20); + size_t headerlength = strlen(w->host); + memcpy(&headerbegin[20],w->host,headerlength); + headerlength += 20; + size_t tmp = strlen(w->last_url); + memcpy(&headerbegin[headerlength],w->last_url,tmp); + headerlength += tmp; + memcpy(&headerbegin[headerlength],"\r\n",2); + headerlength += 2; + headerbegin[headerlength] = 0x00; + }else { + memcpy(headerbegin,"\r\n",2); + headerbegin[2]=0x00; + } + buffer_sprintf(w->response.header_output, "HTTP/1.1 %d %s\r\n" "Connection: %s\r\n" @@ -955,13 +1050,14 @@ static inline void web_client_send_http_header(struct web_client *w) { "Access-Control-Allow-Origin: %s\r\n" "Access-Control-Allow-Credentials: true\r\n" 
"Content-Type: %s\r\n" - "Date: %s\r\n" + "Date: %s%s" , w->response.code, code_msg , web_client_has_keepalive(w)?"keep-alive":"close" , VERSION , w->origin , content_type_string , date + , headerbegin ); if(unlikely(web_x_frame_options)) @@ -1046,6 +1142,37 @@ static inline void web_client_send_http_header(struct web_client *w) { size_t count = 0; ssize_t bytes; +#ifdef ENABLE_HTTPS + if ( (!web_client_check_unix(w)) && (netdata_srv_ctx) ) { + if ( ( w->ssl.conn ) && ( !w->ssl.flags ) ){ + while((bytes = SSL_write(w->ssl.conn, buffer_tostring(w->response.header_output), buffer_strlen(w->response.header_output))) < 0) { + count++; + if(count > 100 || (errno != EAGAIN && errno != EWOULDBLOCK)) { + error("Cannot send HTTP headers to web client."); + break; + } + } + } else { + while((bytes = send(w->ofd, buffer_tostring(w->response.header_output), buffer_strlen(w->response.header_output), 0)) == -1) { + count++; + + if(count > 100 || (errno != EAGAIN && errno != EWOULDBLOCK)) { + error("Cannot send HTTP headers to web client."); + break; + } + } + } + } else { + while((bytes = send(w->ofd, buffer_tostring(w->response.header_output), buffer_strlen(w->response.header_output), 0)) == -1) { + count++; + + if(count > 100 || (errno != EAGAIN && errno != EWOULDBLOCK)) { + error("Cannot send HTTP headers to web client."); + break; + } + } + } +#else while((bytes = send(w->ofd, buffer_tostring(w->response.header_output), buffer_strlen(w->response.header_output), 0)) == -1) { count++; @@ -1054,6 +1181,7 @@ static inline void web_client_send_http_header(struct web_client *w) { break; } } +#endif if(bytes != (ssize_t) buffer_strlen(w->response.header_output)) { if(bytes > 0) @@ -1303,7 +1431,16 @@ void web_client_process_request(struct web_client *w) { return; } break; - +#ifdef ENABLE_HTTPS + case HTTP_VALIDATION_REDIRECT: + { + buffer_flush(w->response.data); + w->response.data->contenttype = CT_TEXT_HTML; + buffer_strcat(w->response.data, "<!DOCTYPE html><!-- 
SPDX-License-Identifier: GPL-3.0-or-later --><html><body onload=\"window.location.href ='https://'+ window.location.hostname + ':' + window.location.port + window.location.pathname\">Redirecting to safety connection, case your browser does not support redirection, please click <a onclick=\"window.location.href ='https://'+ window.location.hostname + ':' + window.location.port + window.location.pathname\">here</a>.</body></html>"); + w->response.code = 301; + break; + } +#endif case HTTP_VALIDATION_NOT_SUPPORTED: debug(D_WEB_CLIENT_ACCESS, "%llu: Cannot understand '%s'.", w->id, w->response.data->buffer); @@ -1373,9 +1510,11 @@ ssize_t web_client_send_chunk_header(struct web_client *w, size_t len) { debug(D_DEFLATE, "%llu: OPEN CHUNK of %zu bytes (hex: %zx).", w->id, len, len); char buf[24]; - sprintf(buf, "%zX\r\n", len); - - ssize_t bytes = send(w->ofd, buf, strlen(buf), 0); + ssize_t bytes; + bytes = (ssize_t)sprintf(buf, "%zX\r\n", len); + buf[bytes] = 0x00; + + bytes = web_client_send_data(w,buf,strlen(buf),0); if(bytes > 0) { debug(D_DEFLATE, "%llu: Sent chunk header %zd bytes.", w->id, bytes); w->stats_sent_bytes += bytes; @@ -1397,7 +1536,8 @@ ssize_t web_client_send_chunk_close(struct web_client *w) { //debug(D_DEFLATE, "%llu: CLOSE CHUNK.", w->id); - ssize_t bytes = send(w->ofd, "\r\n", 2, 0); + ssize_t bytes; + bytes = web_client_send_data(w,"\r\n",2,0); if(bytes > 0) { debug(D_DEFLATE, "%llu: Sent chunk suffix %zd bytes.", w->id, bytes); w->stats_sent_bytes += bytes; @@ -1419,7 +1559,8 @@ ssize_t web_client_send_chunk_finalize(struct web_client *w) { //debug(D_DEFLATE, "%llu: FINALIZE CHUNK.", w->id); - ssize_t bytes = send(w->ofd, "\r\n0\r\n\r\n", 7, 0); + ssize_t bytes; + bytes = web_client_send_data(w,"\r\n0\r\n\r\n",7,0); if(bytes > 0) { debug(D_DEFLATE, "%llu: Sent chunk suffix %zd bytes.", w->id, bytes); w->stats_sent_bytes += bytes; @@ -1533,7 +1674,7 @@ ssize_t web_client_send_deflate(struct web_client *w) debug(D_WEB_CLIENT, "%llu: Sending %zu 
bytes of data (+%zd of chunk header).", w->id, w->response.zhave - w->response.zsent, t); - len = send(w->ofd, &w->response.zbuffer[w->response.zsent], (size_t) (w->response.zhave - w->response.zsent), MSG_DONTWAIT); + len = web_client_send_data(w,&w->response.zbuffer[w->response.zsent], (size_t) (w->response.zhave - w->response.zsent), MSG_DONTWAIT); if(len > 0) { w->stats_sent_bytes += len; w->response.zsent += len; @@ -1589,7 +1730,7 @@ ssize_t web_client_send(struct web_client *w) { return 0; } - bytes = send(w->ofd, &w->response.data->buffer[w->response.sent], w->response.data->len - w->response.sent, MSG_DONTWAIT); + bytes = web_client_send_data(w,&w->response.data->buffer[w->response.sent], w->response.data->len - w->response.sent, MSG_DONTWAIT); if(likely(bytes > 0)) { w->stats_sent_bytes += bytes; w->response.sent += bytes; @@ -1664,11 +1805,26 @@ ssize_t web_client_receive(struct web_client *w) if(unlikely(w->mode == WEB_CLIENT_MODE_FILECOPY)) return web_client_read_file(w); + ssize_t bytes; + ssize_t left = w->response.data->size - w->response.data->len; + // do we have any space for more data? 
buffer_need_bytes(w->response.data, NETDATA_WEB_REQUEST_RECEIVE_SIZE); - ssize_t left = w->response.data->size - w->response.data->len; - ssize_t bytes = recv(w->ifd, &w->response.data->buffer[w->response.data->len], (size_t) (left - 1), MSG_DONTWAIT); +#ifdef ENABLE_HTTPS + if ( (!web_client_check_unix(w)) && (netdata_srv_ctx) ) { + if ( ( w->ssl.conn ) && (!w->ssl.flags)) { + bytes = SSL_read(w->ssl.conn, &w->response.data->buffer[w->response.data->len], (size_t) (left - 1)); + }else { + bytes = recv(w->ifd, &w->response.data->buffer[w->response.data->len], (size_t) (left - 1), MSG_DONTWAIT); + } + } + else{ + bytes = recv(w->ifd, &w->response.data->buffer[w->response.data->len], (size_t) (left - 1), MSG_DONTWAIT); + } +#else + bytes = recv(w->ifd, &w->response.data->buffer[w->response.data->len], (size_t) (left - 1), MSG_DONTWAIT); +#endif if(likely(bytes > 0)) { w->stats_received_bytes += bytes; diff --git a/web/server/web_client.h b/web/server/web_client.h index 4263e252..0a57e8d8 100644 --- a/web/server/web_client.h +++ b/web/server/web_client.h @@ -129,6 +129,7 @@ struct web_client { char decoded_url[NETDATA_WEB_REQUEST_URL_SIZE + 1]; // we decode the URL in this buffer char last_url[NETDATA_WEB_REQUEST_URL_SIZE+1]; // we keep a copy of the decoded URL here + char host[256]; struct timeval tv_in, tv_ready; @@ -153,6 +154,9 @@ struct web_client { // STATIC-THREADED WEB SERVER MEMBERS size_t pollinfo_slot; // POLLINFO slot of the web client size_t pollinfo_filecopy_slot; // POLLINFO slot of the file read +#ifdef ENABLE_HTTPS + struct netdata_ssl ssl; +#endif }; extern uid_t web_files_uid(void); diff --git a/web/server/web_client_cache.c b/web/server/web_client_cache.c index ab470560..763e7e96 100644 --- a/web/server/web_client_cache.c +++ b/web/server/web_client_cache.c @@ -6,6 +6,18 @@ // ---------------------------------------------------------------------------- // allocate and free web_clients +#ifdef ENABLE_HTTPS + +static void web_client_reuse_ssl(struct 
web_client *w) { + if (netdata_srv_ctx) { + if (w->ssl.conn) { + SSL_clear(w->ssl.conn); + } + } +} +#endif + + static void web_client_zero(struct web_client *w) { // zero everything about it - but keep the buffers @@ -35,6 +47,14 @@ static void web_client_free(struct web_client *w) { buffer_free(w->response.header); buffer_free(w->response.data); freez(w->user_agent); +#ifdef ENABLE_HTTPS + if ((!web_client_check_unix(w)) && ( netdata_srv_ctx )) { + if (w->ssl.conn) { + SSL_free(w->ssl.conn); + w->ssl.conn = NULL; + } + } +#endif freez(w); } @@ -159,12 +179,25 @@ struct web_client *web_client_get_from_cache_or_allocate() { if(w->prev) w->prev->next = w->next; if(w->next) w->next->prev = w->prev; web_clients_cache.avail_count--; +#ifdef ENABLE_HTTPS + web_client_reuse_ssl(w); + SSL *ssl = w->ssl.conn; +#endif web_client_zero(w); web_clients_cache.reused++; +#ifdef ENABLE_HTTPS + w->ssl.conn = ssl; + w->ssl.flags = NETDATA_SSL_START; + debug(D_WEB_CLIENT_ACCESS,"Reusing SSL structure with (w->ssl = NULL, w->accepted = %d)",w->ssl.flags); +#endif } else { // allocate it w = web_client_alloc(); +#ifdef ENABLE_HTTPS + w->ssl.flags = NETDATA_SSL_START; + debug(D_WEB_CLIENT_ACCESS,"Starting SSL structure with (w->ssl = NULL, w->accepted = %d)",w->ssl.flags); +#endif web_clients_cache.allocated++; } @@ -205,6 +238,11 @@ void web_client_release(struct web_client *w) { if (w->ifd != -1) close(w->ifd); if (w->ofd != -1 && w->ofd != w->ifd) close(w->ofd); w->ifd = w->ofd = -1; +#ifdef ENABLE_HTTPS + web_client_reuse_ssl(w); + w->ssl.flags = NETDATA_SSL_START; +#endif + } // unlink it from the used diff --git a/web/server/web_server.c b/web/server/web_server.c index 11f7edf8..9e51c81f 100644 --- a/web/server/web_server.c +++ b/web/server/web_server.c @@ -138,5 +138,3 @@ void web_client_initialize_connection(struct web_client *w) { web_client_cache_verify(0); } - - |