# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import atexit
import collections
import itertools
import json
import math
import os
import re
from collections import OrderedDict
from ctypes import c_int

from . import shared_telemetry_utils as utils
from .shared_telemetry_utils import ParserError

atexit.register(ParserError.exit_func)

# Constants.
MAX_LABEL_LENGTH = 20
MAX_LABEL_COUNT = 100
MAX_KEY_COUNT = 30
MAX_KEY_LENGTH = 20
MIN_CATEGORICAL_BUCKET_COUNT = 50
CPP_IDENTIFIER_PATTERN = "^[a-z][a-z0-9_]+[a-z0-9]$"

ALWAYS_ALLOWED_KEYS = [
    "kind",
    "description",
    "operating_systems",
    "expires_in_version",
    "alert_emails",
    "keyed",
    "releaseChannelCollection",
    "bug_numbers",
    "keys",
    "record_in_processes",
    "record_into_store",
    "products",
]

BASE_DOC_URL = (
    "https://firefox-source-docs.mozilla.org/toolkit/components/"
    "telemetry/telemetry/"
)
HISTOGRAMS_DOC_URL = BASE_DOC_URL + "collection/histograms.html"
SCALARS_DOC_URL = BASE_DOC_URL + "collection/scalars.html"

GECKOVIEW_STREAMING_SUPPORTED_KINDS = [
    "linear",
    "exponential",
    "categorical",
]


def linear_buckets(dmin, dmax, n_buckets):
    """Return a list of n_buckets bucket lower bounds, with an underflow
    bucket at 0 and the rest spaced linearly from dmin to dmax."""
    ret_array = [0] * n_buckets
    dmin = float(dmin)
    dmax = float(dmax)
    for i in range(1, n_buckets):
        linear_range = (dmin * (n_buckets - 1 - i) + dmax * (i - 1)) / (n_buckets - 2)
        ret_array[i] = int(linear_range + 0.5)
    return ret_array


def exponential_buckets(dmin, dmax, n_buckets):
    """Return a list of n_buckets bucket lower bounds, with an underflow
    bucket at 0 and the rest spaced exponentially from dmin to dmax."""
    log_max = math.log(dmax)
    ret_array = [0] * n_buckets
    current = dmin
    ret_array[1] = current
    for bucket_index in range(2, n_buckets):
        log_current = math.log(current)
        log_ratio = (log_max - log_current) / (n_buckets - bucket_index)
        log_next = log_current + log_ratio
        next_value = int(math.floor(math.exp(log_next) + 0.5))
        if next_value > current:
            current = next_value
        else:
            current = current + 1
        ret_array[bucket_index] = current
    return ret_array


allowlists = None


def load_allowlist():
    global allowlists
    try:
        parsers_path = os.path.realpath(os.path.dirname(__file__))
        # The parsers live in build_scripts/parsers in the Telemetry module, while
        # the histogram-allowlists file lives in the root of the module. Account
        # for that when looking for the allowlist.
        # NOTE: if the parsers are moved, this logic will need to be updated.
        telemetry_module_path = os.path.abspath(
            os.path.join(parsers_path, os.pardir, os.pardir)
        )
        allowlist_path = os.path.join(
            telemetry_module_path, "histogram-allowlists.json"
        )
        with open(allowlist_path, "r") as f:
            try:
                allowlists = json.load(f)
                for name, allowlist in allowlists.items():
                    allowlists[name] = set(allowlist)
            except ValueError:
                ParserError("Error parsing allowlist: %s" % allowlist_path).handle_now()
    except IOError:
        allowlists = None
        ParserError("Unable to open allowlist: %s." % allowlist_path).handle_now()


class Histogram:
    """A class for representing a histogram definition."""

    def __init__(self, name, definition, strict_type_checks=False):
        """Initialize a histogram named name with the given definition.

        definition is a dict-like object that must contain at least the keys:

         - 'kind': The kind of histogram. Must be one of 'boolean', 'flag',
           'count', 'enumerated', 'categorical', 'linear', or 'exponential'.
         - 'description': A textual description of the histogram.

        strict_type_checks is a boolean indicating whether to use the new,
        stricter type checks. The server side still has to deal with old,
        oddly typed submissions, so it skips them by default.
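
        For illustration only, a hypothetical definition (not taken from
        Histograms.json) might look like:

            {
                "kind": "exponential",
                "description": "Time (ms) spent doing something.",
                "expires_in_version": "never",
                "low": 1,
                "high": 10000,
                "n_buckets": 10,
            }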
""" self._strict_type_checks = strict_type_checks self.verify_attributes(name, definition) self._name = name self._description = definition["description"] self._kind = definition["kind"] self._keys = definition.get("keys", []) self._keyed = definition.get("keyed", False) self._expiration = definition.get("expires_in_version") self._labels = definition.get("labels", []) self._record_in_processes = definition.get("record_in_processes") self._record_into_store = definition.get("record_into_store", ["main"]) self._products = definition.get("products") self._operating_systems = definition.get("operating_systems", ["all"]) self.compute_bucket_parameters(definition) self.set_nsITelemetry_kind() self.set_dataset(definition) def name(self): """Return the name of the histogram.""" return self._name def description(self): """Return the description of the histogram.""" return self._description def kind(self): """Return the kind of the histogram. Will be one of 'boolean', 'flag', 'count', 'enumerated', 'categorical', 'linear', or 'exponential'.""" return self._kind def expiration(self): """Return the expiration version of the histogram.""" return self._expiration def nsITelemetry_kind(self): """Return the nsITelemetry constant corresponding to the kind of the histogram.""" return self._nsITelemetry_kind def low(self): """Return the lower bound of the histogram.""" return self._low def high(self): """Return the high bound of the histogram.""" return self._high def n_buckets(self): """Return the number of buckets in the histogram.""" return self._n_buckets def keyed(self): """Returns True if this a keyed histogram, false otherwise.""" return self._keyed def keys(self): """Returns a list of allowed keys for keyed histogram, [] for others.""" return self._keys def dataset(self): """Returns the dataset this histogram belongs into.""" return self._dataset def labels(self): """Returns a list of labels for a categorical histogram, [] for others.""" return self._labels def record_in_processes(self): """Returns a list of processes this histogram is permitted to record in.""" return self._record_in_processes def record_in_processes_enum(self): """Get the non-empty list of flags representing the processes to record data in""" return [utils.process_name_to_enum(p) for p in self.record_in_processes()] def products(self): """Get the non-empty list of products to record data on""" return self._products def products_enum(self): """Get the non-empty list of flags representing products to record data on""" return [utils.product_name_to_enum(p) for p in self.products()] def operating_systems(self): """Get the list of operating systems to record data on""" return self._operating_systems def record_on_os(self, target_os): """Check if this probe should be recorded on the passed os.""" os = self.operating_systems() if "all" in os: return True canonical_os = utils.canonical_os(target_os) if "unix" in os and canonical_os in utils.UNIX_LIKE_OS: return True return canonical_os in os def record_into_store(self): """Get the non-empty list of stores to record into""" return self._record_into_store def ranges(self): """Return an array of lower bounds for each bucket in the histogram.""" bucket_fns = { "boolean": linear_buckets, "flag": linear_buckets, "count": linear_buckets, "enumerated": linear_buckets, "categorical": linear_buckets, "linear": linear_buckets, "exponential": exponential_buckets, } if self._kind not in bucket_fns: ParserError( 'Unknown kind "%s" for histogram "%s".' 
                % (self._kind, self._name)
            ).handle_later()

        fn = bucket_fns[self._kind]
        return fn(self.low(), self.high(), self.n_buckets())

    def compute_bucket_parameters(self, definition):
        bucket_fns = {
            "boolean": Histogram.boolean_flag_bucket_parameters,
            "flag": Histogram.boolean_flag_bucket_parameters,
            "count": Histogram.boolean_flag_bucket_parameters,
            "enumerated": Histogram.enumerated_bucket_parameters,
            "categorical": Histogram.categorical_bucket_parameters,
            "linear": Histogram.linear_bucket_parameters,
            "exponential": Histogram.exponential_bucket_parameters,
        }

        if self._kind not in bucket_fns:
            ParserError(
                'Unknown kind "%s" for histogram "%s".' % (self._kind, self._name)
            ).handle_later()

        fn = bucket_fns[self._kind]
        self.set_bucket_parameters(*fn(definition))

    def verify_attributes(self, name, definition):
        general_keys = ALWAYS_ALLOWED_KEYS + ["low", "high", "n_buckets"]

        table = {
            "boolean": ALWAYS_ALLOWED_KEYS,
            "flag": ALWAYS_ALLOWED_KEYS,
            "count": ALWAYS_ALLOWED_KEYS,
            "enumerated": ALWAYS_ALLOWED_KEYS + ["n_values"],
            "categorical": ALWAYS_ALLOWED_KEYS + ["labels", "n_values"],
            "linear": general_keys,
            "exponential": general_keys,
        }
        # We removed extended_statistics_ok on the client, but the server-side,
        # where _strict_type_checks==False, has to deal with historical data.
        if not self._strict_type_checks:
            table["exponential"].append("extended_statistics_ok")

        kind = definition["kind"]
        if kind not in table:
            ParserError(
                'Unknown kind "%s" for histogram "%s".' % (kind, name)
            ).handle_later()
        allowed_keys = table[kind]

        self.check_name(name)
        self.check_keys(name, definition, allowed_keys)
        self.check_keys_field(name, definition)
        self.check_field_types(name, definition)
        self.check_allowlisted_kind(name, definition)
        self.check_allowlistable_fields(name, definition)
        self.check_expiration(name, definition)
        self.check_label_values(name, definition)
        self.check_record_in_processes(name, definition)
        self.check_products(name, definition)
        self.check_operating_systems(name, definition)
        self.check_record_into_store(name, definition)

    def check_name(self, name):
        if "#" in name:
            ParserError(
                'Error for histogram name "%s": "#" is not allowed.' % (name)
            ).handle_later()

        # Avoid C++ identifier conflicts between histogram enums and label enum names.
        if name.startswith("LABELS_"):
            ParserError(
                'Error for histogram name "%s": can not start with "LABELS_".' % (name)
            ).handle_later()

        # To make it easier to generate C++ identifiers from this etc., we restrict
        # the histogram names to a strict pattern.
        # We skip this on the server to avoid failures with old Histogram.json revisions.
        if self._strict_type_checks:
            if not re.match(CPP_IDENTIFIER_PATTERN, name, re.IGNORECASE):
                ParserError(
                    'Error for histogram name "%s": name does not conform to "%s"'
                    % (name, CPP_IDENTIFIER_PATTERN)
                ).handle_later()

    def check_expiration(self, name, definition):
        field = "expires_in_version"
        expiration = definition.get(field)

        if not expiration:
            return

        # We forbid new probes from using the "expires_in_version": "default"
        # field/value pair. Old ones that use this are added to the allowlist.
        if (
            expiration == "default"
            and allowlists is not None
            and name not in allowlists["expiry_default"]
        ):
            ParserError(
                'New histogram "%s" cannot have "default" %s value.' % (name, field)
            ).handle_later()

        # Historical editions of Histograms.json can have the deprecated
        # expiration format 'N.Na1'. Fortunately, those scripts set
        # self._strict_type_checks to False.
        if (
            expiration != "default"
            and not utils.validate_expiration_version(expiration)
            and self._strict_type_checks
        ):
            ParserError(
                (
                    "Error for histogram {} - invalid {}: {}."
                    "\nSee: {}#expires-in-version"
                ).format(name, field, expiration, HISTOGRAMS_DOC_URL)
            ).handle_later()

        expiration = utils.add_expiration_postfix(expiration)
        definition[field] = expiration

    def check_label_values(self, name, definition):
        labels = definition.get("labels")
        if not labels:
            return

        invalid = list(filter(lambda l: len(l) > MAX_LABEL_LENGTH, labels))
        if len(invalid) > 0:
            ParserError(
                'Label values for "%s" exceed length limit of %d: %s'
                % (name, MAX_LABEL_LENGTH, ", ".join(invalid))
            ).handle_later()

        if len(labels) > MAX_LABEL_COUNT:
            ParserError(
                'Label count for "%s" exceeds limit of %d' % (name, MAX_LABEL_COUNT)
            ).handle_now()

        # To make it easier to generate C++ identifiers from this etc., we restrict
        # the label values to a strict pattern.
        invalid = list(
            filter(
                lambda l: not re.match(CPP_IDENTIFIER_PATTERN, l, re.IGNORECASE), labels
            )
        )
        if len(invalid) > 0:
            ParserError(
                'Label values for %s do not match pattern "%s": %s'
                % (name, CPP_IDENTIFIER_PATTERN, ", ".join(invalid))
            ).handle_later()

    def check_record_in_processes(self, name, definition):
        if not self._strict_type_checks:
            return

        field = "record_in_processes"
        rip = definition.get(field)

        DOC_URL = HISTOGRAMS_DOC_URL + "#record-in-processes"

        if not rip:
            ParserError(
                'Histogram "%s" must have a "%s" field:\n%s' % (name, field, DOC_URL)
            ).handle_later()

        for process in rip:
            if not utils.is_valid_process_name(process):
                ParserError(
                    'Histogram "%s" has unknown process "%s" in %s.\n%s'
                    % (name, process, field, DOC_URL)
                ).handle_later()

    def check_products(self, name, definition):
        if not self._strict_type_checks:
            return

        field = "products"
        products = definition.get(field)

        DOC_URL = HISTOGRAMS_DOC_URL + "#products"

        if not products:
            ParserError(
                'Histogram "%s" must have a "%s" field:\n%s' % (name, field, DOC_URL)
            ).handle_now()

        for product in products:
            if not utils.is_valid_product(product):
                ParserError(
                    'Histogram "%s" has unknown product "%s" in %s.\n%s'
                    % (name, product, field, DOC_URL)
                ).handle_later()
            if utils.is_geckoview_streaming_product(product):
                kind = definition.get("kind")
                if kind not in GECKOVIEW_STREAMING_SUPPORTED_KINDS:
                    ParserError(
                        (
                            'Histogram "%s" is of kind "%s" which is unsupported for '
                            'product "%s".'
                        )
                        % (name, kind, product)
                    ).handle_later()
                keyed = definition.get("keyed")
                if keyed:
                    ParserError(
                        'Keyed histograms like "%s" are unsupported for product "%s"'
                        % (name, product)
                    ).handle_later()

    def check_operating_systems(self, name, definition):
        if not self._strict_type_checks:
            return

        field = "operating_systems"
        operating_systems = definition.get(field)

        DOC_URL = HISTOGRAMS_DOC_URL + "#operating-systems"

        if not operating_systems:
            # operating_systems is optional
            return

        for operating_system in operating_systems:
            if not utils.is_valid_os(operating_system):
                ParserError(
                    'Histogram "%s" has unknown operating system "%s" in %s.\n%s'
                    % (name, operating_system, field, DOC_URL)
                ).handle_later()

    def check_record_into_store(self, name, definition):
        if not self._strict_type_checks:
            return

        field = "record_into_store"
        DOC_URL = HISTOGRAMS_DOC_URL + "#record-into-store"

        if field not in definition:
            # record_into_store is optional
            return

        record_into_store = definition.get(field)
        # record_into_store should not be empty
        if not record_into_store:
            ParserError(
                'Histogram "%s" has empty list of stores, which is not allowed.\n%s'
                % (name, DOC_URL)
            ).handle_later()

    def check_keys_field(self, name, definition):
        keys = definition.get("keys")
        if not self._strict_type_checks or keys is None:
            return

        if not definition.get("keyed", False):
            raise ValueError(
                "'keys' field is not valid for %s; only allowed for keyed histograms."
                % (name)
            )

        if len(keys) == 0:
            raise ValueError("The key list for %s cannot be empty" % (name))

        if len(keys) > MAX_KEY_COUNT:
            raise ValueError(
                "Key count for %s exceeds limit of %d" % (name, MAX_KEY_COUNT)
            )

        invalid = list(filter(lambda k: len(k) > MAX_KEY_LENGTH, keys))
        if len(invalid) > 0:
            raise ValueError(
                '"keys" values for %s exceed length limit of %d: %s'
                % (name, MAX_KEY_LENGTH, ", ".join(invalid))
            )

    def check_allowlisted_kind(self, name, definition):
        # We don't need to run any of these checks on the server.
        if not self._strict_type_checks or allowlists is None:
            return

        # Disallow "flag" and "count" histograms on desktop, suggest to use
        # scalars instead. Allow using these histograms on Android, as we
        # don't support scalars there yet.
        hist_kind = definition.get("kind")
        android_target = "android" in definition.get("operating_systems", [])

        if (
            not android_target
            and hist_kind in ["flag", "count"]
            and name not in allowlists["kind"]
        ):
            ParserError(
                (
                    'Unsupported kind "%s" for histogram "%s":\n'
                    'New "%s" histograms are not supported on Desktop, you should'
                    " use scalars instead:\n"
                    "%s\n"
                    "Are you trying to add a histogram on Android?"
                    ' Add "operating_systems": ["android"] to your histogram definition.'
                )
                % (hist_kind, name, hist_kind, SCALARS_DOC_URL)
            ).handle_now()

    # Check for the presence of fields that old histograms are allowlisted for.
    def check_allowlistable_fields(self, name, definition):
        # We don't need to run any of these checks on the server.
        if not self._strict_type_checks:
            return

        # In the pipeline we don't have allowlists available.
        if allowlists is None:
            return

        for field in ["alert_emails", "bug_numbers"]:
            if field not in definition and name not in allowlists[field]:
                ParserError(
                    'New histogram "%s" must have a "%s" field.' % (name, field)
                ).handle_later()
            if field in definition and name in allowlists[field]:
                msg = (
                    'Histogram "%s" should be removed from the allowlist for "%s" in '
                    "histogram-allowlists.json."
                )
                ParserError(msg % (name, field)).handle_later()

    def check_field_types(self, name, definition):
        # Define expected types for the histogram properties.
        type_checked_fields = {
            "n_buckets": int,
            "n_values": int,
            "low": int,
            "high": int,
            "keyed": bool,
            "expires_in_version": str,
            "kind": str,
            "description": str,
            "releaseChannelCollection": str,
        }

        # For list fields we check the items types.
        type_checked_list_fields = {
            "bug_numbers": int,
            "alert_emails": str,
            "labels": str,
            "record_in_processes": str,
            "keys": str,
            "products": str,
            "operating_systems": str,
            "record_into_store": str,
        }

        # For the server-side, where _strict_type_checks==False, we want to
        # skip the stricter type checks for these fields for dealing with
        # historical data.
        coerce_fields = ["low", "high", "n_values", "n_buckets"]
        if not self._strict_type_checks:
            # This handles some old non-numeric expressions.
            EXPRESSIONS = {
                "JS::GCReason::NUM_TELEMETRY_REASONS": 101,
                "mozilla::StartupTimeline::MAX_EVENT_ID": 12,
            }

            def try_to_coerce_to_number(v):
                if v in EXPRESSIONS:
                    return EXPRESSIONS[v]
                try:
                    return eval(v, {})
                except Exception:
                    return v

            for key in [k for k in coerce_fields if k in definition]:
                definition[key] = try_to_coerce_to_number(definition[key])
            # This handles old "keyed":"true" definitions (bug 1271986).
            if definition.get("keyed", None) == "true":
                definition["keyed"] = True

        def nice_type_name(t):
            if t is str:
                return "string"
            return t.__name__

        for key, key_type in type_checked_fields.items():
            if key not in definition:
                continue
            if not isinstance(definition[key], key_type):
                ParserError(
                    'Value for key "{0}" in histogram "{1}" should be {2}.'.format(
                        key, name, nice_type_name(key_type)
                    )
                ).handle_later()

        # Make sure the max range is lower than or equal to INT_MAX.
        if "high" in definition and not c_int(definition["high"]).value > 0:
            ParserError(
                'Value for high in histogram "{0}" should be lower or equal to INT_MAX.'.format(
                    name
                )
            ).handle_later()

        for key, key_type in type_checked_list_fields.items():
            if key not in definition:
                continue
            if not all(isinstance(x, key_type) for x in definition[key]):
                ParserError(
                    'All values for list "{0}" in histogram "{1}" should be of type'
                    " {2}.".format(key, name, nice_type_name(key_type))
                ).handle_later()

    def check_keys(self, name, definition, allowed_keys):
        if not self._strict_type_checks:
            return
        for key in iter(definition.keys()):
            if key not in allowed_keys:
                ParserError(
                    'Key "%s" is not allowed for histogram "%s".' % (key, name)
                ).handle_later()

    def set_bucket_parameters(self, low, high, n_buckets):
        self._low = low
        self._high = high
        self._n_buckets = n_buckets
        max_n_buckets = 101 if self._kind in ["enumerated", "categorical"] else 100
        if (
            allowlists is not None
            and self._n_buckets > max_n_buckets
            and type(self._n_buckets) is int
        ):
            if self._name not in allowlists["n_buckets"]:
                ParserError(
                    'New histogram "%s" is not permitted to have more than 100 buckets.\n'
                    "Histograms with large numbers of buckets use disproportionately high"
                    " amounts of resources. Contact a Telemetry peer (e.g. in #telemetry)"
                    " if you think an exception ought to be made:\n"
                    "https://wiki.mozilla.org/Modules/Toolkit#Telemetry" % self._name
                ).handle_later()

    @staticmethod
    def boolean_flag_bucket_parameters(definition):
        return (1, 2, 3)

    @staticmethod
    def linear_bucket_parameters(definition):
        return (definition.get("low", 1), definition["high"], definition["n_buckets"])

    @staticmethod
    def enumerated_bucket_parameters(definition):
        n_values = definition["n_values"]
        return (1, n_values, n_values + 1)

    @staticmethod
    def categorical_bucket_parameters(definition):
        # Categorical histograms default to 50 buckets to make working with them easier.
        # Otherwise when adding labels later we run into problems with the pipeline not
        # supporting bucket changes.
        # This can be overridden using the n_values field.
        n_values = max(
            len(definition["labels"]),
            definition.get("n_values", 0),
            MIN_CATEGORICAL_BUCKET_COUNT,
        )
        return (1, n_values, n_values + 1)

    @staticmethod
    def exponential_bucket_parameters(definition):
        return (definition.get("low", 1), definition["high"], definition["n_buckets"])

    def set_nsITelemetry_kind(self):
        # Pick a Telemetry implementation type.
        types = {
            "boolean": "BOOLEAN",
            "flag": "FLAG",
            "count": "COUNT",
            "enumerated": "LINEAR",
            "categorical": "CATEGORICAL",
            "linear": "LINEAR",
            "exponential": "EXPONENTIAL",
        }

        if self._kind not in types:
            ParserError(
                'Unknown kind "%s" for histogram "%s".' % (self._kind, self._name)
            ).handle_later()

        self._nsITelemetry_kind = "nsITelemetry::HISTOGRAM_%s" % types[self._kind]

    def set_dataset(self, definition):
        datasets = {
            "opt-in": "DATASET_PRERELEASE_CHANNELS",
            "opt-out": "DATASET_ALL_CHANNELS",
        }

        value = definition.get("releaseChannelCollection", "opt-in")
        if value not in datasets:
            ParserError(
                "Unknown value for releaseChannelCollection"
                ' policy for histogram "%s".' % self._name
            ).handle_later()

        self._dataset = "nsITelemetry::" + datasets[value]


# This hook function loads the histograms into an OrderedDict.
# It will raise a ParserError if duplicate keys are found.
def load_histograms_into_dict(ordered_pairs, strict_type_checks):
    d = collections.OrderedDict()
    for key, value in ordered_pairs:
        if strict_type_checks and key in d:
            ParserError(
                "Found duplicate key in Histograms file: %s" % key
            ).handle_later()
        d[key] = value
    return d


# We support generating histograms from multiple different input files, not
# just Histograms.json. For each file's basename, we have a specific
# routine to parse that file, and return a dictionary mapping histogram
# names to histogram parameters.
def from_json(filename, strict_type_checks):
    with open(filename, "r") as f:
        try:

            def hook(ps):
                return load_histograms_into_dict(ps, strict_type_checks)

            histograms = json.load(f, object_pairs_hook=hook)
        except ValueError as e:
            ParserError(
                "error parsing histograms in %s: %s" % (filename, e)
            ).handle_now()
    return histograms


def to_camel_case(property_name):
    return re.sub(
        "(^|_|-)([a-z0-9])",
        lambda m: m.group(2).upper(),
        property_name.strip("_").strip("-"),
    )


FILENAME_PARSERS = [
    (lambda x: from_json if x.endswith(".json") else None),
]


def from_files(filenames, strict_type_checks=True):
    """Return an iterator that provides a sequence of Histograms for
    the histograms defined in filenames.
    """
    if strict_type_checks:
        load_allowlist()

    all_histograms = OrderedDict()
    for filename in filenames:
        parser = None
        for checkFn in FILENAME_PARSERS:
            parser = checkFn(os.path.basename(filename))
            if parser is not None:
                break

        if parser is None:
            ParserError("Don't know how to parse %s." % filename).handle_now()

        histograms = parser(filename, strict_type_checks)

        # OrderedDicts are important, because then the iteration order over
        # the parsed histograms is stable, which makes the insertion into
        # all_histograms stable, which makes ordering in generated files
        # stable, which makes builds more deterministic.
        if not isinstance(histograms, OrderedDict):
            ParserError("Histogram parser did not provide an OrderedDict.").handle_now()

        for name, definition in histograms.items():
            if name in all_histograms:
                ParserError(
                    'Duplicate histogram name "%s".'
                    % name
                ).handle_later()
            all_histograms[name] = definition

    # Check that histograms that were removed from Histograms.json etc.
    # are also removed from the allowlists.
    if allowlists is not None:
        all_allowlist_entries = itertools.chain.from_iterable(iter(allowlists.values()))
        orphaned = set(all_allowlist_entries) - set(all_histograms.keys())
        if len(orphaned) > 0:
            msg = (
                "The following entries are orphaned and should be removed from "
                "histogram-allowlists.json:\n%s"
            )
            ParserError(msg % (", ".join(sorted(orphaned)))).handle_later()

    for name, definition in all_histograms.items():
        yield Histogram(name, definition, strict_type_checks=strict_type_checks)
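

if __name__ == "__main__":
    # Illustrative sketch only, not part of the build: construct a single
    # Histogram from a hypothetical, made-up definition and print its bucket
    # ranges. The histogram name and every field value below are invented for
    # demonstration. Because this module uses relative imports, this block
    # only runs when the file is invoked as a module from the package root
    # (e.g. with `python -m <package>.parse_histograms`).
    example_definition = {
        "kind": "exponential",
        "description": "A hypothetical timing histogram, in milliseconds.",
        "expires_in_version": "never",
        "low": 1,
        "high": 10000,
        "n_buckets": 10,
    }
    # strict_type_checks=False skips the allowlist and mandatory-field checks
    # that only apply to real Histograms.json entries.
    example = Histogram(
        "EXAMPLE_EXPONENTIAL_MS", example_definition, strict_type_checks=False
    )
    print(example.nsITelemetry_kind())
    print(example.ranges())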