summaryrefslogtreecommitdiffstats
path: root/toolkit/components/glean/build_scripts/perf_data_review.py
blob: 8c84249a2a7194062ba367ccc7e71d1ae5504b7c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# -*- coding: utf-8 -*-

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

"""
Produce skeleton Performance Data Review Requests.

This was mostly copied from glean_parser, and should be kept in sync.
"""

import re
from pathlib import Path
from typing import Sequence

from glean_parser import parser, util


def generate(
    bug: str,
    metrics_files: Sequence[Path],
) -> int:
    """
    Commandline helper for Data Review Request template generation.

    :param bug: pattern to match in metrics' bug_numbers lists.
    :param metrics_files: List of Path objects to load metrics from.
    :return: Non-zero if there were any errors.
    """

    metrics_files = util.ensure_list(metrics_files)

    # Accept any value of expires.
    parser_options = {
        "allow_reserved": True,
        "custom_is_expired": lambda expires: False,
        "custom_validate_expires": lambda expires: True,
    }
    all_objects = parser.parse_objects(metrics_files, parser_options)

    if util.report_validation_errors(all_objects):
        return 1

    # Match `bug` only when it appears as a whole token: preceded by the
    # start of the string or a non-word character, and followed by a
    # non-word character or the end of the string. `re.escape` guards
    # against regex metacharacters in the user-supplied pattern, and the
    # non-capturing groups keep `findall` returning whole matches, so the
    # "exactly one occurrence" count below keeps its meaning.
    reobj = re.compile(rf"(?:^|\W){re.escape(bug)}(?:\W|$)")
    durations = set()
    responsible_emails = set()
    metrics_table = ""
    for category_name, metrics in all_objects.value.items():
        # Hoisted out of the metric loop: snake_case once per category.
        snake_category = util.snake_case(category_name)
        for metric in metrics.values():
            # Keep only metrics that reference `bug` exactly once in their
            # bug_numbers list.
            if not any(
                len(reobj.findall(bug_entry)) == 1 for bug_entry in metric.bugs
            ):
                continue

            metric_name = util.snake_case(metric.name)
            one_line_desc = metric.description.replace("\n", " ")
            sensitivity = ", ".join([s.name for s in metric.data_sensitivity])
            last_bug = metric.bugs[-1]
            metrics_table += f"`{snake_category}.{metric_name}` | "
            metrics_table += f"{one_line_desc} | {sensitivity} | {last_bug}\n"
            # Events get one additional table row per allowed extra key.
            if metric.type == "event" and len(metric.allowed_extra_keys):
                for extra_name, extra_detail in metric.extra_keys.items():
                    extra_one_line_desc = extra_detail["description"].replace("\n", " ")
                    metrics_table += f"`{snake_category}.{metric_name}#{extra_name}` | "
                    metrics_table += (
                        f"{extra_one_line_desc} | {sensitivity} | {last_bug}\n"
                    )

            durations.add(metric.expires)

            # Permanent collections need named owners (question 7 below).
            if metric.expires == "never":
                responsible_emails.update(metric.notification_emails)

    # Bail out before formatting anything if no metric matched the bug.
    if len(durations) == 0:
        print(f"I'm sorry, I couldn't find metrics matching the bug number {bug}.")
        return 1

    if len(durations) == 1:
        duration = next(iter(durations))
        if duration == "never":
            collection_duration = "This collection will be collected permanently."
        else:
            collection_duration = f"This collection has expiry '{duration}'"
    else:
        collection_duration = "Parts of this collection expire at different times: "
        collection_duration += f"{durations}"

    if "never" in durations:
        collection_duration += "\n" + ", ".join(responsible_emails) + " "
        collection_duration += "will be responsible for the permanent collections."

    # This template is pulled from
    # https://github.com/mozilla/data-review/blob/main/request.md
    print(
        """
!! Reminder: it is your responsibility to complete and check the correctness of
!! this automatically-generated request skeleton before requesting Data
!! Collection Review. See https://wiki.mozilla.org/Data_Collection for details.

DATA REVIEW REQUEST
1. What questions will you answer with this data?

TODO: Fill this in.

2. Why does Mozilla need to answer these questions? Are there benefits for users?
   Do we need this information to address product or business requirements?

In order to guarantee the performance of our products, it is vital to monitor
real-world installs used by real-world users.

3. What alternative methods did you consider to answer these questions?
   Why were they not sufficient?

Our ability to measure the practical performance impact of changes through CI
and manual testing is limited. Monitoring the performance of our products in
the wild among real users is the only way to be sure we have an accurate
picture.

4. Can current instrumentation answer these questions?

No.

5. List all proposed measurements and indicate the category of data collection for each
   measurement, using the Firefox data collection categories found on the Mozilla wiki.

Measurement Name | Measurement Description | Data Collection Category | Tracking Bug
---------------- | ----------------------- | ------------------------ | ------------"""
    )
    print(metrics_table)
    print(
        """
6. Please provide a link to the documentation for this data collection which
   describes the ultimate data set in a public, complete, and accurate way.

This collection is Glean so is documented
[in the Glean Dictionary](https://dictionary.telemetry.mozilla.org).

7. How long will this data be collected?
"""
    )
    print(collection_duration)
    print(
        """
8. What populations will you measure?

All channels, countries, and locales. No filters.

9. If this data collection is default on, what is the opt-out mechanism for users?

These collections are Glean. The opt-out can be found in the product's preferences.

10. Please provide a general description of how you will analyze this data.

This will be continuously monitored for regression and improvement detection.

11. Where do you intend to share the results of your analysis?

Internal monitoring (GLAM, Redash, Looker, etc.).

12. Is there a third-party tool (i.e. not Telemetry) that you
    are proposing to use for this data collection?

No.
"""
    )

    return 0