path: root/src/fluent-bit/lib/librdkafka-2.1.0/packaging/nuget/cleanup-s3.py
#!/usr/bin/env python3
#
# Clean up test builds from librdkafka's S3 bucket.
# This also covers Python builds.
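#
# Example usage (illustrative invocations; the flags are defined in the
# __main__ section below):
#   ./cleanup-s3.py --age 180             # dry-run: only list eligible objects
#   ./cleanup-s3.py --age 180 --delete    # actually delete eligible objects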

import re
from datetime import datetime, timezone
import boto3
import argparse

# The script scans the CI artifacts stored in the S3 bucket and decides,
# per object, whether it is eligible for deletion based on its age and on
# the tokens encoded in its folder name.
#
# The artifacts' folder in the S3 bucket must have the following token
# format:
#  <token>-[<value>]__   (repeat)
#
# Recognized tokens (unrecognized tokens are ignored):
#  p       - project (e.g., "confluent-kafka-python")
#  bld     - builder (e.g., "travis")
#  plat    - platform ("osx", "linux", ..)
#  arch    - arch ("x64", ..)
#  tag     - git tag
#  sha     - git sha
#  bid     - builder's build-id
#  bldtype - Release, Debug (appveyor)
#  lnk     - std, static
#
# Example:
#   librdkafka/p-librdkafka__bld-travis__plat-linux__arch-x64__tag-v0.0.62__sha-d051b2c19eb0c118991cd8bc5cf86d8e5e446cde__bid-1562.1/librdkafka.tar.gz
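#
# For illustration, the folder name in the example above encodes these
# tokens (decomposition assumed from the format description):
#   p    -> librdkafka
#   bld  -> travis
#   plat -> linux
#   arch -> x64
#   tag  -> v0.0.62
#   sha  -> d051b2c19eb0c118991cd8bc5cf86d8e5e446cde
#   bid  -> 1562.1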


s3_bucket = 'librdkafka-ci-packages'


def may_delete(path):
    """ Returns true if S3 object path is eligible for deletion, e.g.
        has a non-release/rc tag. """

    # The path contains the tokens needed to perform
    # matching of project, gitref, etc.
    rinfo = re.findall(r'(?P<tag>[^-]+)-(?P<val>.*?)(?:__|$)', path)
    if not rinfo:
        print(f"Incorrect folder/file name format for {path}")
        return False

    info = dict(rinfo)

    tag = info.get('tag', None)
    if tag is not None and (len(tag) == 0 or tag.startswith('$(')):
        # AppVeyor doesn't substitute $(APPVEYOR_REPO_TAG_NAME)
        # with an empty value when the tag is not set; it leaves the
        # token in the string, so treat that as no tag.
        del info['tag']
        tag = None

    if tag is None:
        return True

    if re.match(r'^v?\d+\.\d+\.\d+(-?RC\d+)?$', tag,
                flags=re.IGNORECASE) is None:
        return True

    return False
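

# Illustrative examples (hypothetical paths, following the token format above):
#   may_delete("p-librdkafka__bld-travis__sha-abc123__bid-1/librdkafka.tar.gz")
#       -> True   (no tag token, so this is a test build)
#   may_delete("p-librdkafka__tag-v2.1.0__sha-abc123__bid-1/librdkafka.tar.gz")
#       -> False  (proper release tag, always kept)
#   may_delete("p-librdkafka__tag-v2.1.0-RC1__sha-abc123__bid-1/librdkafka.tar.gz")
#       -> False  (RC tags are kept as well)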


def collect_s3(s3, min_age_days=60):
    """ Collect artifacts from S3 """
    now = datetime.now(timezone.utc)
    eligible = []
    totcnt = 0
    # Note: list_objects_v2 returns at most 1000 objects per call;
    #       follow the continuation token to read the full listing.
    cont_token = None
    more = True
    while more:
        if cont_token is not None:
            res = s3.list_objects_v2(Bucket=s3_bucket,
                                     ContinuationToken=cont_token)
        else:
            res = s3.list_objects_v2(Bucket=s3_bucket)

        if res.get('IsTruncated') is True:
            cont_token = res.get('NextContinuationToken')
        else:
            more = False

        for item in res.get('Contents', []):
            totcnt += 1
            age = (now - item.get('LastModified')).days
            path = item.get('Key')
            if age >= min_age_days and may_delete(path):
                eligible.append(path)

    return (eligible, totcnt)
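

# A minimal alternative sketch (not used by this script): boto3's built-in
# paginator can replace the manual continuation-token loop above. The
# function name is hypothetical.
def collect_s3_paginated(s3, min_age_days=60):
    """ Same as collect_s3(), but let boto3 handle the pagination. """
    now = datetime.now(timezone.utc)
    eligible = []
    totcnt = 0
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=s3_bucket):
        for item in page.get('Contents', []):
            totcnt += 1
            age = (now - item.get('LastModified')).days
            path = item.get('Key')
            if age >= min_age_days and may_delete(path):
                eligible.append(path)
    return (eligible, totcnt)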


def chunk_list(lst, cnt):
    """ Split list into lists of cnt """
    for i in range(0, len(lst), cnt):
        yield lst[i:i + cnt]
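
# For example, list(chunk_list(['a', 'b', 'c'], 2)) yields [['a', 'b'], ['c']].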


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument("--delete",
                        help="WARNING! Don't just check, actually delete "
                        "S3 objects.",
                        action="store_true")
    parser.add_argument("--age", help="Minimum object age in days.",
                        type=int, default=360)

    args = parser.parse_args()
    dry_run = not args.delete
    min_age_days = args.age

    if dry_run:
        op = "Eligible for deletion"
    else:
        op = "Deleting"

    s3 = boto3.client('s3')

    # Collect eligible artifacts
    eligible, totcnt = collect_s3(s3, min_age_days=min_age_days)
    print(f"{len(eligible)}/{totcnt} eligible artifacts to delete")

    # Delete in chunks of 1000 objects (the maximum delete_objects accepts per request)
    for chunk in chunk_list(eligible, 1000):
        print(op + ":\n" + '\n'.join(chunk))
        if dry_run:
            continue

        res = s3.delete_objects(Bucket=s3_bucket,
                                Delete={
                                    'Objects': [{'Key': x} for x in chunk],
                                    'Quiet': True
                                })
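        # With 'Quiet': True, S3 reports only failed deletions; each entry in
        # 'Errors' carries the object 'Key' plus an error 'Code' and 'Message'.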
        errors = res.get('Errors', [])
        if len(errors) > 0:
            raise Exception(f"Delete failed: {errors}")