#!/usr/bin/env python
from __future__ import print_function
import argparse
import datetime
import logging
import os
import os.path
import random
import re
import sys
from bisect import bisect_left
from collections import defaultdict
logger = logging.getLogger(__name__)
"""
This program scans a perforce versioned file repository as
maintained by a Perforce proxy, in order to keep its size
under control. It will delete the files with the least
recent access times to bring the number of files or the
total disk space used down to a given limit.
Can be run once a day by a cron script
"""
class DirNode:
    # a node in a directory tree, to keep track of paths
    # and store them in a more efficient way than full strings.
    def __init__(self):
        self.name = None
        self.children = defaultdict(DirNode)
        self.parent = None

    def insert(self, path):
        node = self
        for part in os.path.split(path):
            child = node.children[part]
            child.name = part
            child.parent = node
            node = child
        return node

    def path(self):
        parts = []
        node = self
        while node:
            if node.name:
                parts.append(node.name)
            node = node.parent
        return os.path.join(*reversed(parts))
rootnode = DirNode()
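
# A small illustration of DirNode (added for clarity, not in the original file):
# inserting a path and reading it back reproduces the original string, while
# paths in the same directory share a single parent node.
#
#   node = rootnode.insert("depot/main/foo.c,v")
#   assert node.path() == "depot/main/foo.c,v"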
def fake_timestamp(days):
    """
    Create a fake timestamp that is up to `days` days old
    """
    now = datetime.datetime.utcnow()
    # use the same UTC-based conversion as the rest of the script
    return timestamp_from_datetime(now - datetime.timedelta(days=random.randint(0, days)))
def find_versioned_files(root, fake=False):
    """
    Look for versioned files in the folder, those ending with known
    Perforce endings for such files. Return them as a list of
    (access time, file size, directory node, filename) tuples.
    """
    if fake:
        # return fake files for testing
        return [
            (
                fake_timestamp(30),
                random.randint(10, 10000000),
                rootnode.insert("foo"),
                "fakefile%d" % i,
            )
            for i in range(1000)
        ]
    entries = []
    for dirpath, dirnames, filenames in os.walk(root):
        for filename in filenames:
            if not is_versioned(filename):
                continue
            filepath = os.path.join(dirpath, filename)
            s = os.stat(filepath)
            entries.append((s.st_atime, s.st_size, rootnode.insert(dirpath), filename))
    return entries
# text files end with ,v.
# individual binary files are in folders ending with ,d and
# have names like 1.n where n is the version number.
# all can have a .gz tacked on.
BIN = re.compile(r"1\.\d+")
def is_versioned(filename):
    if filename.endswith(",v") or filename.endswith(".gz"):
        return True
    if BIN.match(filename):
        return True
    return False
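
# Examples of what is_versioned() accepts (the filenames here are illustrative):
# "foo.c,v", "foo.c,v.gz", "1.42" and "1.42.gz" are all treated as versioned
# files, while a plain "README" is skipped.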
def sum_size(files):
    """convenience function to add up the sizes of files"""
    return sum(t[1] for t in files)
def cumulative_sum(values):
    """create a cumulative sum, e.g. of file sizes"""
    it = iter(values)
    try:
        last = next(it)
    except StopIteration:
        return  # an empty input produces an empty cumulative sum
    yield last
    for v in it:
        last += v
        yield last
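
# For example (illustrative values): list(cumulative_sum([100, 200, 300]))
# evaluates to [100, 300, 600].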
def find_size_limit_old(files, limit_size):
    """
    Find the place in the list where the size limit is exceeded
    (linear-scan version, no longer called; see find_size_limit below)
    """
    size = sum_size(files)
    for i, f in enumerate(files):
        if size <= limit_size:
            return i  # we found the cut-off point
        size -= f[1]
    return i
def find_size_limit(cumulative_sizes, limit_size):
    """
    Using cumulative sizes, find the place of old files to throw away.
    Return the first index that we keep.
    """
    # what is the cumulative amount to throw away?
    target = cumulative_sizes[-1] - limit_size
    # special case for zero or negative values: we keep everything
    if target <= 0:
        return 0
    # find the index where the cumulative size is greater than or equal to what must go
    return find_ge(cumulative_sizes, target) + 1
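
# Worked example (values assumed for illustration): for file sizes
# [100, 200, 300, 400, 500] the cumulative sizes are [100, 300, 600, 1000, 1500].
# With a limit of 900 we must discard at least 600 bytes, so find_size_limit()
# returns index 3 and only the two youngest files are kept.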
def find_atime_limit(files, atime):
    """
    Find the place in the list where the access time is equal to or higher than the given one.
    If everything is older than that time (has a lower access time), return len(files).
    """
    # a one-element tuple sorts before any file entry with the same access time,
    # so the bisection never has to compare the non-timestamp fields
    target = (atime,)
    return find_ge(files, target)
# this comes from the bisect documentation
def find_ge(a, x):
    "Find leftmost item greater than or equal to x"
    i = bisect_left(a, x)
    return i
def unlink_files(files):
    """
    Unlinks every file in the given list. Relies on the order of the list:
    to delete the oldest files, the caller should pass entries sorted by
    access time. Returns the number of files actually removed.
    """
    n = 0
    for f in files:
        dirnode, filename = f[2], f[3]
        filepath = os.path.join(dirnode.path(), filename)
        try:
            os.unlink(filepath)
            n += 1
        except OSError:
            logger.warning("could not remove file %r", filepath)
    return n
def format_size(s):
    """
    helper to format large numbers as powers of 1024
    """
    pre = ["Ki", "Mi", "Gi", "Ti"]
    prefix = ""
    ss = float(s)
    while ss >= 1024 and pre:
        prefix = pre[0]
        del pre[0]
        ss /= 1024
    return ss, prefix
def format_size2(size):
    return "%.2f %sB" % format_size(size)
def timestamp_from_datetime(dt):
    """convert a naive UTC datetime into a POSIX timestamp"""
    timestamp = (dt - datetime.datetime(1970, 1, 1)).total_seconds()
    return timestamp
def main():
    def unitsize(a):
        """
        helper to parse sizes such as 100G or 1T
        """
        size = a
        for i, unit in enumerate("KMGT"):
            if a.endswith(unit):
                size = a[:-1]
                break
        else:
            i = -1
        try:
            size = float(size)
        except ValueError:
            raise argparse.ArgumentTypeError(
                "%s must be a number, optionally followed by K, M, G or T" % a
            )
        return size * 1024 ** (i + 1)
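
    # For illustration (not in the original): unitsize("100G") evaluates to
    # 100 * 1024 ** 3 and a plain "250000" to 250000.0.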
    parser = argparse.ArgumentParser(
        description="clean a Perforce proxy cache directory"
    )
    parser.add_argument("root", help="root of versioned files")
    parser.add_argument("--dry-run", action="store_true", help="do not delete anything")
    parser.add_argument(
        "--max-size", type=unitsize, default=0, help="maximum total size of files"
    )
    parser.add_argument(
        "--max-size-hard",
        type=unitsize,
        default=0,
        help="maximum total size of files, overriding --min-age",
    )
    parser.add_argument(
        "--max-count", type=int, default=0, help="maximum total number of files"
    )
    parser.add_argument(
        "--min-age", type=int, help="don't throw away anything younger than this (days)"
    )
    parser.add_argument(
        "--max-age", type=int, help="don't keep anything older than this (days)"
    )
    parser.add_argument("--verbose", "-v", action="store_true", help="be more verbose")
    parser.add_argument("--fake", action="store_true", help="dry run with fake files")
    args = parser.parse_args()
    # require at least one trimming criterion, unless we are just testing with fake files
    if not args.fake and not (args.max_size or args.max_count or args.max_age):
        parser.error("At least one of --max-size, --max-count, --max-age required")
    if args.fake:
        args.dry_run = True
    print("Trimming P4 versioned file folder %r:" % args.root)
    if args.verbose:
        print(" max-size: %s" % format_size2(args.max_size))
        print(" max-count: %r" % args.max_count)
        print(" max-age: %r" % args.max_age)
        print(" min-age: %r" % args.min_age)
        print(" max-size-hard: %s" % format_size2(args.max_size_hard))
    files = find_versioned_files(args.root, args.fake)
    size = sum_size(files)
    print("Found %d files, %s" % (len(files), format_size2(size)))
    if not files:
        return  # without any files, just quit
    # sort files according to access time, oldest first (lowest timestamp);
    # sort on the timestamp alone so that ties never compare the DirNode objects
    files.sort(key=lambda f: f[0])
    # and create the cumulative sum
    cumulative_sizes = list(cumulative_sum([f[1] for f in files]))
    # now apply the criteria
    i_keep = 0  # the index of the oldest (lowest timestamp) file we keep
    now = datetime.datetime.utcnow()
    # max-size, max-count and max-age each put an upper bound on the files we may keep,
    # i.e. a lower bound on i_keep; we take the largest (youngest) index they produce.
    if args.max_count > 0:
        i = i_keep
        i_keep = max(i_keep, len(files) - args.max_count)
        if i_keep != i:
            print(
                "--max-count=%r limiting kept files to %d"
                % (args.max_count, len(files) - i_keep)
            )
        elif args.verbose:
            print("--max-count=%r not limiting kept files" % (args.max_count,))
    if args.max_size > 0:
        i = i_keep
        i_keep = max(i_keep, find_size_limit(cumulative_sizes, args.max_size))
        if i_keep != i:
            print(
                "--max-size=%s limiting kept files to %d"
                % (format_size2(args.max_size), len(files) - i_keep)
            )
        elif args.verbose:
            print(
                "--max-size=%s not limiting kept files" % (format_size2(args.max_size),)
            )
    if args.max_age is not None:
        i = i_keep
        limit = now - datetime.timedelta(days=args.max_age)
        i_keep = max(i_keep, find_atime_limit(files, timestamp_from_datetime(limit)))
        if i_keep != i:
            print(
                "--max-age=%r limiting kept files to %d"
                % (args.max_age, len(files) - i_keep)
            )
        elif args.verbose:
            print("--max-age=%r not limiting kept files" % (args.max_age,))
    # min-age overrides all of the above: we don't throw away anything younger
    # than min-age days
    if args.min_age:
        i = i_keep
        limit = now - datetime.timedelta(days=args.min_age)
        i_keep = min(i_keep, find_atime_limit(files, timestamp_from_datetime(limit)))
        if i_keep != i:
            print(
                "--min-age=%r forcing kept files to %d"
                % (args.min_age, len(files) - i_keep)
            )
        elif args.verbose:
            print("--min-age=%r not forcing kept files" % (args.min_age,))
    # Even with all this, particularly with min-age, we might still have too many
    # files, so a hard size limit, if given, is applied last and wins over min-age.
    if args.max_size_hard > 0:
        i = i_keep
        i_keep = max(i_keep, find_size_limit(cumulative_sizes, args.max_size_hard))
        if i_keep != i:
            print(
                "--max-size-hard=%s limiting kept files to %d"
                % (format_size2(args.max_size_hard), len(files) - i_keep)
            )
        elif args.verbose:
            print(
                "--max-size-hard=%s not limiting kept files"
                % (format_size2(args.max_size_hard),)
            )
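
    # To illustrate the precedence of the criteria (numbers assumed): with 1000
    # files, --max-size trimming to i_keep=600, --min-age protecting everything
    # from index 400 onwards and --max-size-hard trimming to 500, i_keep ends up
    # at 500: min-age pulls it back to 400, but the hard size limit pushes it
    # forward again.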
    # perform the action
    if not args.dry_run:
        print("deleting %d files" % i_keep)
        n_removed = unlink_files(files[:i_keep])
    else:
        print("not deleting %d files (dry run)" % i_keep)
        n_removed = i_keep
    # output some statistics
    for what, part in ("Unlinked", files[:n_removed]), ("Remaining", files[n_removed:]):
        size = sum_size(part)
        count = len(part)
        print("%s:" % what)
        print(" %6d files, %s" % (count, format_size2(size)))
        if count:
            oldest = datetime.datetime.utcfromtimestamp(part[0][0])
            youngest = datetime.datetime.utcfromtimestamp(part[-1][0])
            print(" atime: %s to %s" % (oldest.isoformat(), youngest.isoformat()))
def test():
    # create a list of files with access times and sizes
    files = [
        (1, 100, "dir", "a"),
        (2, 200, "dir", "b"),
        (3, 300, "dir", "c"),
        (4, 400, "dir", "d"),
        (5, 500, "dir", "e"),
    ]
    files.sort()
    cumulative_sizes = list(cumulative_sum([f[1] for f in files]))
    assert cumulative_sizes[-1] == sum(f[1] for f in files)
    # Files are sorted by access time, oldest first. When throwing away files,
    # we work with the index of the first file we keep.
    # test find_size_limit(); it returns the first index that we keep.
    # for size 0, we keep nothing, so the first index to keep is 5 (one past the last)
    limit = find_size_limit(cumulative_sizes, 0)
    assert limit == 5
    # for size 10000, we keep all files, so the first index to keep is 0
    assert find_size_limit(cumulative_sizes, 10000) == 0
    # for size 900, we keep the last two files, so the first index to keep is 3
    assert find_size_limit(cumulative_sizes, 900) == 3
    # for size 950, we still only keep the last two files: 3
    assert find_size_limit(cumulative_sizes, 950) == 3
    # only when we reach 1200 do we keep the last three: 2
    assert find_size_limit(cumulative_sizes, 1200) == 2
    # check the access time limit
    # for time 0, we keep all files, so the first index to keep is 0
    assert find_atime_limit(files, 0) == 0
    # for time 1, we keep all files, so the first index to keep is 0
    assert find_atime_limit(files, 1) == 0
    # for time 2, we throw away the oldest file, so the first index to keep is 1
    assert find_atime_limit(files, 2) == 1
    # for time 5, we throw away all but the last file, so the first index to keep is 4
    assert find_atime_limit(files, 5) == 4
    # for time 6, we throw away all files, so the first index to keep is 5
    assert find_atime_limit(files, 6) == 5
    # test the DirNode tree by walking some real paths, if a directory was given
    found = []
    if len(sys.argv) > 2:
        for dirpath, _, filenames in os.walk(sys.argv[2]):
            for filename in filenames:
                name = os.path.join(dirpath, filename)
                node = rootnode.insert(name)
                assert node.path() == name
                found.append(node)
            if len(found) > 100:
                break
        print("walked %d paths" % len(found))
        # make sure all nodes are unique
        assert len(found) == len(set(found))
if __name__ == "__main__":
    if len(sys.argv) > 1 and sys.argv[1] == "--test":
        test()
        sys.exit(0)
    logging.basicConfig()
    main()