· 7 years ago · Feb 27, 2018, 05:30 AM
1#!/usr/bin/env python
2# coding: utf-8
3# copyright: 2011, Igor Katson, igor.katson@gmail.com
4
"""What this script does is logarithmically keep files. That means,
when you provide a dir or a file pattern to it, it can calculate
which files to keep based on the following parameters:

- keep 1 file each day for --days days,
- keep 1 file each week for --weeks weeks (after --days processing),
- keep 1 file each month for --months months (after --weeks processing).

Use ./rotater --help for help
"""
15
16import os
17import datetime
18import sys
19import re
20import optparse
21import logging
22import types
23import urlparse
24import ConfigParser
25
26from django.conf import settings
27
28settings.configure()
29
30from django.core.exceptions import ImproperlyConfigured
31from django.core.files.storage import FileSystemStorage, Storage
32
log = logging.getLogger('rotater.py')

# Compiled patterns that capture (year, month, day) from a filename:
# the dashed form (e.g. 2018-02-27) and the compact form (e.g. 20180227).
# Only years 20xx are recognised.
DATE_RE = tuple(
    re.compile(pattern)
    for pattern in (
        r'(20\d{2})-(\d{2})-(\d{2})',
        r'(20\d{2})(\d{2})(\d{2})',
    )
)
40
41
class WalkingStorageMixin(object):
    """Mixin that adds an ``os.walk()``-like traversal to a storage class.

    The class it is mixed into must provide ``listdir(path)`` returning a
    ``(directories, files)`` pair; that is the only storage method used here.
    """

    def __init__(self, *args, **kwargs):
        """Pop the optional ``walk_top`` keyword and forward everything else.

        ``walk_top`` is the directory ``walk()`` starts from when it is
        called without ``top``; it defaults to the empty string.
        """
        self.walk_top = kwargs.pop('walk_top', '')
        super(WalkingStorageMixin, self).__init__(*args, **kwargs)

    def walk(self, top=None, topdown=True, onerror=None):
        """An implementation of os.walk() which uses the storage's
        ``listdir()`` for listing directories.

        Yields ``(top, dirs, nondirs)`` for ``top`` and every directory
        below it.

        ``top``     -- directory to start from; falls back to
                       ``self.walk_top`` when falsy.
        ``topdown`` -- when true a directory is yielded before its
                       subdirectories, otherwise after them.
        ``onerror`` -- optional callable invoked with the ``os.error``
                       raised by ``listdir()``; the failing directory is
                       then skipped. Without it such errors are ignored.
        """
        top = top or self.walk_top
        try:
            dirs, nondirs = self.listdir(top)
        except os.error as err:
            if onerror is not None:
                onerror(err)
            return

        if topdown:
            yield top, dirs, nondirs
        for name in dirs:
            new_path = os.path.join(top, name)
            # Forward topdown/onerror so the requested ordering and error
            # reporting apply to the whole tree, not just the first level.
            for entry in self.walk(new_path, topdown, onerror):
                yield entry
        if not topdown:
            yield top, dirs, nondirs
68
class WalkingFileStorage(WalkingStorageMixin, FileSystemStorage):
    """``FileSystemStorage`` combined with ``WalkingStorageMixin``."""
71
# S3 support is optional: when the s3boto backend cannot be imported, both
# names are bound to None.
try:
    from storages.backends.s3boto import S3BotoStorage
except ImportError:
    S3BotoStorage = None
    WalkingS3Storage = None
else:
    class WalkingS3Storage(WalkingStorageMixin, S3BotoStorage):
        """``S3BotoStorage`` combined with ``WalkingStorageMixin``."""
81
82
def get_storage_by_path(path, **options):
    """Return a walkable storage for ``path``.

    ``path`` is either a local directory or an S3 URL of the form
    ``s3://BUCKET[/DIRNAME]``. Local paths produce a ``WalkingFileStorage``
    rooted at ``path``; S3 URLs produce a ``WalkingS3Storage`` for the
    bucket whose ``walk()`` starts at ``DIRNAME``.

    Recognised ``options`` (all others are ignored):
    ``amazon_access_key`` / ``amazon_secret_key`` -- explicit credentials.

    Raises ``ValueError`` when the S3 URL has no bucket or no credentials
    can be found, and ``ImproperlyConfigured`` when an S3 URL is given but
    the s3boto storage backend could not be imported.
    """

    def get_amazon_auth(options):
        """Return ``(access_key, secret_key)`` from the first source that
        supplies both: options, environment, then ``~/.s3cfg``."""
        # Try to get keys from options.
        key = options.get('amazon_access_key')
        secret = options.get('amazon_secret_key')
        if key and secret:
            return key, secret

        # Try to get keys from environment.
        key = os.environ.get('AWS_ACCESS_KEY_ID')
        secret = os.environ.get('AWS_SECRET_ACCESS_KEY')
        if key and secret:
            return key, secret

        # Try to get keys from ~/.s3cfg, the file used by s3cmd.
        s3cfg = os.path.expanduser('~/.s3cfg')
        if os.path.exists(s3cfg):
            parser = ConfigParser.ConfigParser()
            try:
                parser.read([s3cfg])
                key = parser.get('default', 'access_key')
                secret = parser.get('default', 'secret_key')
            except ConfigParser.Error:
                # A malformed file, or one without the [default] section or
                # the key options, simply means "no credentials here".
                key = secret = None
            if key and secret:
                return key, secret

        raise ValueError('AWS access credentials not provided.')

    if not path.startswith('s3://'):
        return WalkingFileStorage(path)

    if WalkingS3Storage is None:
        # The optional import at module level failed; say so explicitly
        # instead of failing with "'NoneType' object is not callable".
        raise ImproperlyConfigured(
            'S3 URLs require the storages.backends.s3boto backend '
            '(django-storages), which could not be imported.')

    parsed = urlparse.urlparse(path)
    if not parsed.netloc:
        raise ValueError(
            'You should provide at least a bucket name, e.g. s3://BUCKET/')

    # Drop the leading "/" so the walk root is relative to the bucket.
    walk_top = parsed.path
    if walk_top:
        walk_top = walk_top[1:]

    access_key, secret_key = get_amazon_auth(options)
    return WalkingS3Storage(
        bucket=parsed.netloc,
        walk_top=walk_top,
        access_key=access_key,
        secret_key=secret_key,
    )
126
127
class BaseRotater(object):
    """A base class for rotaters, override files_to_delete for it to work.

    ``storage`` -- a ``Storage`` instance that also provides ``walk()``.
    ``regex``   -- optional pattern (string or compiled); only files whose
                   full path matches it are considered.
    ``recurse`` -- keyword option, default False; when false only the first
                   directory produced by ``storage.walk()`` is processed.
    Any other keyword arguments are kept in ``self.options``;
    ``date_from_filename`` (default True) is the one read by this class.
    """

    def __init__(self, storage, regex=None, **kwargs):
        assert isinstance(storage, Storage)
        self.storage = storage
        # (str, type(u'')) covers the same types as ``basestring`` on
        # Python 2 and also works where ``basestring`` does not exist.
        if isinstance(regex, (str, type(u''))):
            regex = re.compile(regex)
        self.regex = regex
        self.recurse = kwargs.pop('recurse', False)
        self.options = kwargs

    def walk(self):
        """Walk through all files and dirs, that are candidates for removal.

        Yields ``(dirpath, dirnames, filenames)`` tuples from the storage,
        with ``filenames`` reduced to those whose joined path matches
        ``self.regex`` when a regex is set.
        """
        for dirpath, dirnames, filenames in self.storage.walk():
            if self.regex:
                filenames = [
                    name for name in filenames
                    if self.regex.match(os.path.join(dirpath, name))
                ]
            yield dirpath, dirnames, filenames
            if not self.recurse:
                # A plain return ends the generator; raising StopIteration
                # here turns into RuntimeError under PEP 479.
                return

    def files_to_delete(self):
        """Return a list of files to be deleted"""
        raise NotImplementedError

    def files_to_keep(self):
        """Get files to keep based on files to delete.

        Yields the full path of every candidate file that
        ``files_to_delete()`` did not return.
        """
        doomed = set(self.files_to_delete())
        for dirpath, _dirnames, filenames in self.walk():
            for name in filenames:
                path = os.path.join(dirpath, name)
                if path not in doomed:
                    yield path

    def rotate(self):
        """Delete every file returned by ``files_to_delete()``."""
        for path in self.files_to_delete():
            log.info('Deleting %s', path)
            self.storage.delete(path)

    def _get_mtime(self, filename):
        """Get modification time of the file based on filename or mtime.

        Unless the ``date_from_filename`` option is false, each ``DATE_RE``
        pattern is searched for in ``filename`` and the first match that
        forms a valid calendar date is returned as a ``datetime`` at
        midnight. Otherwise, or when no valid date is found, the result of
        ``self.storage.modified_time(filename)`` is returned.
        """
        if self.options.get('date_from_filename', True):
            for pattern in DATE_RE:
                match = pattern.search(filename)
                if not match:
                    continue
                year, month, day = match.groups()
                try:
                    return datetime.datetime(int(year), int(month), int(day))
                except ValueError:
                    # The digits matched but are not a real date (for
                    # example month 13); try the next pattern.
                    pass
        return self.storage.modified_time(filename)
183
184
class LogarithmicRotater(BaseRotater):
    """Rotater that thins files out as they get older.

    Counting back from today it keeps one file per day for ``days`` days,
    then one per week for ``weeks`` weeks, then one per month for
    ``months`` months (a month is counted as 30 days); anything older is
    deleted.
    """

    DEFAULT_DAYS = 14
    DEFAULT_WEEKS = 12
    DEFAULT_MONTHS = 36

    def __init__(self, *args, **kwargs):
        """Accept ``days``, ``weeks`` and ``months`` keywords (``None`` or
        absent selects the class default) and pass the rest to the base
        class."""
        self.days = kwargs.pop('days', None)
        if self.days is None:
            self.days = self.DEFAULT_DAYS
        self.weeks = kwargs.pop('weeks', None)
        if self.weeks is None:
            self.weeks = self.DEFAULT_WEEKS
        self.months = kwargs.pop('months', None)
        if self.months is None:
            self.months = self.DEFAULT_MONTHS
        super(LogarithmicRotater, self).__init__(*args, **kwargs)

    def _logarithmic_rotate(self, files):
        """Files is an iterable of files to check for deletion

        files argument yields 2-tuples with mtime (a datetime) and filename.

        - keep 1 file each day for self.days days,
        - keep 1 file each week for self.weeks weeks (after --days processing)
        - keep 1 file each month for self.months months (after --weeks
          processing)

        Within each day/week/month the file kept is the first one
        encountered in ``files``; later ones in the same period are yielded.
        Files dated after today are always kept.

        Yield the files which can be deleted
        """
        start_rotate = datetime.date.today()
        weeks_start = start_rotate - datetime.timedelta(days=self.days)
        months_start = weeks_start - datetime.timedelta(weeks=self.weeks)
        end_rotate = months_start - datetime.timedelta(days=self.months * 30)

        # One set per granularity. A month key (first of the month) and a
        # week key (that week's Monday) can be the same date, so sharing a
        # single set would make a week's only file look like a duplicate of
        # an already-kept month (or the other way round).
        kept_days = set()
        kept_weeks = set()
        kept_months = set()

        for mtime, path in files:
            mdate = mtime.date()
            if mdate <= end_rotate:
                # Older than the whole retention window.
                yield path
                continue
            elif mdate <= months_start:
                # Keep one file for each month
                kept = kept_months
                period = mdate.replace(day=1)
            elif mdate <= weeks_start:
                # Keep one file for each week
                kept = kept_weeks
                period = mdate - datetime.timedelta(days=mdate.weekday())
            elif mdate <= start_rotate:
                # Keep one file for each day
                kept = kept_days
                period = mdate
            else:
                # The file seems to be from future, keep it
                continue

            if period in kept:
                yield path
            else:
                kept.add(period)

    def files_to_delete(self):
        """Yield the full paths of files that can be deleted.

        Every directory produced by ``self.walk()`` is rotated on its own,
        so the one-per-period rule applies per directory.
        """
        for top, _dirs, names in self.walk():
            paths = [os.path.join(top, name) for name in names]
            dated = ((self._get_mtime(path), path) for path in paths)
            for path in self._logarithmic_rotate(dated):
                yield path
262
263
if __name__ == '__main__':
    # Command-line entry point: parse options, build the storage for the
    # given path and either rotate it or, in the test modes, only report
    # what would be deleted/kept.

    optparser = optparse.OptionParser(
        usage="""usage: %prog [options] path

"path" may be a path to a directory, or an s3 URL e.g.
s3://BUCKET[/DIRNAME]""")

    optparser.add_option(
        '--regex', help='Optional regex to match filenames for.'
        ' The full path will be matched.')
    optparser.add_option(
        '--days', type='int',
        help='keep 1 file each day for --days days [%default]',
        default=LogarithmicRotater.DEFAULT_DAYS)
    optparser.add_option(
        '--weeks', type='int',
        help='keep 1 file each week for --weeks weeks [%default]',
        default=LogarithmicRotater.DEFAULT_WEEKS)
    optparser.add_option(
        '--months', type='int',
        help='keep 1 file each month for --months months [%default]',
        default=LogarithmicRotater.DEFAULT_MONTHS)
    optparser.add_option(
        '--no-date-from-filename', action='store_false',
        help='do not try to guess modification time from filename [%default]',
        default=True, dest='date_from_filename')
    optparser.add_option(
        '--test', '--test-delete', action='store_true', default=False,
        dest='test_delete',
        help='test mode, no files will be deleted, the ones to '
        'DELETE will be printed to stdout [%default]')
    optparser.add_option(
        '--test-keep', action='store_true', default=False,
        help='test mode, no files will be deleted, the ones to '
        'KEEP will be printed to stdout [%default]')
    optparser.add_option(
        '--recurse', action='store_true', default=False,
        help='Recurse into subdirectories [%default]',
    )
    optparser.add_option(
        '--loglevel',
        default='DEBUG',
        help='One of DEBUG, WARNING, INFO, ERROR [%default]')
    optparser.add_option('--amazon_access_key', type='string')
    optparser.add_option('--amazon_secret_key', type='string')

    options, args = optparser.parse_args()

    if len(args) != 1:
        # sys.stderr.write emits the same text as the old print statement
        # without relying on Python-2-only syntax.
        sys.stderr.write(
            '%s takes only 1 argument\nAborting...\n' % sys.argv[0])
        sys.exit(1)

    target = args[0]

    loglevels = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
    }

    loglevel = loglevels.get(options.loglevel.lower())
    if loglevel is None:
        # Report a bad value as a usage error instead of a KeyError
        # traceback.
        optparser.error(
            'invalid --loglevel %r, expected one of DEBUG, WARNING, INFO, '
            'ERROR' % options.loglevel)

    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
    log.addHandler(handler)
    log.setLevel(loglevel)

    # Every parsed option is forwarded as a keyword argument; the storage
    # factory and the rotater each pick out the ones they understand.
    option_values = vars(options)
    storage = get_storage_by_path(target, **option_values)
    rotater = LogarithmicRotater(storage, **option_values)

    if options.test_delete:
        for path in sorted(rotater.files_to_delete()):
            log.info('will delete "%s"', path)
    elif options.test_keep:
        for path in sorted(rotater.files_to_keep()):
            log.info('will keep "%s"', path)
    else:
        rotater.rotate()
343 rotater.rotate()