· 6 years ago · Oct 16, 2019, 01:45 PM
def s3list(bucket, path, start=None, end=None, recursive=True, list_dirs=True,
           list_objs=True, limit=None):
    """
    Iterator that lists a bucket's objects under path, (optionally) starting with
    start and ending before end.

    If recursive is False, then list only the "depth=0" items (dirs and objects).

    If recursive is True, then list recursively all objects (no dirs).

    Args:
        bucket:
            a boto3.resource('s3').Bucket().
        path:
            a directory in the bucket. May be empty to list from the bucket
            root.
        start:
            optional: start key, inclusive (may be a relative path under path, or
            absolute in the bucket)
        end:
            optional: stop key, exclusive (may be a relative path under path, or
            absolute in the bucket)
        recursive:
            optional, default True. If True, lists only objects. If False, lists
            only depth 0 "directories" and objects.
        list_dirs:
            optional, default True. Has no effect in recursive listing. On
            non-recursive listing, if False, then directories are omitted.
        list_objs:
            optional, default True. If False, then objects are omitted.
        limit:
            optional. If specified, then lists at most this many items.

    Returns:
        an iterator of S3Obj.

    Examples:
        # set up
        #>>> s3 = boto3.resource('s3')
        #... bucket = s3.Bucket(name)

        # iterate through all S3 objects under some dir
        #>>> for p in s3list(bucket, 'some/dir'):
        #...     print(p)

        # iterate through up to 20 S3 objects under some dir, starting with foo_0010
        #>>> for p in s3list(bucket, 'some/dir', limit=20, start='foo_0010'):
        #...     print(p)

        # non-recursive listing under some dir:
        #>>> for p in s3list(bucket, 'some/dir', recursive=False):
        #...     print(p)

        # non-recursive listing under some dir, listing only dirs:
        #>>> for p in s3list(bucket, 'some/dir', recursive=False, list_objs=False):
        #...     print(p)
    """
    # S3 keys always use '/' as separator, so join with posixpath instead of
    # os.path (which would insert a backslash on Windows).
    import posixpath

    kwargs = dict()
    if start is not None:
        if not start.startswith(path):
            start = posixpath.join(path, start)
        # note: need to use a string just smaller than start, because
        # the list_object API specifies that start is excluded (the first
        # result is *after* start).
        kwargs.update(Marker=__prev_str(start))
    if end is not None:
        if not end.startswith(path):
            end = posixpath.join(path, end)
    if not recursive:
        kwargs.update(Delimiter='/')
        # Only append the separator to a non-empty path: an empty path means
        # "bucket root", and a Prefix of '/' would not match root-level keys.
        if path and not path.endswith('/'):
            path += '/'
    kwargs.update(Prefix=path)
    if limit is not None:
        kwargs.update(PaginationConfig={'MaxItems': limit})

    # Number of items still allowed to be yielded (None means unlimited).
    remaining = limit

    paginator = bucket.meta.client.get_paginator('list_objects')
    for resp in paginator.paginate(Bucket=bucket.name, **kwargs):
        q = []
        if list_dirs:
            # "directories" carry only a key; the other S3Obj fields are None
            q = [S3Obj(f['Prefix'], None, None, None)
                 for f in resp.get('CommonPrefixes') or ()]
        if list_objs:
            q += [S3Obj(f['Key'], f['LastModified'], f['Size'], f['ETag'])
                  for f in resp.get('Contents') or ()]
        # note: even with sorted lists, it is faster to sort(a+b)
        # than heapq.merge(a, b) at least up to 10K elements in each list
        q = sorted(q, key=attrgetter('key'))
        if remaining is not None:
            q = q[:remaining]
            remaining -= len(q)
        for p in q:
            # end is exclusive: stop at the first key at or beyond it
            if end is not None and p.key >= end:
                return
            yield p
        if remaining is not None and remaining == 0:
            # limit reached: do not request any further pages
            return
93
94
def __prev_str(s):
    """Return a string that sorts just before ``s``.

    The last character of ``s`` is replaced by the character with the
    preceding code point, and ten ``'\\u7FFF'`` characters are appended so
    that the result sits close below ``s`` rather than far below it.

    Args:
        s: the string to step back from.

    Returns:
        ``s`` unchanged if it is empty; ``s`` without its last character if
        that character is NUL (there is no smaller character to substitute,
        and the shortened string is the immediate predecessor); otherwise the
        decremented, padded string described above.
    """
    if not s:
        return s
    stem, last = s[:-1], ord(s[-1])
    if last == 0:
        # Padding here would produce a string *greater* than s, so return the
        # bare stem, which is strictly smaller.
        return stem
    return stem + chr(last - 1) + '\u7FFF' * 10
102 return s