· 6 years ago · Mar 23, 2020, 11:10 AM
1# Copyright (C) 2019 The Software Heritage developers
2# See the AUTHORS file at the top-level directory of this distribution
3# License: GNU General Public License version 3, or any later version
4# See top-level LICENSE file for more information
5
6"""Python client for the Software Heritage Web API
7
8Light wrapper around requests for the archive API, taking care of data
9conversions and pagination.
10
11>>> from webclient import WebAPIClient
12>>> cli = WebAPIClient()
13>>> cli.get('swh:1:rev:aafb16d69fd30ff58afdd69036a26047f3aebdc6')
14{'id': PersistentId(namespace='swh', scheme_version=1, object_type='revision',
15 object_id='aafb16d69fd30ff58afdd69036a26047f3aebdc6',
16 metadata={}),
17 'author': {
18 'fullname': 'Nicolas Dandrimont <nicolas.dandrimont@crans.org>',
19 'name': 'Nicolas Dandrimont',
20 'email': 'nicolas.dandrimont@crans.org'
21 },
22 'date': datetime.datetime(2014, 8, 18, 18, 18, 25,
23 tzinfo=tzoffset(None, 7200)),
24 'committer': {
25 'fullname': 'Nicolas Dandrimont <nicolas.dandrimont@crans.org>',
26 'name': 'Nicolas Dandrimont',
27 'email': 'nicolas.dandrimont@crans.org'
28 },
29 'committer_date': datetime.datetime(2014, 8, 18, 18, 18, 25,
30 tzinfo=tzoffset(None, 7200)),
31 'type': 'git',
32 'directory': PersistentId(namespace='swh', scheme_version=1,
33 object_type='directory',
34 object_id='9f2e5898e00a66e6ac11033959d7e05b1593353b',
35 metadata={}),
36 'message': "Merge branch 'master' into pr/584\n",
37 'metadata': {},
38 'synthetic': False,
39 'parents': [
40 {'id': PersistentId(namespace='swh', scheme_version=1,
41 object_type='revision',
42 object_id='26307d261279861c2d9c9eca3bb38519f951bea4',
43 metadata={}),
44 'url': '/api/1/revision/26307d261279861c2d9c9eca3bb38519f951bea4/'},
45 {'id': PersistentId(namespace='swh', scheme_version=1,
46 object_type='revision',
47 object_id='37fc9e08d0c4b71807a4f1ecb06112e78d91c283',
48 metadata={}),
49 'url': '/api/1/revision/37fc9e08d0c4b71807a4f1ecb06112e78d91c283/'}
50 ],
51 'merge': True,
52 'url': '/api/1/revision/aafb16d69fd30ff58afdd69036a26047f3aebdc6/',
53 'history_url': '/api/1/revision/aafb16d69fd30ff58afdd69036a26047f3aebdc6/log/',
54 'directory_url': '/api/1/directory/9f2e5898e00a66e6ac11033959d7e05b1593353b/'
55}
56
57"""
58
import argparse
import json
from typing import Any, Dict, Generator, List, Tuple, Union
from urllib.parse import urlparse

import dateutil.parser
import requests

from swh.model.identifiers import \
    SNAPSHOT, REVISION, RELEASE, DIRECTORY, CONTENT
from swh.model.identifiers import PersistentId as PID
from swh.model.identifiers import parse_persistent_identifier as parse_pid
69
70
# anything accepted where an object identifier is expected: either an already
# built PID or a string to be parsed into one
PIDish = Union[PID, str]


def _get_pid(pidish: PIDish) -> PID:
    """return pidish as a PID, parsing it first if it is given as a string"""
    return parse_pid(pidish) if isinstance(pidish, str) else pidish
80
81
def typify(data: Any, obj_type: str) -> Any:
    """type API responses using pythonic types where appropriate

    the following conversions are performed, in place, on data:

    - identifiers are converted from strings to PersistentId instances
    - timestamps are converted from strings to datetime.datetime objects

    Args:
        data: decoded JSON payload describing one object; for snapshots, the
            mapping from branch names to branch targets; for directories, the
            list of directory entries
        obj_type: type of the object described by data, one of: SNAPSHOT,
            REVISION, RELEASE, DIRECTORY, CONTENT

    Returns:
        data itself, after conversion

    Raises:
        ValueError: if obj_type, or the type of a directory entry, is invalid

    """
    def to_pid(object_type, s):
        return PID(object_type=object_type, object_id=s)

    def to_date(s):
        return dateutil.parser.parse(s)

    def obj_type_of_entry_type(s):
        if s == 'file':
            return CONTENT
        elif s == 'dir':
            return DIRECTORY
        elif s == 'rev':
            return REVISION
        else:
            raise ValueError(f'invalid directory entry type: {s}')

    if obj_type == SNAPSHOT:
        for target in data.values():
            # alias branches name another branch rather than an archived
            # object (see the Web API snapshot documentation), so there is
            # no identifier to build for them: their target is left as is
            if target['target_type'] != 'alias':
                target['target'] = to_pid(target['target_type'],
                                          target['target'])
    elif obj_type == REVISION:
        data['id'] = to_pid(obj_type, data['id'])
        data['directory'] = to_pid(DIRECTORY, data['directory'])
        for key in ('date', 'committer_date'):
            data[key] = to_date(data[key])
        for parent in data['parents']:
            parent['id'] = to_pid(REVISION, parent['id'])
    elif obj_type == RELEASE:
        data['id'] = to_pid(obj_type, data['id'])
        data['date'] = to_date(data['date'])
        data['target'] = to_pid(data['target_type'], data['target'])
    elif obj_type == DIRECTORY:
        # the PID built from the first entry's dir_id is reused for every
        # entry of the listing
        dir_pid = None
        for entry in data:
            dir_pid = dir_pid or to_pid(obj_type, entry['dir_id'])
            entry['dir_id'] = dir_pid
            entry['target'] = to_pid(obj_type_of_entry_type(entry['type']),
                                     entry['target'])
    elif obj_type == CONTENT:
        pass  # nothing to do for contents
    else:
        raise ValueError(f'invalid object type: {obj_type}')

    return data
134
135
class WebAPIClient:
    """client for the Software Heritage archive Web API, see

    https://archive.softwareheritage.org/api/

    """

    def __init__(self, api_url='https://archive.softwareheritage.org/api/1'):
        """create a client for the Software Heritage Web API

        see: https://archive.softwareheritage.org/api/

        Args:
            api_url: base URL for API calls (default:
                "https://archive.softwareheritage.org/api/1")

        """
        api_url = api_url.rstrip('/')
        u = urlparse(api_url)

        self.api_url = api_url
        # path component of api_url, used to clean up pagination links
        self.api_path = u.path

    def _call(self, query: str, http_method: str = 'get',
              **req_args) -> requests.models.Response:
        """dispatcher for archive API invocation

        Args:
            query: API method to be invoked, rooted at api_url
            http_method: HTTP method to be invoked, one of: 'get', 'head'
            req_args: extra keyword arguments for requests.get()/.head()

        Returns:
            the HTTP response

        Raises:
            requests.HTTPError: if HTTP request fails and http_method is 'get'
            ValueError: if http_method is neither 'get' nor 'head'

        """
        url = '/'.join([self.api_url, query])

        if http_method == 'get':
            r = requests.get(url, **req_args)
            r.raise_for_status()
        elif http_method == 'head':
            # no raise_for_status() here: callers inspect the response status
            r = requests.head(url, **req_args)
        else:
            raise ValueError(f'unsupported HTTP method: {http_method}')

        return r

    def _next_query(self, r: requests.models.Response) -> Union[str, None]:
        """extract from a response the query for the next page of results

        Args:
            r: response to a paginated API call

        Returns:
            the query to pass to _call() to retrieve the next page, or None
            if the response carries no link to a next page

        """
        url = r.links.get('next', {}).get('url')
        if url is None:
            return None
        # XXX hackish URL cleaning while we wait for swh-web API to
        # consistently return complete URLs (a-la GitHub/GitLab) in Link
        # headers instead of absolute paths rooted at https://archive.s.o/
        # cf. https://forge.softwareheritage.org/T2147
        # _call() expects queries relative to api_url, so strip whichever
        # form of the API root the link starts with
        for prefix in (self.api_url, self.api_path):
            if url.startswith(prefix):
                return url[len(prefix):].lstrip('/')
        return url

    def get(self, pid: PIDish, **req_args) -> Any:
        """retrieve information about an object of any kind

        dispatcher method over the more specific methods content(),
        directory(), etc.

        note that this method will buffer the entire output in case of long,
        iterable output (e.g., for snapshot()), see the iter() method for
        streaming

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get(), forwarded
                to the specific method

        Raises:
            requests.HTTPError: if HTTP request fails

        """
        def full_snapshot(pid, **kwargs):
            # snapshot() streams (branch name, target) pairs: gather them all
            return dict(self.snapshot(pid, **kwargs))

        pid_ = _get_pid(pid)
        getters = {
            CONTENT: self.content,
            DIRECTORY: self.directory,
            RELEASE: self.release,
            REVISION: self.revision,
            SNAPSHOT: full_snapshot,
        }
        return getters[pid_.object_type](pid_, **req_args)

    def iter(self, pid: PIDish, **req_args) -> Generator[Any, None, None]:
        """stream over the information about an object of any kind

        streaming variant of get()

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get(), forwarded
                to the specific method

        Returns:
            an iterator over: the entries of a directory, the (branch name,
            target) pairs of a snapshot, or the single item describing an
            object of any other kind

        Raises:
            requests.HTTPError: if HTTP request fails
            ValueError: if the type of the object is invalid

        """
        pid_ = _get_pid(pid)
        obj_type = pid_.object_type
        if obj_type == SNAPSHOT:
            yield from self.snapshot(pid_, **req_args)
        elif obj_type == REVISION:
            yield self.revision(pid_, **req_args)
        elif obj_type == RELEASE:
            yield self.release(pid_, **req_args)
        elif obj_type == DIRECTORY:
            yield from self.directory(pid_, **req_args)
        elif obj_type == CONTENT:
            yield self.content(pid_, **req_args)
        else:
            raise ValueError(f'invalid object type: {obj_type}')

    def content(self, pid: PIDish, **req_args) -> Dict[str, Any]:
        """retrieve information about a content object

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Raises:
            requests.HTTPError: if HTTP request fails

        """
        return typify(
            self._call(f'content/sha1_git:{_get_pid(pid).object_id}/',
                       **req_args).json(),
            CONTENT)

    def directory(self, pid: PIDish, **req_args) -> List[Dict[str, Any]]:
        """retrieve information about a directory object

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Raises:
            requests.HTTPError: if HTTP request fails

        """
        return typify(
            self._call(f'directory/{_get_pid(pid).object_id}/',
                       **req_args).json(),
            DIRECTORY)

    def revision(self, pid: PIDish, **req_args) -> Dict[str, Any]:
        """retrieve information about a revision object

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Raises:
            requests.HTTPError: if HTTP request fails

        """
        return typify(
            self._call(f'revision/{_get_pid(pid).object_id}/',
                       **req_args).json(),
            REVISION)

    def release(self, pid: PIDish, **req_args) -> Dict[str, Any]:
        """retrieve information about a release object

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Raises:
            requests.HTTPError: if HTTP request fails

        """
        return typify(
            self._call(f'release/{_get_pid(pid).object_id}/',
                       **req_args).json(),
            RELEASE)

    def snapshot(self, pid: PIDish,
                 **req_args) -> Generator[Tuple[str, Any], None, None]:
        """retrieve information about a snapshot object

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Returns:
            an iterator over the branches of the snapshot, as (branch name,
            branch target) pairs, fetched one page of results at a time;
            pass it to dict() to obtain the full branch mapping

        Raises:
            requests.HTTPError: if HTTP request fails

        """
        query = f'snapshot/{_get_pid(pid).object_id}/'

        while query is not None:
            r = self._call(query, http_method='get', **req_args)
            yield from typify(r.json()['branches'], SNAPSHOT).items()
            query = self._next_query(r)

    def content_exists(self, pid: PIDish, **req_args) -> bool:
        """check if a content object exists in the archive

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.head()

        Returns:
            the truth value of the response to a HEAD request on the object;
            HTTP error statuses are reported this way, not by raising

        """
        return bool(self._call(f'content/sha1_git:{_get_pid(pid).object_id}/',
                               http_method='head', **req_args))

    def directory_exists(self, pid: PIDish, **req_args) -> bool:
        """check if a directory object exists in the archive

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.head()

        Returns:
            the truth value of the response to a HEAD request on the object;
            HTTP error statuses are reported this way, not by raising

        """
        return bool(self._call(f'directory/{_get_pid(pid).object_id}/',
                               http_method='head', **req_args))

    def revision_exists(self, pid: PIDish, **req_args) -> bool:
        """check if a revision object exists in the archive

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.head()

        Returns:
            the truth value of the response to a HEAD request on the object;
            HTTP error statuses are reported this way, not by raising

        """
        return bool(self._call(f'revision/{_get_pid(pid).object_id}/',
                               http_method='head', **req_args))

    def release_exists(self, pid: PIDish, **req_args) -> bool:
        """check if a release object exists in the archive

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.head()

        Returns:
            the truth value of the response to a HEAD request on the object;
            HTTP error statuses are reported this way, not by raising

        """
        return bool(self._call(f'release/{_get_pid(pid).object_id}/',
                               http_method='head', **req_args))

    def snapshot_exists(self, pid: PIDish, **req_args) -> bool:
        """check if a snapshot object exists in the archive

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.head()

        Returns:
            the truth value of the response to a HEAD request on the object;
            HTTP error statuses are reported this way, not by raising

        """
        return bool(self._call(f'snapshot/{_get_pid(pid).object_id}/',
                               http_method='head', **req_args))

    def content_raw(self, pid: PIDish,
                    **req_args) -> Generator[bytes, None, None]:
        """iterate over the raw content of a content object

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Returns:
            an iterator over chunks of the content, as received

        Raises:
            requests.HTTPError: if HTTP request fails

        """
        # _call() already raises on HTTP errors for 'get' requests
        r = self._call(f'content/sha1_git:{_get_pid(pid).object_id}/raw/',
                       stream=True, **req_args)

        yield from r.iter_content(chunk_size=None, decode_unicode=False)

    def visits(self, origin: str, per_page=None,
               **req_args) -> Generator[Dict[str, Any], None, None]:
        """list visits of an origin

        Args:
            origin: origin to list the visits of, interpolated as is in the
                API path (presumably the origin URL)
            per_page: the number of visits per page of results; if None, the
                parameter is not sent
            req_args: extra keyword arguments for requests.get()

        Returns:
            an iterator over visits of the origin

        Raises:
            requests.HTTPError: if HTTP request fails

        """
        params = '' if per_page is None else f"?per_page={per_page}"
        query = f'origin/{origin}/visits{params}'

        while query is not None:
            r = self._call(query, http_method='get', **req_args)
            yield from r.json()
            # follow the link to the next page, if any; reusing the same
            # query would fetch the first page over and over
            query = self._next_query(r)
449
450
def main() -> None:
    """dump as JSON the snapshots of all the visits of an origin

    command line arguments:

    - url: base URL of the web application, without the "/api/1" suffix
    - origin: origin to list the visits of

    the JSON document printed on standard output has two keys:

    - visits: for each visit with a snapshot, indexed by visit, the snapshot
      id, its branches and the visit date
    - evaluations: the union of the branches of all the snapshots sharing
      the same target for their 'evaluation' branch, indexed by the object
      id of that target

    Raises:
        requests.HTTPError: if an HTTP request fails
        KeyError: if a snapshot has no 'evaluation' branch

    """
    parser = argparse.ArgumentParser()
    parser.add_argument('url', type=str)
    parser.add_argument('origin', type=str)
    args = parser.parse_args()

    # a trailing slash on the base URL would yield a '//api/1' path, which
    # no longer matches the paths found in pagination links
    endpoint = args.url.rstrip('/') + "/api/1"

    c = WebAPIClient(api_url=endpoint)
    visits = {}
    evaluations = {}
    for v in c.visits(args.origin):
        snapshot = v['snapshot']
        if snapshot is None:
            continue

        branches = dict(c.snapshot(PID(object_type='snapshot',
                                       object_id=snapshot)))

        visits[v['visit']] = {
            'snapshot': snapshot,
            'branches': branches,
            'date': v['date'],
        }
        evaluation = branches['evaluation']['target'].object_id
        # merge into a dict of its own: storing branches itself would make
        # later merges also alter the branches recorded for this visit
        evaluations.setdefault(evaluation, {}).update(branches)

    print(json.dumps({'visits': visits, 'evaluations': evaluations}))


# only run when executed as a script, so that the module stays importable
if __name__ == '__main__':
    main()