8znf4U4u

· 6 years ago · Mar 23, 2020, 11:10 AM
1# Copyright (C) 2019  The Software Heritage developers
2# See the AUTHORS file at the top-level directory of this distribution
3# License: GNU General Public License version 3, or any later version
4# See top-level LICENSE file for more information
5
6"""Python client for the Software Heritage Web API
7
8Light wrapper around requests for the archive API, taking care of data
9conversions and pagination.
10
11>>> from webclient import WebAPIClient
12>>> cli = WebAPIClient()
13>>> cli.get('swh:1:rev:aafb16d69fd30ff58afdd69036a26047f3aebdc6')
14{'id': PersistentId(namespace='swh', scheme_version=1, object_type='revision',
15                    object_id='aafb16d69fd30ff58afdd69036a26047f3aebdc6',
16                    metadata={}),
17 'author': {
18  'fullname': 'Nicolas Dandrimont <nicolas.dandrimont@crans.org>',
19  'name': 'Nicolas Dandrimont',
20  'email': 'nicolas.dandrimont@crans.org'
21 },
22 'date': datetime.datetime(2014, 8, 18, 18, 18, 25,
23                           tzinfo=tzoffset(None, 7200)),
24 'committer': {
25  'fullname': 'Nicolas Dandrimont <nicolas.dandrimont@crans.org>',
26  'name': 'Nicolas Dandrimont',
27  'email': 'nicolas.dandrimont@crans.org'
28 },
29 'committer_date': datetime.datetime(2014, 8, 18, 18, 18, 25,
30                                     tzinfo=tzoffset(None, 7200)),
31 'type': 'git',
32 'directory': PersistentId(namespace='swh', scheme_version=1,
33                           object_type='directory',
34                           object_id='9f2e5898e00a66e6ac11033959d7e05b1593353b',
35                           metadata={}),
36 'message': "Merge branch 'master' into pr/584\n",
37 'metadata': {},
38 'synthetic': False,
39 'parents': [
40  {'id': PersistentId(namespace='swh', scheme_version=1,
41                      object_type='revision',
42                      object_id='26307d261279861c2d9c9eca3bb38519f951bea4',
43                      metadata={}),
44   'url': '/api/1/revision/26307d261279861c2d9c9eca3bb38519f951bea4/'},
45  {'id': PersistentId(namespace='swh', scheme_version=1,
46                      object_type='revision',
47                      object_id='37fc9e08d0c4b71807a4f1ecb06112e78d91c283',
48                      metadata={}),
49   'url': '/api/1/revision/37fc9e08d0c4b71807a4f1ecb06112e78d91c283/'}
50 ],
51 'merge': True,
52 'url': '/api/1/revision/aafb16d69fd30ff58afdd69036a26047f3aebdc6/',
53 'history_url': '/api/1/revision/aafb16d69fd30ff58afdd69036a26047f3aebdc6/log/',
54 'directory_url': '/api/1/directory/9f2e5898e00a66e6ac11033959d7e05b1593353b/'
55}
56
57"""
58
59from typing import Any, Dict, Generator, List, Union
60from urllib.parse import urlparse
61
62import dateutil.parser
63import requests
64
65from swh.model.identifiers import \
66    SNAPSHOT, REVISION, RELEASE, DIRECTORY, CONTENT
67from swh.model.identifiers import PersistentId as PID
68from swh.model.identifiers import parse_persistent_identifier as parse_pid
69
70
# A persistent identifier, given either as an already-parsed PID instance or
# in its string form (strings are parsed on demand by _get_pid() below).
PIDish = Union[PID, str]
72
73
def _get_pid(pidish: PIDish) -> PID:
    """Return *pidish* as a PID, parsing it first when given as a string.

    Non-string arguments are handed back untouched.

    """
    return parse_pid(pidish) if isinstance(pidish, str) else pidish
80
81
def typify(data: Any, obj_type: str) -> Any:
    """Type API responses using pythonic types where appropriate.

    The following conversions are performed, in place, on ``data``:

    - identifiers are converted from strings to PersistentId instances
    - timestamps are converted from strings to datetime.datetime objects

    Args:
        data: decoded JSON payload returned by the API for one object: a
            dict for revisions and releases, a dict of branches for
            snapshots, a list of entries for directories
        obj_type: type of the object ``data`` describes, one of SNAPSHOT,
            REVISION, RELEASE, DIRECTORY, CONTENT

    Returns:
        the same ``data`` object, after in-place conversion

    Raises:
        ValueError: if ``obj_type`` is not a known object type, or if a
            directory entry has an unknown entry type

    """
    def to_pid(object_type, s):
        return PID(object_type=object_type, object_id=s)

    def to_date(s):
        return dateutil.parser.parse(s)

    def obj_type_of_entry_type(s):
        if s == 'file':
            return CONTENT
        elif s == 'dir':
            return DIRECTORY
        elif s == 'rev':
            return REVISION
        else:
            raise ValueError(f'invalid directory entry type: {s}')

    if obj_type == SNAPSHOT:
        for target in data.values():
            # alias branches point to another branch *name*, not to an
            # archived object, so their target cannot be expressed as a
            # persistent identifier: leave it as a plain string
            if target['target_type'] != 'alias':
                target['target'] = to_pid(target['target_type'],
                                          target['target'])
    elif obj_type == REVISION:
        data['id'] = to_pid(obj_type, data['id'])
        data['directory'] = to_pid(DIRECTORY, data['directory'])
        for key in ('date', 'committer_date'):
            data[key] = to_date(data[key])
        for parent in data['parents']:
            parent['id'] = to_pid(REVISION, parent['id'])
    elif obj_type == RELEASE:
        data['id'] = to_pid(obj_type, data['id'])
        data['date'] = to_date(data['date'])
        data['target'] = to_pid(data['target_type'], data['target'])
    elif obj_type == DIRECTORY:
        # the PID built for the first entry's dir_id is reused for every
        # entry of the listing
        dir_pid = None
        for entry in data:
            dir_pid = dir_pid or to_pid(obj_type, entry['dir_id'])
            entry['dir_id'] = dir_pid
            entry['target'] = to_pid(obj_type_of_entry_type(entry['type']),
                                     entry['target'])
    elif obj_type == CONTENT:
        pass  # nothing to do for contents
    else:
        raise ValueError(f'invalid object type: {obj_type}')

    return data
134
135
class WebAPIClient:
    """Client for the Software Heritage archive Web API, see

    https://archive.softwareheritage.org/api/

    """

    def __init__(self, api_url='https://archive.softwareheritage.org/api/1'):
        """Create a client for the Software Heritage Web API.

        see: https://archive.softwareheritage.org/api/

        Args:
            api_url: base URL for API calls (default:
                "https://archive.softwareheritage.org/api/1")

        """
        api_url = api_url.rstrip('/')
        u = urlparse(api_url)

        self.api_url = api_url
        # path component of the base URL, used to relativize pagination links
        self.api_path = u.path

    def _call(self, query: str, http_method: str = 'get',
              **req_args) -> requests.models.Response:
        """Dispatcher for archive API invocation.

        Args:
            query: API method to be invoked, rooted at api_url
            http_method: HTTP method to be invoked, one of: 'get', 'head'
            req_args: extra keyword arguments for requests.get()/.head()

        Returns:
            the response object returned by requests

        Raises:
            requests.HTTPError: if HTTP request fails and http_method is 'get'
            ValueError: if http_method is neither 'get' nor 'head'

        """
        url = '/'.join([self.api_url, query])

        if http_method == 'get':
            r = requests.get(url, **req_args)
            r.raise_for_status()
        elif http_method == 'head':
            # no raise_for_status() here: callers inspect the status instead
            r = requests.head(url, **req_args)
        else:
            raise ValueError(f'unsupported HTTP method: {http_method}')

        return r

    def _next_query(self, response):
        """Return the query for the next page of results, or None if any.

        The "next" link advertised by the response is made relative to
        api_url, so that it can be passed back to _call().

        """
        next_url = response.links.get('next', {}).get('url')
        if next_url is None:
            return None

        # XXX hackish URL cleaning while we wait for swh-web API to
        # return complete URLs (a-la GitHub/GitLab) in Link headers
        # instead of absolute paths rooted at https://archive.s.o/
        # cf. https://forge.softwareheritage.org/T2147
        # Both forms are accepted: complete URLs under api_url, and absolute
        # paths under api_path.
        for prefix in (self.api_url, self.api_path):
            if next_url.startswith(prefix):
                return next_url[len(prefix):].lstrip('/')
        return next_url

    def _exists(self, query: str, **req_args) -> bool:
        """Tell whether a HEAD request on query is answered successfully.

        The truth value of the response object is used as the answer, so an
        HTTP error status is reported as False rather than raised.

        """
        return bool(self._call(query, http_method='head', **req_args))

    def get(self, pid: PIDish, **req_args) -> Any:
        """Retrieve information about an object of any kind.

        dispatcher method over the more specific methods content(),
        directory(), etc.

        note that this method will buffer the entire output in case of long,
        iterable output (e.g., for snapshot()), see the iter() method for
        streaming

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Raises:
            requests.HTTPError: if HTTP request fails

        """
        pid_ = _get_pid(pid)
        getters = {
            CONTENT: self.content,
            DIRECTORY: self.directory,
            RELEASE: self.release,
            REVISION: self.revision,
            SNAPSHOT: lambda p, **kwargs: dict(self.snapshot(p, **kwargs)),
        }
        return getters[pid_.object_type](pid_, **req_args)

    def iter(self, pid: PIDish, **req_args) -> Generator[Dict[str, Any],
                                                         None, None]:
        """Stream over the information about an object of any kind.

        streaming variant of get()

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Raises:
            requests.HTTPError: if HTTP request fails
            ValueError: if the object type of pid is not supported

        """
        pid_ = _get_pid(pid)
        obj_type = pid_.object_type
        if obj_type == SNAPSHOT:
            yield from self.snapshot(pid_, **req_args)
        elif obj_type == REVISION:
            yield self.revision(pid_, **req_args)
        elif obj_type == RELEASE:
            yield self.release(pid_, **req_args)
        elif obj_type == DIRECTORY:
            yield from self.directory(pid_, **req_args)
        elif obj_type == CONTENT:
            yield self.content(pid_, **req_args)
        else:
            raise ValueError(f'invalid object type: {obj_type}')

    def content(self, pid: PIDish, **req_args) -> Dict[str, Any]:
        """Retrieve information about a content object.

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Raises:
          requests.HTTPError: if HTTP request fails

        """
        return typify(
            self._call(f'content/sha1_git:{_get_pid(pid).object_id}/',
                       **req_args).json(),
            CONTENT)

    def directory(self, pid: PIDish, **req_args) -> List[Dict[str, Any]]:
        """Retrieve information about a directory object.

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Raises:
          requests.HTTPError: if HTTP request fails

        """
        return typify(
            self._call(f'directory/{_get_pid(pid).object_id}/',
                       **req_args).json(),
            DIRECTORY)

    def revision(self, pid: PIDish, **req_args) -> Dict[str, Any]:
        """Retrieve information about a revision object.

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Raises:
          requests.HTTPError: if HTTP request fails

        """
        return typify(
            self._call(f'revision/{_get_pid(pid).object_id}/',
                       **req_args).json(),
            REVISION)

    def release(self, pid: PIDish, **req_args) -> Dict[str, Any]:
        """Retrieve information about a release object.

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Raises:
          requests.HTTPError: if HTTP request fails

        """
        return typify(
            self._call(f'release/{_get_pid(pid).object_id}/',
                       **req_args).json(),
            RELEASE)

    def snapshot(self, pid: PIDish,
                 **req_args) -> Generator[Dict[str, Any], None, None]:
        """Retrieve information about a snapshot object.

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Returns:
            an iterator over the branches of the snapshot, as
            (branch name, branch information) pairs; result pages are
            fetched lazily, following "next" links

        Raises:
          requests.HTTPError: if HTTP request fails

        """
        query = f'snapshot/{_get_pid(pid).object_id}/'

        while query is not None:
            r = self._call(query, http_method='get', **req_args)
            yield from typify(r.json()['branches'], SNAPSHOT).items()
            query = self._next_query(r)

    def content_exists(self, pid: PIDish, **req_args) -> bool:
        """Check if a content object exists in the archive.

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.head()

        Returns:
            True if the HEAD request is answered successfully, False if it
            is answered with an HTTP error status

        """
        return self._exists(f'content/sha1_git:{_get_pid(pid).object_id}/',
                            **req_args)

    def directory_exists(self, pid: PIDish, **req_args) -> bool:
        """Check if a directory object exists in the archive.

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.head()

        Returns:
            True if the HEAD request is answered successfully, False if it
            is answered with an HTTP error status

        """
        return self._exists(f'directory/{_get_pid(pid).object_id}/',
                            **req_args)

    def revision_exists(self, pid: PIDish, **req_args) -> bool:
        """Check if a revision object exists in the archive.

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.head()

        Returns:
            True if the HEAD request is answered successfully, False if it
            is answered with an HTTP error status

        """
        return self._exists(f'revision/{_get_pid(pid).object_id}/',
                            **req_args)

    def release_exists(self, pid: PIDish, **req_args) -> bool:
        """Check if a release object exists in the archive.

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.head()

        Returns:
            True if the HEAD request is answered successfully, False if it
            is answered with an HTTP error status

        """
        return self._exists(f'release/{_get_pid(pid).object_id}/',
                            **req_args)

    def snapshot_exists(self, pid: PIDish, **req_args) -> bool:
        """Check if a snapshot object exists in the archive.

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.head()

        Returns:
            True if the HEAD request is answered successfully, False if it
            is answered with an HTTP error status

        """
        return self._exists(f'snapshot/{_get_pid(pid).object_id}/',
                            **req_args)

    def content_raw(self, pid: PIDish,
                    **req_args) -> Generator[bytes, None, None]:
        """Iterate over the raw content of a content object.

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Raises:
          requests.HTTPError: if HTTP request fails

        """
        # _call() already raises on HTTP errors for 'get' requests
        r = self._call(f'content/sha1_git:{_get_pid(pid).object_id}/raw/',
                       stream=True, **req_args)

        yield from r.iter_content(chunk_size=None, decode_unicode=False)

    def visits(self, origin: str, per_page=None,
               **req_args) -> Generator[Dict[str, Any], None, None]:
        """List visits of an origin.

        Args:
            origin: the origin whose visits are listed, as used in the
                origin/<origin>/visits API path
            per_page: the number of visits per_page
            req_args: extra keyword arguments for requests.get()

        Returns:
            an iterator over visits of the origin; result pages are fetched
            lazily, following "next" links

        Raises:
          requests.HTTPError: if HTTP request fails

        """
        params = ''
        if per_page is not None:
            params = f"?per_page={per_page}"
        query = f'origin/{origin}/visits{params}'

        while query is not None:
            r = self._call(query, http_method='get', **req_args)
            yield from r.json()
            # move on to the next page; without this the same page would be
            # requested over and over again
            query = self._next_query(r)
449
450
import argparse
import json


def main():
    """Dump the visits of an origin, and their snapshots, as JSON.

    Command line arguments:
        url: base URL of the archive instance ("/api/1" is appended to it)
        origin: the origin whose visits are listed

    A JSON document with two keys is printed on standard output:

    - 'visits': for each visit having a snapshot, the snapshot identifier,
      its branches, and the visit date
    - 'evaluations': branches of all visited snapshots, merged by the
      identifier targeted by their 'evaluation' branch

    Raises:
        KeyError: if a visited snapshot has no 'evaluation' branch

    """
    parser = argparse.ArgumentParser()
    parser.add_argument('url', type=str)
    parser.add_argument('origin', type=str)
    args = parser.parse_args()

    # tolerate a trailing slash in the base URL
    endpoint = args.url.rstrip('/') + "/api/1"

    client = WebAPIClient(api_url=endpoint)
    visits = {}
    evaluations = {}
    for visit in client.visits(args.origin):
        snapshot = visit['snapshot']
        if snapshot is None:
            # visit without snapshot: nothing to report
            continue

        branches = dict(client.snapshot(PID(object_type='snapshot',
                                            object_id=snapshot)))

        visits[visit['visit']] = {
            'snapshot': snapshot,
            'branches': branches,
            'date': visit['date'],
        }
        evaluation = branches['evaluation']['target'].object_id
        # merge into a dict of its own: storing `branches` itself would let
        # later merges leak into the 'branches' recorded for this visit
        evaluations.setdefault(evaluation, {}).update(branches)

    print(json.dumps({'visits': visits, 'evaluations': evaluations}))


# only run as a script, so that the module stays importable as a library
if __name__ == '__main__':
    main()
483
484import json
485print(json.dumps({'visits': visits, 'evaluations': evaluations}))