· 5 years ago · Sep 24, 2020, 02:50 PM
@hug.post('/search', response_headers={'Access-Control-Allow-Origin': '*'})
def search(seq: hug.types.text,
           threshold: hug.types.float_number = 0.4,  # bigsi recommendation
           score: hug.types.smart_boolean = False):
    """
    Search a DNA sequence against the index of the human_gut using BIGSI.

    BIGSI stands for BItsliced Genomic Signature Index.

    :param seq: The fasta sequence to be searched on the index. It must
        contain a single record (at most one '>' header).
    :param threshold: Passed through to ``bigsi.search`` (default is 0.4).
    :param score: Passed through to ``bigsi.search`` (default is False).
    :returns: a dictionary with the cleaned query, threshold and results.
        The results are a list, sorted by ``percent_kmers_found`` in
        descending order, of objects with the following structure:
        {
            bigsi: {
                sample_name
                percent_kmers_found
                num_kmers
                num_kmers_found
            },
            mgnify: {
                id,
                attributes: -- MGnify genomes API data --
            }
        }
        ``mgnify`` is an empty object when the sample is not in the cache.
    :rtype: dict
    :raises falcon.HTTPBadRequest: if more than one fasta record is sent,
        if the sequence contains non-DNA characters, or if its length is
        outside the [MIN_LEN, MAX_LEN] range.
    """
    fasta_seq = _clean_fasta(seq)

    if seq.count('>') > 1:
        raise falcon.HTTPBadRequest('seq', 'Please provide just one DNA fasta sequence.')

    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    if not re.match(r'^[ATGCRYMKSWHBVDN\s]+$', fasta_seq, re.IGNORECASE):
        raise falcon.HTTPBadRequest('seq', 'The sequence doesn\'t appear to be a DNA sequence')

    if len(fasta_seq) > MAX_LEN or len(fasta_seq) < MIN_LEN:
        # Single f-string so that both limits are interpolated in the message.
        raise falcon.HTTPBadRequest(
            'seq',
            f'The sequence should be longer than {MIN_LEN} and shorter than {MAX_LEN}pb')

    best_matches = bigsi.search(fasta_seq, threshold, score)
    best_matches = sorted(best_matches, key=itemgetter('percent_kmers_found'), reverse=True)

    # Merge the data from the DB.
    storage = db.DB()
    # NOTE(review): DB_READ_COMMITTED is normally an isolation-level flag;
    # DB_RDONLY may have been intended here -- confirm before changing.
    storage.open(MGNIFY_CACHE_PATH, None, db.DB_HASH, db.DB_READ_COMMITTED)

    results = []
    try:
        for hit in best_matches:
            key = hit.get('sample_name')
            data = storage.get(key.encode('utf-8'))
            results.append({
                'mgnify': json.loads(data) if data else {},
                'bigsi': hit
            })
    finally:
        # Always release the DB handle, even if a lookup or JSON decode fails.
        storage.close()

    return {
        'query': fasta_seq,
        'threshold': threshold,
        'results': results
    }