· 5 years ago · Feb 03, 2020, 01:34 PM
1#!/usr/bin/env python3
2import datetime
3import time
4import yaml
5import os
6import urllib
7import requests
8import json
9import pandas as pd
10import boto3
11
# Configuration file names; both are relative, so they are looked up in the
# current working directory of the process.
CONFIG_FILE = "config.cfg"
S3_CONFIG_FILE = "s3cfg.yml"

# Shared S3 client, created once when the module is imported.
client = boto3.client("s3")
16
class Breed:
    """Session state and helpers for talking to the Breed API.

    The class-level attributes below are the defaults for the per-session
    state (selected environment, species, partner, trait, dataset), for the
    configuration sections read from the config file, and for the DataFrames
    cached by the loader methods.
    """

    ### Private Variables ###
    # Current selections.
    __environment = None
    __api_url = None
    __onboarding_api_url = None
    __species_id = None
    __species_name = None
    __partner_id = None
    __partner_name = None
    __trait_id = None
    __dataset_id = None
    __trait_name = None
    # Cached data frames.
    __speciesDF = None
    __partnersDF = None
    __UOMlistDF = None
    __genotype_dataDF = None
    __phenotypesDF = None
    __onboarding_eventsDF = None
    __onboarding_event_raw_dataDF = None
    __onboarding_event_trait_raw_dataDF = None
    __onboarding_projectsDF = None
    # Sections of the configuration file.
    __end_point_paths = None
    __api_urls = None
    __onboarding_api_urls = None
    __onboarding_comp_bio_microservice_api_url = None
    __onboarding_comp_bio_microservice_api_urls = None
    __s3buckets = None
    # Evaluated once, when the class body is executed. The original line ended
    # with a stray comma, which stored a one-element tuple instead of a datetime.
    __current_time = datetime.datetime.now()
    __species_germplasmsDF = None
    __user_name = None
    __germplasm_groupsDF = None
48
49
def __init__(self, user_name=None, env=None, specie_id=None, partner_id=None, trait_id=None, dataset_id=None):
    """Load the configuration and apply every setting that was passed in.

    Settings are applied in a fixed order: user, environment, specie,
    partner, dataset and finally trait. A falsy argument is skipped.
    """
    # End point paths and the per-environment URLs come from the config file.
    self.__load_configurations()
    # self.__load_s3_configurations()
    for value, setter in (
        (user_name, self.set_user),
        (env, self.set_environment),
        (specie_id, self.set_specie),
    ):
        if value:
            setter(value)
    if partner_id:
        print('partner_id___________________',partner_id)
        self.set_partner(partner_id)
    # The original comments mark the dataset id and the trait id as required.
    for value, setter in (
        (dataset_id, self.set_dataset_id),
        (trait_id, self.set_trait),
    ):
        if value:
            setter(value)
69
70 # def private(self):
71
def __get_configurations(self):
    """Parse CONFIG_FILE as YAML and return the result."""
    # The settings used to live in a separate YAML file; because this package
    # has to be deployed, they are kept in a local file instead.
    with open(CONFIG_FILE) as handle:
        return yaml.load(handle, Loader=yaml.FullLoader)
78
def __load_configurations(self):
    """Copy the sections of the configuration file onto the instance.

    Prints the parsed configuration, then stores the end point paths, the
    API URLs, the S3 buckets and both onboarding URL tables. When no
    configuration is returned, an error is printed and nothing is stored.
    """
    print('----------------')
    settings = self.__get_configurations()
    if settings:
        print(settings)
        self.__end_point_paths = settings.get('endPointPaths')
        self.__api_urls = settings.get('apiUrls')
        self.__s3buckets = settings.get('s3buckets')
        self.__onboarding_api_urls = settings.get('onboardingApiUrls')
        self.__onboarding_comp_bio_microservice_api_urls = settings.get('onboardingCompBioMicroserviceApiUrls')
        return
    print("Error: attempt to get configuation from .loadLocalConfigurations failed")
91
def __load_s3_configurations(self):
    """Export the AWS credentials stored in S3_CONFIG_FILE to the environment.

    Sets AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and AWS_DEFAULT_REGION in
    os.environ. Prints an error and returns without touching the environment
    when the file is empty or a required key is missing.
    """
    with open(S3_CONFIG_FILE) as f:
        # NOTE(review): FullLoader is acceptable for a trusted local file only;
        # do not point this at externally supplied YAML.
        config = yaml.load(f, Loader=yaml.FullLoader)
    if not config:
        print(f"Error: no configuration found in {S3_CONFIG_FILE}")
        return
    # The parsed YAML is a mapping, so membership (not hasattr) is the right test.
    for key in ("access_key", "secret_key"):
        if key not in config:
            print(f"Error: expected to find {key} in {S3_CONFIG_FILE} but did not find it")
            return
    # os.environ only accepts strings.
    os.environ["AWS_ACCESS_KEY_ID"] = str(config["access_key"])
    os.environ["AWS_SECRET_ACCESS_KEY"] = str(config["secret_key"])
    os.environ["AWS_DEFAULT_REGION"] = "us-east-1"

    print('\nS3config file found and successfully loaded')
106
def __validate_internal_settings(self):
    """Report whether every required internal setting has been set.

    Returns True when all settings listed below are set; otherwise prints
    the names of the missing ones and returns False. The original stub
    always returned None, which its caller treats as a failed validation.
    """
    # These are expected to be set by the set_* methods.
    required = {
        "speciesId": self.__species_id,
        "speciesName": self.__species_name,
        "partnerId": self.__partner_id,
        "partnerName": self.__partner_name,
        "userName": self.__user_name,
        "environment": self.__environment,
        "apiUrl": self.__api_url,
        "datasetId": self.__dataset_id,
        "onboardingApiUrl": self.__onboarding_api_url,
    }
    # `is None` rather than truthiness: ids may legitimately be numeric values.
    missing = [name for name, value in required.items() if value is None]
    if missing:
        print(f"Error: the following internal settings have not been set: {', '.join(missing)}")
        return False
    return True
112
113
def __update_url_path(self, url, path):
    """Return the API URL for `path`.

    Args:
        url: base URL; it is parsed, but every component is then replaced.
        path: end point path to place in the result.

    Returns:
        The rebuilt URL as a string.
    """
    # Imported here because a bare `import urllib` does not load the `parse` submodule.
    from urllib.parse import urlparse

    # NOTE(review): scheme and host are hard-coded, so the environment specific
    # part of `url` is discarded - confirm this override is still wanted.
    rebuilt = urlparse(url)._replace(
        path=path,
        scheme='http',
        netloc='cropos-breed-api.us-east-1.elasticbeanstalk.com',
        params='',
        query='',
        fragment='',
    )
    return rebuilt.geturl()
120
def __validated_api_end_point_response(self, api_end_point):
    """GET `api_end_point` and return its decoded JSON body.

    Returns:
        The decoded JSON, or None when the status code is not 200 or the
        body is not valid JSON (an error is printed in both cases).
    """
    resp = requests.get(api_end_point)
    if resp.status_code != 200:
        print(f"Error: attempt to connect to apiEndPoint {api_end_point} failed with status: {resp.status_code}")
        return None
    try:
        return resp.json()
    except ValueError:
        # JSON decoding errors are ValueError subclasses; anything else should surface.
        print(f"Error: {api_end_point} did not return json data")
        return None
130
def __validate_url(self, url):
    """Issue a GET request to `url` and return the response.

    Returns:
        The response object, or None when the request itself failed. The
        original raised UnboundLocalError in that case because `resp` was
        never assigned.
    """
    print(f"\nValidating URL: {url}")
    try:
        return requests.get(url)
    except requests.exceptions.RequestException as e:
        print(f"URL does not seem to exist: {url}", e)
        return None
140
def __validate_dataframe_has_content(self, df):
    """Return True when `df` is a data frame with at least one row and one column.

    A None value is treated as "no content" instead of raising
    AttributeError. An error describing the problem is printed whenever
    False is returned.
    """
    if df is None:
        print("Error: attempt to validate data frame returned null")
        return False
    if not df.shape[1]:
        print("Error: data frame does not contain any columns")
        return False
    if not df.shape[0]:
        print("Error: data frame does not contain any rows")
        return False
    return True
152
def set_user(self, user_name):
    """Store the user name, prompting for it when none is passed in.

    Prints whether the user was set for the first time or replaced a
    previous one. The original overwrote the stored name before comparing,
    so it always reported an update from the new name to itself.
    """
    previous = self.__user_name
    if not user_name:
        user_name = input("Please enter user Name: ")
    if previous:
        print(f"User updated from: {previous} to {user_name}")
    else:
        print(f"User set to: {user_name}")
    self.__user_name = user_name
163
def set_specie(self, specie_id):
    """Load the available species and select one of them.

    Args:
        specie_id: id of the specie to select. When falsy the user is shown
            the available species and gets up to 5 attempts to type an id.

    The species list is stored on the instance. On success the selected id
    is stored as a string together with the specie name. Ids are compared
    by their text form, so 4 and "4" select the same row.
    """
    path = (self.__end_point_paths or {}).get('SpeciesPath')
    if not path:
        print("Error: failed to get 'SpeciesPath' end point path from config file")
        return
    # Species can vary from environment to environment, so warn when none is set.
    if not self.__api_url:
        print("Environment has not been set internally. Please run $setEnvironment() function to set it")
    url = self.__update_url_path(self.__api_url, path)
    resp = self.__validated_api_end_point_response(url)
    self.__speciesDF = pd.DataFrame(resp)
    if not self.__validate_dataframe_has_content(self.__speciesDF):
        print(f"Error: failed to get species from Breed end point: {path}")
        return

    ids_as_text = self.__speciesDF["Id"].astype(str)
    interactive = not specie_id
    max_attempts = 5 if interactive else 1
    for _ in range(max_attempts):
        if interactive:
            print("\nAvailable species:")
            print(self.__speciesDF[["Name", "Id"]])
            specie_id = input("Please select specie Id from above: ")
        row = self.__speciesDF[ids_as_text == str(specie_id).strip()]
        matches = row.shape[0]
        if matches == 1:
            self.__species_id = str(row['Id'].iloc[0])
            self.__species_name = row['Name'].iloc[0]
            print(f"Specie set to: {self.__species_id}")
            return
        if matches > 1:
            print(f"Error: id {specie_id} returned multiple species")
            return
        print(f"Error: specie id {specie_id} is invalid, please try again")
    if interactive:
        print("You have exhausted 5 attempts to set specie correctly")
217
def list_species(self):
    """Return the loaded species data frame, or None when it was never loaded."""
    # `not df` raises ValueError for a DataFrame, so compare against None.
    if self.__speciesDF is None:
        print("ERROR: list of available species have not been loaded")
        return None
    return self.__speciesDF
223
def list_current_specie(self):
    """Print the name and id of the selected specie, or an error when either is unset."""
    name, identifier = self.__species_name, self.__species_id
    if name and identifier:
        print(f"Name: {name} Id: {identifier}")
        return
    print("Error: failed to validate Breed internal settings.")
229
def list_current_specie_name(self):
    """Return the stored specie name."""
    current_name = self.__species_name
    return current_name
232
def set_environment(self, environment):
    """Select the environment and resolve its API URLs.

    Args:
        environment: environment name. When falsy the user gets up to 5
            attempts to type a valid one.

    The API URL is validated up to 5 times, waiting between attempts. On
    success the API URL and both onboarding URLs are stored. The original
    never stored the API URL and referenced an undefined loop variable in
    the retry message.
    """
    valid_environments = ['uat', 'uat_v2', 'dev', 'dev_v2', 'prod', 'demo', 'dev2']
    if environment:
        if environment not in valid_environments:
            print(f"Error: invalid environment: {environment} valid environments are: {valid_environments}")
            return
    else:
        for _ in range(5):
            environment = input(f"Please select environment{valid_environments}:")
            if environment in valid_environments:
                break
            print(f"Error: invalid environment: {environment} valid environments are: {valid_environments}")
        else:
            print("You have exhausted 5 attempts to set environment correctly")
            return

    try:
        api_url = self.__api_urls[environment]
        onboarding_api_url = self.__onboarding_api_urls[environment]
        onboarding_comp_bio_microservice_api_url = self.__onboarding_comp_bio_microservice_api_urls[environment]
    except (KeyError, TypeError):
        # TypeError covers url tables that were never loaded (still None).
        print(f"Error: no api urls are configured for environment: {environment}")
        return
    self.__environment = environment

    # URL validation can fail if the ec2 housing the api is stopped or because
    # of other traffic related issues, so it is retried a few times.
    sleep_time_secs = 600
    max_validation_attempts = 5
    for attempt in range(1, max_validation_attempts + 1):
        if self.__validate_url(api_url):
            break
        print(f"Error: validation attempt {attempt} of {max_validation_attempts} failed for url: {api_url}")
        # No point in waiting after the final attempt.
        if attempt < max_validation_attempts:
            time.sleep(sleep_time_secs)
    else:
        print(f"Error: exiting after {max_validation_attempts} attempts to validate url: {api_url}")
        return
    self.__api_url = api_url
    self.__onboarding_api_url = onboarding_api_url
    self.__onboarding_comp_bio_microservice_api_url = onboarding_comp_bio_microservice_api_url
    print(f"STATUS: environment set to {self.__environment}")
271
def list_current_environment(self):
    """Return the stored environment name."""
    selected = self.__environment
    return selected
274
def set_dataset_id(self, dataset_id):
    """Store the dataset id, asking the user for one when a falsy value is passed."""
    self.__dataset_id = dataset_id
    needs_prompt = not dataset_id
    if needs_prompt:
        self.__dataset_id = input("Please enter a datasetId: ")
    message = f"DatasetId is set to {self.__dataset_id}"
    print(message)
280
def set_trait(self, trait_id):
    """Select the trait with the given id from the loaded phenotypes.

    Args:
        trait_id: id of the trait; it must be convertible to int.

    Phenotypes are loaded through load_phenotypes() when none are cached.
    On success the trait id (as passed in) and the trait name (a scalar,
    not a Series) are stored on the instance.
    """
    # make sure trait id is passed in
    if not trait_id:
        print("Error: you must pass in a valid trait id")
        return
    try:
        wanted_id = int(trait_id)
    except (TypeError, ValueError):
        print(f"Error: trait id {trait_id} is not a valid integer")
        return

    # make sure phenotypes are loaded; `not df` would raise for a DataFrame
    if self.__phenotypesDF is None:
        loaded = pd.DataFrame(self.load_phenotypes())
        if loaded.empty:
            # Do not cache a failed load, so a later call can try again.
            print(f"Error: no phenotypes could be loaded for specie {self.__species_name}")
            return
        self.__phenotypesDF = loaded

    phenotypes = self.__phenotypesDF
    if phenotypes.empty or "Id" not in phenotypes.columns:
        print(f"Error: no phenotypes are available for specie {self.__species_name}")
        return

    # make sure trait id is valid
    names = phenotypes.loc[phenotypes["Id"] == wanted_id, "Name"]
    if names.empty:
        print(f"Error: trait id {trait_id} is not valid for specie {self.__species_name}")
        return
    trait_name = names.iloc[0]
    print(f"Successfully set trait to {trait_name}")
    self.__trait_id = trait_id
    self.__trait_name = trait_name
299
def set_partner(self, partner_id):
    """Load the available partners and select one of them.

    Args:
        partner_id: id of the partner to select. When falsy the user is
            shown the available partners and gets up to 5 attempts to type
            an id.

    The partner list is stored on the instance. On success the id (taken
    from the matching row) and the partner name are stored. Ids are
    compared by their text form, so input that is not a number is reported
    as invalid instead of raising.
    """
    path = (self.__end_point_paths or {}).get('GetPartnersPath')
    if not path:
        print("Error: failed to get 'GetPartnersPath' from config file")
        return
    if not self.__api_url:
        print("Environment has not been set internally in Partner. Please run $setEnvironment() function to set it")
    url = self.__update_url_path(self.__api_url, path)
    resp = self.__validated_api_end_point_response(url)
    self.__partnersDF = pd.DataFrame(resp)
    if not self.__validate_dataframe_has_content(self.__partnersDF):
        print(f"Error: failed to get partners from breed end point: {path}")
        return

    ids_as_text = self.__partnersDF["Id"].astype(str)
    interactive = not partner_id
    max_attempts = 5 if interactive else 1
    for _ in range(max_attempts):
        if interactive:
            print("\nAvailable partners:")
            # The original printed the species frame here by mistake.
            print(self.__partnersDF[["Name", "Id", "Active"]])
            partner_id = input("Please select partner Id from above: ")
        row = self.__partnersDF[ids_as_text == str(partner_id).strip()]
        matches = row.shape[0]
        if matches == 1:
            # .iloc is positional; plain [0] is a label lookup and fails
            # unless the matching row happens to carry index label 0.
            self.__partner_id = row["Id"].iloc[0]
            self.__partner_name = row['Name'].iloc[0]
            print(f"Partener set to: {self.__partner_id}")
            return
        if matches > 1:
            print(f"Error: id {partner_id} returned multiple partners")
            return
        print(f"Error: partner id {partner_id} is invalid, please try again")
    if interactive:
        print("You have exhausted 5 attempts to set partner correctly")
354
def list_partenrs(self):
    """Print the loaded partners, or an error when they were never loaded."""
    # `not df` raises ValueError for a DataFrame, so compare against None.
    if self.__partnersDF is None:
        print("Error: partners have not been loaded. Please run setPartner() to load them")
        return
    print(self.__partnersDF)
360
def list_current_partner(self):
    """Print the name and id of the selected partner, or which of the two is unset."""
    problem = None
    if not self.__partner_id:
        problem = "Error: partnerId is not internally set"
    elif not self.__partner_name:
        problem = "Error: partnerName is not internally set"
    if problem is None:
        print(f"Name: {self.__partner_name}, Id: {self.__partner_id}")
    else:
        print(problem)
369
def list_current_partner_name(self):
    """Return the stored partner name."""
    current_name = self.__partner_name
    return current_name
372
def load_phenotypes(self):
    """Fetch the phenotypes of the selected partner and dataset.

    Requires the specie, the partner and the dataset id to be set.

    Returns:
        The decoded API response, or None when a prerequisite is missing
        or the request failed (an error is printed in each case).
    """
    path = (self.__end_point_paths or {}).get('ListPhenotypesPath')
    if not path:
        print("Error: failed to get 'ListPhenotypesPath' end point path from config file")
        return
    if not self.__species_id:
        print("Error: species has not been set. Please run setSpecie() function to set it")
        return
    if not self.__partner_id:
        print("Error: partner has not been set. Please run setPartner() to set it")
        return
    if not self.__dataset_id:
        print("Error: DatasetId has not been set. Please run setDatasetId() to set it")
        return

    url = self.__update_url_path(self.__api_url, f"{path}/{self.__partner_id}/{self.__dataset_id}")
    print(f"Phenotype access url is {url}")
    resp = self.__validated_api_end_point_response(url)
    if resp is None:
        # Success used to be reported even when the request had failed.
        print(f"Error: failed to load phenotypes for specie {self.__species_name}")
        return
    print(f"\nSuccessfully loaded phenotypes for specie {self.__species_name}")
    return resp
399
def list_api_url(self):
    """Return the stored API URL, printing an error first when it is unset."""
    url = self.__api_url
    if url:
        return url
    print("ERROR: api-url has not been set")
    return url
404
def load_genotype_data(self, s3dir=None):
    """List the S3 objects below `s3dir` in the bucket of the current environment.

    The listing is not processed yet (see the TODO), so nothing is stored
    or returned.
    """
    # FIXME: ideally the internally set specie/partner/event ids would be used
    # to derive the s3 path, but what is in the s3 buckets does not necessarily
    # match what is in the breed DB, so the user supplies the path.
    # 4/3/1 works in dev
    if s3dir:
        bucket_name = self.__s3buckets[self.__environment]
        # NOTE(review): the original comment spoke of a maximum of 100_000
        # objects while MaxKeys is 500000 - confirm the intended limit.
        s3objs = client.list_objects(Bucket=bucket_name, Prefix=s3dir, MaxKeys=500000)
        # TODO: continue this implementation
        return
    print("Error: you must supply an s3 dir path like \"4/6/10\" to this function")
419
420
421
422
def list_genotype_data(self):
    """Return the loaded genotype data frame, or None when it was never loaded."""
    # `not df` raises ValueError for a DataFrame, so compare against None.
    if self.__genotype_dataDF is None:
        print("Error: genotype data has not been loaded. Please run loadGenotypeData() function to load it first")
        return None
    return self.__genotype_dataDF
428
def load_germplasms(self):
    """Fetch the germplasms of the current dataset and cache them.

    Returns:
        The germplasm data frame, or None when the end point path or the
        dataset id is missing or the end point returned no data.
    """
    germplasm_path = (self.__end_point_paths or {}).get('GermplasmsPath')
    if not germplasm_path:
        print("Error: failed to get 'GermplasmsPath' end point path from config file")
        return
    if not self.__dataset_id:
        # Without this guard the request went out for ".../None".
        print("Error: DatasetId has not been set. Please run setDatasetId() to set it")
        return
    print(f"Loading germplasms for datasetId {self.__dataset_id}")
    url = self.__update_url_path(self.__api_url, f"{germplasm_path}/{self.__dataset_id}")
    resp = self.__validated_api_end_point_response(url)
    df = pd.DataFrame(resp)
    if not self.__validate_dataframe_has_content(df):
        print(f"No germplasms found for datasetId: {self.__dataset_id}")
        return

    self.__species_germplasmsDF = df
    print(f"Successfully loaded {df.shape[0]} germplasms for partner: {self.__partner_name} datasetId: {self.__dataset_id}")
    return self.__species_germplasmsDF
450
def list_germplasms(self):
    """Return the cached germplasms, or None when none are loaded or the cache is empty."""
    germplasms = self.__species_germplasmsDF
    # None means "never loaded"; the original raised AttributeError here.
    if germplasms is None:
        print("Error: germplasms for species have not been loaded. Please run loadGermplasms() function")
        return None
    if germplasms.empty:
        print(f"No germplasms found for datasetId: {self.__dataset_id}")
        return None
    return germplasms
459
def remove_stored_germplasms(self):
    """Drop the cached germplasm data frame."""
    # The original guarded this with `if df:`, which raises ValueError for a
    # DataFrame; resetting unconditionally has the same effect without the test.
    self.__species_germplasmsDF = None
463
def list_access_groups(self):
    """Fetch the access groups and print their Name, Id and Active columns."""
    path = (self.__end_point_paths or {}).get('AccessGroupListPath')
    if not path:
        print("Error: failed to get AccessGroupListPath end point path from config file")
        return
    url = self.__update_url_path(self.__api_url, path)
    resp = self.__validated_api_end_point_response(url)
    groups = pd.DataFrame(resp)
    if groups.empty:
        # A failed request yields an empty frame; selecting columns from it raises KeyError.
        print(f"Error: failed to get access groups from end point: {path}")
        return
    print(groups[["Name", "Id", "Active"]])
473
def list_onboarding_event_raw_data(self, event_id):
    """Download the raw phenotype data of one onboarding event.

    Args:
        event_id: id of an onboarding event whose UploadType is "Phenotype".

    Returns:
        The downloaded data frame (also cached on the instance), or None
        when a check fails or the request does not succeed.
    """
    if not event_id:
        print("Error: onboardingEventId must be passed in. Please run listOnboardingEvents() to see a list of valid events")
        return None

    events = self.__onboarding_eventsDF
    if events is None or events.empty:
        print("Error: onboarding events have not been set. Please run listOnboardingEvents() function")
        return None

    # Element-wise masks must be combined with `&`; `and` raises ValueError on a Series.
    is_target = (events["Id"].astype(str) == str(event_id)) & (events["UploadType"] == "Phenotype")
    if not is_target.any():
        print(f"Error: failed to find 'Phenotype' upload event for id {event_id}")
        return None

    path = (self.__end_point_paths or {}).get('GetRawDataByEventIdPath')
    if not path:
        print("Error: failed to get 'GetRawDataByEventIdPath' from config file")
        return None

    print("Loading data .. please be patient .. this may be slow ..")
    url = self.__update_url_path(self.__api_url, f"{path}{event_id}")
    resp = requests.get(url, timeout=3)
    if resp.status_code != 200:
        print(f"Error: attempt to connnect to url failed with status {resp.status_code}")
        return None
    try:
        raw_data = pd.DataFrame(resp.json())
    except ValueError:
        print("Error: onboardingEvent did not return json data")
        return None

    # NOTE(review): a single row is treated as "no data", as in the original
    # check; presumably such a response only carries a message - confirm.
    if raw_data.shape[0] <= 1:
        print("Error: onboardingEvent did not return any raw data")
        return None
    self.__onboarding_event_raw_dataDF = raw_data
    print(f"Loaded {raw_data.shape[0]} phenotypes raw data")
    return raw_data
499
500
def load_partner_raw_data(self):
    """Download the raw data of every "Phenotype" onboarding event.

    Requires the partner and the specie to be set. Onboarding events are
    loaded through list_onboarding_events() when none are cached. Each
    downloaded frame is appended to the raw data cached on the instance;
    when data is already cached the user is asked whether to discard it.
    """
    path = (self.__end_point_paths or {}).get("GetRawDataByEventIdPath")
    if not path:
        print("Error: failed to get 'GetRawDataByEventIdPath' from config file")
        return
    if not self.__partner_name:
        print("Error: partner is not set. Please run setPartner() function to set it")
        return
    if not self.__species_id:
        print("Error: specie is not set. Please run setSpecie() to set it")
        return

    # `not df` raises ValueError for a DataFrame, so compare against None.
    if self.__onboarding_eventsDF is None:
        self.list_onboarding_events()
    events = self.__onboarding_eventsDF
    if events is None or events.empty:
        print(f"No onboarding events found for specie: {self.__species_name} and partner: {self.__partner_name}")
        return
    phenotypesDF = events[events["UploadType"] == "Phenotype"]
    if not phenotypesDF.shape[0]:
        print(f"No 'phenotype' UploadType found for specie: {self.__species_name} and partner: {self.__partner_name}")
        return
    onboarding_event_ids = set(phenotypesDF["Id"])
    print(f"Found {len(onboarding_event_ids)} phenotype event(s) to download data from")
    print("Downloading data. Please be patient, this could take a few minutes.")

    if self.__onboarding_event_raw_dataDF is not None:
        reply = input("There is already raw data stored internally. Would you like to delete this data and continue?")
        if reply.lower() != 'yes':
            return
        self.__onboarding_event_raw_dataDF = None

    for event_id in onboarding_event_ids:
        # f-string so that numeric event ids work as well.
        url = self.__update_url_path(self.__api_url, f"{path}{event_id}")
        print(url)
        # FIXME: the original note spoke of a 5 minute time out while the
        # value is 6 seconds - confirm which one is intended.
        resp = requests.get(url, timeout=6)
        if resp.status_code != 200:
            print(f"Error: attempt to connnect to url failed with status {resp.status_code}")
            return
        try:
            # Build the frame from the decoded body, not from the response object.
            df = pd.DataFrame(resp.json())
        except ValueError:
            print(f"Error: end point {path} did not return json data")
            return
        # Validate the frame that was just downloaded (the original checked the species frame).
        if not self.__validate_dataframe_has_content(df):
            print(f"Error: end point {path} did not return any data")
            return
        print(f"Downloaded {df.shape[0]} phenotype data")
        if self.__onboarding_event_raw_dataDF is None:
            self.__onboarding_event_raw_dataDF = df
        else:
            self.__onboarding_event_raw_dataDF = pd.concat([self.__onboarding_event_raw_dataDF, df])
546
def load_partner_trait_raw_data(self):
    """Download, page by page, the raw data of the selected trait.

    Requires the specie, partner, dataset id and trait id to be set. Every
    "Phenotype" onboarding event is queried in pages of 50000 rows, and
    the pages are appended to the trait raw data cached on the instance.
    An event whose request fails is skipped.
    """
    path = (self.__end_point_paths or {}).get("getRawDataByEventIdAndTraitPath")
    if not path:
        print("Error: failed to get 'getRawDataByEventIdAndTraitPath' end point path")
        return
    if not self.__species_id:
        print("Error: species has not been set. Please run setSpecie() function to set it")
        return
    if not self.__partner_id:
        print("Error: partner has not been set. Please run setPartner() to set it")
        return
    if not self.__dataset_id:
        print("Error: DatasetId has not been set. Please run setDatasetId() to set it")
        return
    if not self.__trait_id:
        print("Error: traitId has not been set. Please run setTrait(<traitId>) to set it")
        return

    # `not df` raises ValueError for a DataFrame, so compare against None.
    if self.__onboarding_eventsDF is None:
        self.list_onboarding_events()
    events = self.__onboarding_eventsDF
    if events is None or events.empty:
        print(f"No onboarding events found for specie: {self.__species_name} and partner: {self.__partner_name}")
        return

    phenotypesDF = events[events["UploadType"] == "Phenotype"]
    if not phenotypesDF.shape[0]:
        print(f"No 'phenotype' UploadType found for specie: {self.__species_name} and partner {self.__partner_name}")
        return

    onboarding_event_ids = set(phenotypesDF["Id"])
    print(f"Found {len(onboarding_event_ids)} phenotype event(s) to download data from")
    print("Downloading data. Please be patient, this could take a few minutes.")

    if self.__onboarding_event_trait_raw_dataDF is not None:
        reply = input("There is already raw data stored internally. Would you like to delete this data and continue?")
        if reply.lower() != 'yes':
            return
        self.__onboarding_event_trait_raw_dataDF = None

    size = 50000
    for event_id in onboarding_event_ids:
        page_num = 1
        while True:
            url = self.__update_url_path(self.__api_url, f"{path}{event_id}/{self.__trait_id}/{page_num}/{size}")
            print(url)
            resp = requests.get(url, timeout=3)
            if resp.status_code != 200:
                print(f"Error: attempt to connnect to url {url} failed with status {resp.status_code}")
                print(f"Warning: skipping onboarding id {event_id}")
                break
            try:
                # Build the frame from the decoded body, not from the response object.
                df = pd.DataFrame(resp.json())
            except ValueError:
                print(f"Warning: skipping onboarding id {event_id}: response is not json data")
                break
            if df.shape[0] == 0:
                # An empty later page only means the previous page was the last one.
                if page_num == 1:
                    print(f"Warning: no trait phenotype data found in onboarding event {event_id}")
                break

            print(f"Downloaded {df.shape[0]} phenotype data for trait {self.__trait_name}")
            if self.__onboarding_event_trait_raw_dataDF is None:
                self.__onboarding_event_trait_raw_dataDF = df
            else:
                self.__onboarding_event_trait_raw_dataDF = pd.concat([self.__onboarding_event_trait_raw_dataDF, df])
            if df.shape[0] < size:
                print(f"Successfully completed {page_num} pages of download")
                break
            # Advance only after the page was processed, so the count above is exact.
            page_num += 1

    downloaded = self.__onboarding_event_trait_raw_dataDF
    total_rows = 0 if downloaded is None else downloaded.shape[0]
    print(f"Successfully downloaded {total_rows} rows of data trait")
610
def list_trait_raw_data(self):
    """Return the cached trait raw data, or None when there is none.

    The original condition was inverted: it reported an error exactly when
    data was present.
    """
    trait_data = self.__onboarding_event_trait_raw_dataDF
    if trait_data is None or not self.__validate_dataframe_has_content(trait_data):
        print("Error: no trait raw data has been loaded")
        return None
    return trait_data
616
def remove_stored_trait_raw_data(self):
    """Drop the cached trait raw data."""
    # The original guarded this with `if df:`, which raises ValueError for a
    # DataFrame; resetting unconditionally has the same effect without the test.
    self.__onboarding_event_trait_raw_dataDF = None
620
def list_raw_data_phenotypes(self):
    """Print the values of the "Phenotype" column of the cached raw data."""
    raw_data = self.__onboarding_event_raw_dataDF
    # `not df` raises ValueError for a DataFrame, so test explicitly.
    if raw_data is None or raw_data.empty:
        print("Error: no phenotype raw data have been loaded")
        return
    print("Following phenotypes were found")
    # str() keeps the join working when a value is not a string.
    phenotypes = " ".join(str(value) for value in raw_data["Phenotype"])
    print(phenotypes)
628
def filter_raw_data_for_phonotypes(self, trait):
    """Reduce the cached raw data to the rows of one phenotype.

    Args:
        trait: value of the "Phenotype" column to keep.

    The cached raw data is replaced by the filtered rows. Nothing changes
    when no raw data is loaded or `trait` does not occur in it.
    """
    raw_data = self.__onboarding_event_raw_dataDF
    # `not df` raises ValueError for a DataFrame, so test explicitly.
    if raw_data is None or raw_data.empty:
        print("Error: no phenotype raw data have been loaded")
        return
    matches_trait = raw_data["Phenotype"] == trait
    if not matches_trait.any():
        print(f"Invalid phenotype: {trait}")
        return

    filtered = raw_data[matches_trait]
    print(f"Parsed {filtered.shape[0]} {trait} phenotypes")
    self.__onboarding_event_raw_dataDF = filtered
    print(f"Data set trimmed down to {filtered.shape[0]} rows")
644
def list_raw_data(self):
    """Return the cached raw phenotype data, or None when there is none."""
    raw_data = self.__onboarding_event_raw_dataDF
    # Check for None here so "nothing loaded" is reported instead of raising.
    if raw_data is None or not self.__validate_dataframe_has_content(raw_data):
        print("Error: no phenotype data have been loaded.")
        return None
    return raw_data
650
def remove_stored_raw_data(self):
    """Drop the cached raw phenotype data."""
    # The original compared with `==` instead of assigning, so nothing was
    # ever removed, and its `if df:` guard raises ValueError for a DataFrame.
    self.__onboarding_event_raw_dataDF = None
654
def upload_summarized_data(self):
    """Placeholder: does nothing and returns None."""
    #sri
    return None
658
def upload_analysis_type_status(self, dataDF, **kwargs):
    """POST the analysis type status as JSON to the comp-bio microservice.

    Args:
        dataDF: payload to upload; a DataFrame is converted to JSON records.
        **kwargs: accepted for compatibility and not used.

    Returns:
        True when an attempt answered with status 200, False otherwise.
    """
    path = (self.__end_point_paths or {}).get("PhenotypeAnalysisTypeStatus")
    if not path:
        print("Error: failed to get 'PhenotypeAnalysisTypeStatus' path from config file")
        return False
    url = self.__update_url_path(self.__onboarding_comp_bio_microservice_api_url, path)

    if isinstance(dataDF, pd.DataFrame):
        # Serialise through pandas so numpy scalars become plain JSON values.
        # NOTE(review): the "records" orientation is an assumption - confirm
        # the shape the end point expects.
        payload = json.loads(dataDF.to_json(orient="records"))
    else:
        payload = dataDF

    # try up to max_validation_attempts times, sleeping in between
    max_validation_attempts = 2
    sleep_time_secs = 30
    for attempt in range(1, max_validation_attempts + 1):
        # `json=` sends a JSON body; `encode=` is not a requests keyword and raised TypeError.
        resp = requests.post(url, json=payload)
        print(resp)
        if resp.status_code == 200:
            print("Successfully uploaded upload Analysis Type Status")
            return True
        print(f"Error: upload Analysis Type Status {attempt} of {max_validation_attempts} failed")
        # No point in waiting after the final attempt.
        if attempt < max_validation_attempts:
            time.sleep(sleep_time_secs)
    return False
680
681
682
def get_data_by_analysis_id(self, analysis_id):
    """Fetch the phenotype analysis with the given id.

    Args:
        analysis_id: id appended to the end point path; it may be a string
            or a number.

    Returns:
        The response as a DataFrame, or None when the end point path is
        missing or the internal settings do not validate.
    """
    path = (self.__end_point_paths or {}).get("GetPhenotypeAnalysisByIdPath")
    if not path:
        print("Error: failed to get 'GetPhenotypeAnalysisByIdPath' end point path from config file")
        return None
    if not self.__validate_internal_settings():
        print("Error: attempt to validate Breed internal setttings failed")
        return None
    # f-string instead of `+` so that a numeric id does not raise TypeError.
    url = self.__update_url_path(self.__api_url, f"{path}{analysis_id}")
    resp = self.__validated_api_end_point_response(url)
    # FIXME: lots of columns with NAs .. maybe just print/display certain columns only
    return pd.DataFrame(resp)
697
def load_germplasm_groups(self):
    """Fetch the germplasm groups of the selected specie, cache and print them."""
    germ_plasm_groups_path = (self.__end_point_paths or {}).get("GermplasmGroupsPath")
    if not germ_plasm_groups_path:
        print("Error: failed to get 'GermplasmGroupsPath' end point from config file")
        return
    if not self.__species_id:
        print("Error: specieId has not been internally set")
        return
    # f-string instead of `+` so that a numeric specie id does not raise TypeError.
    path = f"{germ_plasm_groups_path}/{self.__species_id}"
    url = self.__update_url_path(self.__api_url, path)
    resp = self.__validated_api_end_point_response(url)
    self.__germplasm_groupsDF = pd.DataFrame(resp)
    if not self.__germplasm_groupsDF.shape[0]:
        print(f"No germplams groups were found for specie {self.__species_name}")
        return
    print(self.__germplasm_groupsDF[["Id", "Name", "Type", "SpeciesId", "Traits", "HasModel"]])
714
def list_germplasm_groups(self):
    """Return the cached germplasm groups, or None when none are loaded or the cache is empty."""
    groups = self.__germplasm_groupsDF
    # None means "never loaded"; the original raised AttributeError here.
    if groups is None:
        print("Error: germplasm groups have not been loaded. Please run loadGermplasmGroups() function first")
        return None
    if groups.empty:
        print(f"No germplasm groups found for specie: {self.__species_name}")
        return None
    return groups
723
def list_germplasms_by_group(self, group_id):
    """Fetch the germplasms that belong to one germplasm group.

    Args:
        group_id: id of a group present in the cached germplasm groups.

    Returns:
        The response as a DataFrame (possibly empty), or None when a
        precondition is not met.
    """
    # The path lives in the end point configuration; the original indexed
    # the API URL string instead, which raises TypeError.
    germ_plasm_group_path = (self.__end_point_paths or {}).get("GermplasmGroupPath")
    if not germ_plasm_group_path:
        print("Error: failed to get 'GermplasmGroupPath' end point from config file")
        return None
    groups = self.__germplasm_groupsDF
    if groups is None:
        print("Error: germplasm groups have not been loaded. Please run loadGermplasmGroups() function first")
        return None
    if not groups.shape[0]:
        print(f"No germplasm groups found for specie: {self.__species_name}")
        return None
    # `x in series` tests the index labels; compare against the values instead.
    if not (groups["Id"].astype(str) == str(group_id)).any():
        print(f"Error: germplasm group id {group_id} is not valid for specie: {self.__species_name}")
        return None
    path = f"{germ_plasm_group_path}{group_id}/Phenotypes/FALSE"
    url = self.__update_url_path(self.__api_url, path)
    resp = self.__validated_api_end_point_response(url)
    df = pd.DataFrame(resp)
    if not df.shape[0]:
        print(f"No germplasms found for germplasm group {group_id}")
    return df
745
def validate_germplasm_id(self, germ_plasm_id):
    """Check whether `germ_plasm_id` occurs in the cached germplasms.

    Returns:
        True when the id is found in the "Id" column, False otherwise
        (including when no germplasms are loaded). The outcome is printed
        as well.
    """
    # FIXME: an end point was expected to exist for this but there isn't one,
    # so ids are checked against the list of ids loaded for the species.
    germplasms = self.__species_germplasmsDF
    if germplasms is None or germplasms.empty:
        print("Warning: germplasms for species have not been loaded. Please run loadGermplasms function to load them")
        return False
    # `x in series` tests the index labels; compare against the values instead.
    if not (germplasms["Id"].astype(str) == str(germ_plasm_id)).any():
        print(f"Warning: germplasmId: {germ_plasm_id} is not valid for specie {self.__species_name}")
        return False
    print(f"GermplasmId {germ_plasm_id} is valid for specie {self.__species_name}")
    return True
756
757
def list_phenotypes(self):
    """Return the cached phenotypes, or None when none have been loaded."""
    phenotypes = self.__phenotypesDF
    # None means "never loaded"; the original raised AttributeError here.
    if phenotypes is None or phenotypes.empty:
        print(
            "Error: phenotypes have not been loaded. Please run loadPhenotypes() to load them")
        return None

    return phenotypes
765
def check_phenotype(self, **kwargs):
    """Print the cached phenotypes matching an id or a name.

    Args:
        **kwargs: `id` and/or `name`; when both are given, `id` is used.

    The capitalised column ("Id" / "Name") is preferred, because that is
    what the rest of this file reads from the phenotype frame; the
    lowercase spelling is used when only that one exists.
    """
    phenotypes = self.__phenotypesDF
    # `not df` raises ValueError for a DataFrame, so test explicitly.
    if phenotypes is None or phenotypes.empty:
        print(
            "Error: phenotypes have not been loaded. Please run loadPhenotypes() to load them")
        return

    valid_params = ('id', 'name')
    if not kwargs:
        print("expected either Id or Name input for checking phenotype")
        return

    for i in kwargs:
        if i not in valid_params:
            print(
                f"Error: invalid checkPhenotpye param name: {i}"
                f" valid search params are id and name"
            )
            return

    param = 'id' if 'id' in kwargs else 'name'
    column = next((c for c in (param.capitalize(), param) if c in phenotypes.columns), None)
    if column is None:
        print(f"Error: phenotypes do not have a column for search param: {param}")
        return
    rows = phenotypes[phenotypes[column] == kwargs[param]]

    if len(rows) == 0:
        print(f"No matching phenotype were found for search params: {kwargs}")
        return

    print(rows)
795
def add_phenotype(self):
    """Placeholder: only prints that adding a phenotype is not available."""
    # FIXME: no end point for this currently
    message = "sorry this is not yet functional"
    print(message)
799
def add_phenotype_value(self, **kwargs):
    """POST one phenotype value to the API.

    Args:
        **kwargs: exactly `phenotypeId`, `germplasmId`, `numvalue` and `uomid`.

    Prints an error and returns without sending anything when a parameter
    is unknown or missing, or when the end point path is not configured.
    """
    # check input params
    valid_params = ('phenotypeId', 'germplasmId', 'numvalue', 'uomid')

    for i in kwargs:
        if i not in valid_params:
            print(f"Error: invalid addPhenotypeValue param: {i}")
            return

    missing = [name for name in valid_params if name not in kwargs]
    if missing:
        # The original only printed here and then failed with KeyError below.
        print(
            f"Error: expected {len(valid_params)} input params for addPhenotypeValue "
            f"but got {len(kwargs)}; missing: {', '.join(missing)}"
        )
        return

    # dict to store phenotype params
    data = {
        'PhenotypeId': kwargs['phenotypeId'],
        'GermplasmId': kwargs['germplasmId'],
        'NumValue': kwargs['numvalue'],
        'UOMId': kwargs['uomid']
    }

    path = (self.__end_point_paths or {}).get('AddPhenotypeValuePath')
    if not path:
        print("Error: failed to get 'AddPhenotypeValuePath' from config file")
        return

    url = self.__update_url_path(self.__api_url, path)
    response = requests.post(url, data=data)
    # `text` is a property of the response, not a method.
    reply = response.text
    if reply != 'true':
        print(f"Error: expected 'true' reply but got: {reply}")
        return

    print(reply)
833
def list_UOMs(self):
    """Fetch the units of measure, cache them on the instance and print them.

    The end point configured under 'UOMListPath' is queried and its
    response is stored as a DataFrame, even when it turns out to be empty.
    Returns ``None``; output is printed.
    """
    uom_list_path = self.__end_point_paths.get('UOMListPath')
    if not uom_list_path:
        print("Error: failed to get 'UOMListPath' from config file")
        return

    endpoint = self.__update_url_path(self.__api_url, uom_list_path)
    uoms = pd.DataFrame(self.__validated_api_end_point_response(endpoint))
    self.__UOMlistDF = uoms

    # A frame without rows means the request produced nothing usable.
    if len(uoms.index) == 0:
        print("Error: attempt to listUOM failed")
        return

    display_columns = ["Name", "Description", "Id", "PartnerId", "Active", "CreatedBy"]
    print(uoms[display_columns])
849
def check_UOM(self, uomId=None):
    """Print the cached unit-of-measure row(s) whose ``Id`` equals ``uomId``.

    Args:
        uomId: value to look up in the ``Id`` column of the cached UOM list.

    Returns:
        None. The matching rows, or a diagnostic message, are printed.
    """
    # `not DataFrame` raises ValueError, so test for "never loaded" explicitly.
    if self.__UOMlistDF is None:
        print("Error: uom list has not been loaded locally. Please run listUOMs()")
        return

    uoms = self.__UOMlistDF
    # `value in Series` checks index labels, not the stored values, so the
    # membership test is done against the column's values instead. The
    # column is "Id", the same name selected for printing below.
    if uoms.empty or uomId not in uoms['Id'].values:
        print(f"UOM with Id {uomId} does not currently exist")
        return

    rows = uoms[uoms['Id'] == uomId]
    print(rows[["Name", "Description", "Id", "Active"]])

    # FIXME: users should also be able to filter by name
863
def add_UOM(self, **kwargs):
    """Create a new unit of measure through the 'addUOMPath' end point.

    Args:
        **kwargs: exactly ``name``, ``description`` and ``active``.

    Returns:
        None. Progress and errors are printed. The request is only sent
        when the UOM list has been loaded, no loaded UOM already has the
        same name (case-insensitive) and a user name has been set.
    """
    path = self.__end_point_paths.get('addUOMPath')
    if not path:
        print("Error: failed to get 'addUOMPath' from config file")
        return

    valid_params = ['name', 'description', 'active']

    if len(kwargs) != 3:
        print("Error: not all required inputs have been passed in, Name??, Description?? Active")
        return

    for param in kwargs:
        if param not in valid_params:
            print(f"Error: Invalid addUOM param: {param}")
            return

    if self.__UOMlistDF is None:
        print("Error: existing UOMs have not been loaded. Please run listUOMs() to load them")
        return

    uoms = self.__UOMlistDF
    # The cached frame exposes the name under "Name"; lower-case both sides
    # so the duplicate check is really case-insensitive. A frame without
    # that column (e.g. an empty list) cannot contain a duplicate.
    if 'Name' in uoms.columns:
        known_names = uoms['Name'].astype(str).str.lower()
        if (known_names == str(kwargs['name']).lower()).any():
            print(f"Error: UOM with name {kwargs['name']} already exists")
            return

    # user is not prompted to set userName so check to make sure this is set
    if not self.__user_name:
        print("Error: user name has not been set. Please run setUser() to set it")
        return

    url = self.__update_url_path(self.__api_url, path)

    data = {
        'Name': kwargs['name'],
        'Description': kwargs['description'],
        'Active': kwargs['active'],
        # NOTE(review): __current_time is assigned with a trailing comma at
        # class level, so this is a 1-tuple fixed at class-definition time;
        # confirm the service expects that.
        'CreatedTime': self.__current_time,
        'CreatedBy': self.__user_name,
        'PartnerId': self.__partner_id
    }

    resp = requests.post(url, data)
    # Response.text is a property, not a method.
    reply = resp.text
    if reply != "true":
        print(f"Error: expected 'true' reply but got: {reply}")
        return

    print("Successfully added UOM")
914
915
def list_summary_analysis(self):
    """Fetch and print the phenotype summary analyses for the current partner/dataset.

    The request path is the configured 'PhenotypesAnalysisPath' followed by
    the partner id and the dataset id. Returns ``None``; output is printed.
    """
    path = self.__end_point_paths.get('PhenotypesAnalysisPath')

    if not path:
        print("Error: failed to get 'PhenotypesAnalysisPath' end point path from config file")
        return

    # make sure species and partners are internally set
    if self.__validate_internal_settings() is None:
        print("Error: failed to validate internal settings")
        return

    # str.join only accepts strings; the ids may be stored as integers.
    path = '/'.join([path, str(self.__partner_id), str(self.__dataset_id)])
    url = self.__update_url_path(self.__api_url, path)
    resp = self.__validated_api_end_point_response(url)
    summary = pd.DataFrame(resp)

    if summary.shape[0] == 0:
        print("Error: attempt to list summary analysis returned no rows")
        return

    # Previously the frame was built and dropped, so nothing was listed.
    print(summary)
932
def validate_analysis_id(self, analysisId=None):
    """Ask the 'IsAnalysisIdValidPath' end point whether ``analysisId`` is valid.

    Args:
        analysisId: identifier appended to the end-point path. A falsy
            value is treated as "not supplied".

    Returns:
        None. The verdict, or a diagnostic, is printed.
    """
    if not analysisId:
        print("Expected an analysisId input but did not get it, validateAnalysisId(ID?)")
        return

    path = self.__end_point_paths.get('IsAnalysisIdValidPath')
    if not path:
        print("Error: failed to get 'IsAnalysisIdValidPath' end point from config file")
        return

    # str.join takes a single iterable of strings; passing two positional
    # arguments raised TypeError on every call.
    path = '/'.join([path, str(analysisId)])
    url = self.__update_url_path(self.__api_url, path)
    resp = requests.get(url)
    reply = resp.json()

    if reply:
        print(f"analysisId {analysisId} is valid")
    else:
        print(f"analysisId {analysisId} is NOT VALID")
952
def clean_analysis_id(self, analysisId):
    """POST to the 'CleanAnalysisIdPath' end point for ``analysisId``.

    Args:
        analysisId: identifier appended to the end-point path; it is
            converted with ``str`` so numeric ids are accepted.

    Returns:
        None. The outcome is printed.
    """
    path = self.__end_point_paths.get('CleanAnalysisIdPath')

    if not path:
        print("Error: failed to get 'CleanAnalysisIdPath' end point from config file")
        return

    # str.join rejects non-string items, so coerce the id first.
    path = '/'.join([path, str(analysisId)])
    print(f"Setting clean analysis Id: {analysisId}")
    url = self.__update_url_path(self.__api_url, path)
    resp = requests.post(url)
    result = resp.json()
    if result:
        print(f"Successfully cleaned analysisId {analysisId}")
    else:
        print(f"Error: attempt to clean analysisID return result: {result}")
969
def set_live_analysis_id(self):
    """Placeholder: report that setting a live analysis id is not available."""
    not_ready_message = "Sorry, this function is not ready for use"
    print(not_ready_message)
972
def list_onboarding_projects(self):
    """Print the onboarding projects of the current partner and species.

    The 'GetOnboardingProjectsPath' end point is queried with the partner
    id sent as a ``partner_id`` header; the result is then narrowed to the
    currently set species locally. Returns ``None``; output is printed.
    """
    path = self.__end_point_paths.get('GetOnboardingProjectsPath')
    if not path:
        print("Failed to get 'GetOnboardingProjectsPath' end point path from config file")
        return

    # make sure internal variables such as specieId, name, etc are set
    if self.__validate_internal_settings() is None:
        print("Error: failed to validate Breed internal settings.")
        return

    url = self.__update_url_path(self.__api_url, path)

    # FIXME: sending the species as a header does not work; the end point
    # returns objects for all species, so they are filtered manually below.
    # requests only accepts str/bytes header values, hence the str().
    resp = requests.get(url, headers={'partner_id': str(self.__partner_id)})
    if resp.status_code != 200:
        print(f"Attempt to connect to url failed with status {resp.status_code}")
        # Do not try to parse the body of a failed request.
        return

    projects = pd.DataFrame(resp.json())

    if projects.shape[0] == 0:
        print(f"No onboarding projects found for partnerId {self.__partner_id}")
        return

    # parse just projects for currently set specie
    projects = projects[projects['SpeciesId'] == self.__species_id]
    if projects.shape[0] == 0:
        print(f"No onboarding projects found for specieId {self.__species_id}")
        return

    print(projects[["Id", "PartnerId", "SpeciesId", "AccessGroupId", "Active", "IsComplete", "CreatedBy"]])
1006
def list_onboarding_events(self):
    """Load, filter, cache and print the completed phenotype onboarding events.

    Events come from the 'GetCompletedOnboardingEventsPath' end point and
    are narrowed to the current species, then to the current partner, the
    'Phenotype' upload type and the current dataset. The surviving rows
    are stored on the instance before being printed. Returns ``None``.
    """
    # FIXME: this may not get refreshed when specie is changed
    events_path = self.__end_point_paths.get("GetCompletedOnboardingEventsPath")
    if not events_path:
        print("Failed to get 'GetCompletedOnboardingEventsPath' end point from config file")
        return

    endpoint = self.__update_url_path(self.__api_url, events_path)
    events = pd.DataFrame(self.__validated_api_end_point_response(endpoint))
    if not self.__validate_dataframe_has_content(events):
        print(f"Error: failed to get onboarding events from Breed end point: {events_path}")
        return

    # make sure internal variables such as specieId, name, etc are set
    if not self.__validate_internal_settings():
        print("Error: failed to validate Breed internal settings.")
        return

    events = events[events['SpeciesId'] == self.__species_id]
    if len(events.index) == 0:
        print(f"No onboarding events have been found for Specie: {self.__species_name}")
        return

    # Apply the remaining equality filters one after another.
    remaining_filters = (
        ("PartnerId", self.__partner_id),
        ("UploadType", 'Phenotype'),
        ("DatasetId", self.__dataset_id),
    )
    for column, wanted in remaining_filters:
        events = events[events[column] == wanted]

    if len(events.index) == 0:
        print(f"No onboarding events have been found for Specie {self.__species_name} "
              f"and partner {self.__partner_name}")
        return

    self.__onboarding_eventsDF = events
    shown_columns = ["Name", "Id", "UploadType", "Species", "SpeciesId",
                     "PartnerName", "PartnerId", "DatasetId"]
    print('\nAvailable phenotype onboarding events:')
    print(events[shown_columns])
1044
def list_all_onboarding_events(self):
    """Print every onboarding event returned by 'GetOnboardingEventsPath'.

    No species, partner or dataset filtering is applied. Returns ``None``;
    output is printed.
    """
    # .get() lets the guard below report a missing key instead of raising
    # KeyError before it can run.
    path = self.__end_point_paths.get("GetOnboardingEventsPath")
    if not path:
        print("Failed to get 'GetOnboardingEventsPath' end point path from config file")
        return

    url = self.__update_url_path(self.__api_url, path)
    resp = self.__validated_api_end_point_response(url)
    events = pd.DataFrame(resp)
    if not self.__validate_dataframe_has_content(events):
        print(f"Error: failed to get onboarding events from Breed end point: {path}")
        return

    # "PartnerId" and "Status" are separate columns; without the comma the
    # two literals were concatenated into a non-existent "PartnerIdStatus".
    print(events[["Name", "Id", "UploadType", "Species", "SpeciesId",
                  "PartnerName", "PartnerId", "Status", "IsComplete"]])
1058
def get_phenotype_analysis(self):
    """Placeholder for getting an uploaded phenotype analysis by analysisId.

    Not implemented: the method does nothing and returns ``None``.
    """
    return None
# Ad-hoc driver: build a client and load/list germplasm groups, germplasms
# and phenotypes. These statements run whenever the module is executed or
# imported.
b = Breed(
    user_name="cropos_worker",
    env="dev_v2",
    specie_id=392,
    partner_id=2,
    trait_id=29404,
    dataset_id=1765,
)
b.load_germplasm_groups()
b.list_germplasm_groups()
b.load_germplasms()
print(b.list_germplasms())
b.load_phenotypes()