1{
2 "cells": [
3 {
4 "cell_type": "markdown",
5 "metadata": {},
6 "source": [
7 "## Data Generation for Model v3"
8 ]
9 },
10 {
11 "cell_type": "code",
12 "execution_count": 1,
13 "metadata": {},
14 "outputs": [],
15 "source": [
16 "import pyBigWig\n",
17 "import pandas as pd\n",
18 "import numpy as np\n",
19 "import gc"
20 ]
21 },
22 {
23 "cell_type": "code",
24 "execution_count": 2,
25 "metadata": {},
26 "outputs": [
27 {
28 "data": {
29 "text/html": [
30 "<div>\n",
31 "<style scoped>\n",
32 " .dataframe tbody tr th:only-of-type {\n",
33 " vertical-align: middle;\n",
34 " }\n",
35 "\n",
36 " .dataframe tbody tr th {\n",
37 " vertical-align: top;\n",
38 " }\n",
39 "\n",
40 " .dataframe thead th {\n",
41 " text-align: right;\n",
42 " }\n",
43 "</style>\n",
44 "<table border=\"1\" class=\"dataframe\">\n",
45 " <thead>\n",
46 " <tr style=\"text-align: right;\">\n",
47 " <th></th>\n",
48 " <th>Cell_ID</th>\n",
49 " <th>Mark_ID</th>\n",
50 " <th>CellType</th>\n",
51 " <th>Assay</th>\n",
52 " <th>DataType</th>\n",
53 " <th>fileName</th>\n",
54 " </tr>\n",
55 " </thead>\n",
56 " <tbody>\n",
57 " <tr>\n",
58 " <th>0</th>\n",
59 " <td>C01</td>\n",
60 " <td>M16</td>\n",
61 " <td>adipose_tissue</td>\n",
62 " <td>H3K27ac</td>\n",
63 " <td>T</td>\n",
64 " <td>C01M16.bigwig</td>\n",
65 " </tr>\n",
66 " <tr>\n",
67 " <th>1</th>\n",
68 " <td>C02</td>\n",
69 " <td>M01</td>\n",
70 " <td>adrenal_gland</td>\n",
71 " <td>ATAC-seq</td>\n",
72 " <td>T</td>\n",
73 " <td>C02M01.bigwig</td>\n",
74 " </tr>\n",
75 " <tr>\n",
76 " <th>2</th>\n",
77 " <td>C02</td>\n",
78 " <td>M02</td>\n",
79 " <td>adrenal_gland</td>\n",
80 " <td>DNase-seq</td>\n",
81 " <td>T</td>\n",
82 " <td>C02M02.bigwig</td>\n",
83 " </tr>\n",
84 " <tr>\n",
85 " <th>3</th>\n",
86 " <td>C02</td>\n",
87 " <td>M16</td>\n",
88 " <td>adrenal_gland</td>\n",
89 " <td>H3K27ac</td>\n",
90 " <td>T</td>\n",
91 " <td>C02M16.bigwig</td>\n",
92 " </tr>\n",
93 " <tr>\n",
94 " <th>4</th>\n",
95 " <td>C02</td>\n",
96 " <td>M18</td>\n",
97 " <td>adrenal_gland</td>\n",
98 " <td>H3K36me3</td>\n",
99 " <td>T</td>\n",
100 " <td>C02M18.bigwig</td>\n",
101 " </tr>\n",
102 " <tr>\n",
103 " <th>5</th>\n",
104 " <td>C02</td>\n",
105 " <td>M20</td>\n",
106 " <td>adrenal_gland</td>\n",
107 " <td>H3K4me1</td>\n",
108 " <td>T</td>\n",
109 " <td>C02M20.bigwig</td>\n",
110 " </tr>\n",
111 " <tr>\n",
112 " <th>6</th>\n",
113 " <td>C02</td>\n",
114 " <td>M22</td>\n",
115 " <td>adrenal_gland</td>\n",
116 " <td>H3K4me3</td>\n",
117 " <td>V</td>\n",
118 " <td>C02M22.bigwig</td>\n",
119 " </tr>\n",
120 " <tr>\n",
121 " <th>7</th>\n",
122 " <td>C02</td>\n",
123 " <td>M29</td>\n",
124 " <td>adrenal_gland</td>\n",
125 " <td>H3K9me3</td>\n",
126 " <td>T</td>\n",
127 " <td>C02M29.bigwig</td>\n",
128 " </tr>\n",
129 " <tr>\n",
130 " <th>8</th>\n",
131 " <td>C03</td>\n",
132 " <td>M02</td>\n",
133 " <td>adrenal_gland (embryonic)</td>\n",
134 " <td>DNase-seq</td>\n",
135 " <td>V</td>\n",
136 " <td>C03M02.bigwig</td>\n",
137 " </tr>\n",
138 " <tr>\n",
139 " <th>9</th>\n",
140 " <td>C03</td>\n",
141 " <td>M16</td>\n",
142 " <td>adrenal_gland (embryonic)</td>\n",
143 " <td>H3K27ac</td>\n",
144 " <td>T</td>\n",
145 " <td>C03M16.bigwig</td>\n",
146 " </tr>\n",
147 " </tbody>\n",
148 "</table>\n",
149 "</div>"
150 ],
151 "text/plain": [
152 " Cell_ID Mark_ID CellType Assay DataType \\\n",
153 "0 C01 M16 adipose_tissue H3K27ac T \n",
154 "1 C02 M01 adrenal_gland ATAC-seq T \n",
155 "2 C02 M02 adrenal_gland DNase-seq T \n",
156 "3 C02 M16 adrenal_gland H3K27ac T \n",
157 "4 C02 M18 adrenal_gland H3K36me3 T \n",
158 "5 C02 M20 adrenal_gland H3K4me1 T \n",
159 "6 C02 M22 adrenal_gland H3K4me3 V \n",
160 "7 C02 M29 adrenal_gland H3K9me3 T \n",
161 "8 C03 M02 adrenal_gland (embryonic) DNase-seq V \n",
162 "9 C03 M16 adrenal_gland (embryonic) H3K27ac T \n",
163 "\n",
164 " fileName \n",
165 "0 C01M16.bigwig \n",
166 "1 C02M01.bigwig \n",
167 "2 C02M02.bigwig \n",
168 "3 C02M16.bigwig \n",
169 "4 C02M18.bigwig \n",
170 "5 C02M20.bigwig \n",
171 "6 C02M22.bigwig \n",
172 "7 C02M29.bigwig \n",
173 "8 C03M02.bigwig \n",
174 "9 C03M16.bigwig "
175 ]
176 },
177 "execution_count": 2,
178 "metadata": {},
179 "output_type": "execute_result"
180 }
181 ],
182 "source": [
183 "filepath = \"../../data/\"\n",
184 "\n",
185 "df_meta = pd.read_csv(filepath + \"Encode_meta.tsv\", sep=\"\\t\")\n",
186 "df_meta.rename(columns={\"Training(T),Validation(V),Blind-test(B)\":\"DataType\"}, \n",
187 " inplace=True)\n",
188 "df_meta.head(10)"
189 ]
190 },
191 {
192 "cell_type": "raw",
193 "metadata": {},
194 "source": [
195 "# load blacklist\n",
196 "# TODO: this is hard to deal with. ignore for now\n",
197 "df_black = pd.read_csv(filepath + \"hg38.blacklist.bed.gz\", \n",
198 " sep = \"\\t\", header = None,\n",
199 " names = [\"chr\", \"start\", \"end\"])\n",
200 "df_black.head()"
201 ]
202 },
203 {
204 "cell_type": "code",
205 "execution_count": 3,
206 "metadata": {},
207 "outputs": [],
208 "source": [
209 "filepath_train = \"../../data/training_data/\"\n",
210 "filepath_valid = \"../../data/validation_data/\""
211 ]
212 },
213 {
214 "cell_type": "code",
215 "execution_count": 4,
216 "metadata": {},
217 "outputs": [],
218 "source": [
219 "chrom = \"chr16\"\n",
220 "n_cpu = 5"
221 ]
222 },
223 {
224 "cell_type": "markdown",
225 "metadata": {},
226 "source": [
227 "### 1. Prepare Avocado Input Dict\n",
228 "\n",
229 "25 bin and arcsinh"
230 ]
231 },
232 {
233 "cell_type": "code",
234 "execution_count": 5,
235 "metadata": {},
236 "outputs": [],
237 "source": [
"# filepath = fpth + fname\n",
"def get_binvals(filepath, chrom):\n",
"    \"\"\"Get 25-bp bin values of a given chromosome.\n",
"    Take arcsinh first, then bin (mean).\n",
"    \n",
"    Params:\n",
"    ========\n",
"    filepath: str. path to the bigwig file\n",
"    chrom: str. specify which chromosome to process\n",
"    \n",
"    Return:\n",
"    ========\n",
"    vals: np.array('float32') with length of nbins\n",
"    \"\"\"\n",
"    bw = pyBigWig.open(filepath)\n",
"    # close the handle even if any read below raises, otherwise the\n",
"    # file descriptor leaks inside long-running pool workers\n",
"    try:\n",
"        end = bw.chroms(chrom)\n",
"        nbins = -(-end // 25)  ## round-up (ceil) division\n",
"\n",
"        vals = np.array(bw.values(chrom, 0, end))\n",
"        # bigwig gaps come back as NaN; treat them as zero signal\n",
"        vals[np.isnan(vals)] = 0\n",
"\n",
"        # variance-stabilizing transform BEFORE binning\n",
"        vals = np.arcsinh(vals)\n",
"\n",
"        # pad with zeros so the length is an exact multiple of 25\n",
"        vals = np.append(vals, np.zeros(nbins*25 - end))\n",
"\n",
"        # mean over each 25-bp bin\n",
"        vals = vals.reshape(nbins, 25).mean(axis=1)\n",
"        vals = vals.astype('float32')\n",
"    finally:\n",
"        bw.close()\n",
"\n",
"    return vals"
275 ]
276 },
277 {
278 "cell_type": "code",
279 "execution_count": 6,
280 "metadata": {},
281 "outputs": [],
282 "source": [
"def preproc(chrom, filepath):\n",
"    \"\"\"Bin one bigwig track and key it by (Cell_ID, Mark_ID).\n",
"\n",
"    IDs are parsed from the file name, e.g. 'C01M16.bigwig' -> ('C01', 'M16').\n",
"    \"\"\"\n",
"    binned = get_binvals(filepath, chrom)\n",
"\n",
"    basename = filepath.split(\"/\")[-1]\n",
"    cell_id, mark_id = basename[0:3], basename[3:6]\n",
"\n",
"    return {(cell_id, mark_id): binned}"
292 ]
293 },
294 {
295 "cell_type": "code",
296 "execution_count": 7,
297 "metadata": {},
298 "outputs": [],
299 "source": [
"# resolve the on-disk path for each training (T) / validation (V) track;\n",
"# any other DataType (e.g. blind-test 'B') is skipped\n",
"path_by_type = {\"T\": filepath_train, \"V\": filepath_valid}\n",
"\n",
"fnames = []\n",
"for _, row in df_meta.iterrows():\n",
"    base = path_by_type.get(row.DataType)\n",
"    if base is None:\n",
"        continue\n",
"    fnames.append(base + row.fileName)"
313 ]
314 },
315 {
316 "cell_type": "code",
317 "execution_count": null,
318 "metadata": {},
319 "outputs": [],
320 "source": [
"from multiprocess import Pool\n",
"\n",
"# bin every bigwig track in parallel; each worker returns a one-entry\n",
"# dict {(Cell_ID, Mark_ID): binned_values} merged in the next cell\n",
"pool = Pool(n_cpu) # use 5 for large genomes\n",
"\n",
"result = pool.map(lambda x: preproc(chrom, x), fnames)\n",
"\n",
"pool.close()\n",
"pool.join()"
328 ]
329 },
330 {
331 "cell_type": "code",
332 "execution_count": null,
333 "metadata": {},
334 "outputs": [],
335 "source": [
"# merge the per-file one-entry dicts into a single lookup table\n",
"# NOTE: a later cell does `del ... d`, so the loop variable must stay `d`\n",
"data = {}\n",
"for d in result:\n",
"    data.update(d)"
339 ]
340 },
341 {
342 "cell_type": "code",
343 "execution_count": null,
344 "metadata": {},
345 "outputs": [],
346 "source": [
347 "len(data)"
348 ]
349 },
350 {
351 "cell_type": "code",
352 "execution_count": null,
353 "metadata": {},
354 "outputs": [],
355 "source": [
356 "import pickle\n",
357 "\n",
358 "outpath = \"../../data/model_v3_data/\"\n",
359 "outname = \"dict_all_\" + chrom + \"_arcsinh.pickle\"\n",
360 "\n",
361 "with open(outpath + outname, 'wb') as handle:\n",
362 " pickle.dump(data, handle, \n",
363 " protocol=pickle.HIGHEST_PROTOCOL)\n",
364 " \n",
365 "del result, data, pool, d\n",
366 "gc.collect()"
367 ]
368 },
369 {
370 "cell_type": "markdown",
371 "metadata": {},
372 "source": [
373 "### 2. Average Assay"
374 ]
375 },
376 {
377 "cell_type": "code",
378 "execution_count": null,
379 "metadata": {},
380 "outputs": [],
381 "source": [
382 "from multiprocess import Pool\n",
383 "import pickle\n",
384 "\n",
385 "outpath = \"../../data/model_v3_data/\"\n",
386 "outname = \"dict_all_\" + chrom + \"_arcsinh.pickle\"\n",
387 "\n",
388 "with open(outpath + outname, 'rb') as handle:\n",
389 " data = pickle.load(handle)"
390 ]
391 },
392 {
393 "cell_type": "code",
394 "execution_count": null,
395 "metadata": {},
396 "outputs": [],
397 "source": [
398 "## get data_avg for same assay (for each cell type, the avg assay is the same)\n",
399 "indices = np.array([[celltype for celltype, _ in data.keys()],\n",
400 " [assay for _, assay in data.keys()]])\n",
401 "tracks = list(data.values())"
402 ]
403 },
404 {
405 "cell_type": "code",
406 "execution_count": null,
407 "metadata": {},
408 "outputs": [],
409 "source": [
"def vals2cat(v):\n",
"    \"\"\"Rank-transform values into 3 equal-size categories (0, 1, 2).\n",
"    \n",
"    Params:\n",
"    ========\n",
"    v: array-like of numeric values\n",
"    \n",
"    Return:\n",
"    ========\n",
"    np.array('int8') of category labels, same length as v\n",
"    \"\"\"\n",
"    vals = pd.Series(v)\n",
"    # method='first' breaks ties by position so qcut always gets unique edges\n",
"    vals = vals.rank(method='first') \n",
"    vals = pd.qcut(vals, 3, labels=[0, 1, 2])\n",
"    vals = np.array(vals)\n",
"    # BUG FIX: astype returns a NEW array -- the original discarded it,\n",
"    # so the int8 conversion never took effect\n",
"    vals = vals.astype('int8')\n",
"    \n",
"    return vals"
419 ]
420 },
421 {
422 "cell_type": "code",
423 "execution_count": null,
424 "metadata": {},
425 "outputs": [],
426 "source": [
"def get_avg(k, v):\n",
"    \"\"\"Average all tracks that share this track's assay.\n",
"    \n",
"    Params:\n",
"    ========\n",
"    k: tuple (Cell_ID, Mark_ID) key of the track\n",
"    v: unused; kept so pool.map can pass data.items() uniformly\n",
"    \n",
"    Return:\n",
"    ========\n",
"    {k: averaged_track} if at least one OTHER cell has this assay,\n",
"    otherwise None (the average would just be the track itself)\n",
"    \"\"\"\n",
"    a_id = k[1]\n",
"    \n",
"    # boolean mask over all tracks with the same assay (includes k itself)\n",
"    same_assay = (indices[1,:] == a_id)\n",
"    track_avg = np.zeros(len(tracks[0]), dtype=np.float32)\n",
"    n_cells = 0\n",
"    \n",
"    # accumulate sequentially (keeps float32 summation order deterministic)\n",
"    for j, is_same in enumerate(same_assay):\n",
"        if is_same:\n",
"            track_avg += tracks[j]\n",
"            n_cells += 1\n",
"    \n",
"    if n_cells > 1:\n",
"        return {k: track_avg / n_cells}\n",
"    return None"
449 ]
450 },
451 {
452 "cell_type": "code",
453 "execution_count": null,
454 "metadata": {},
455 "outputs": [],
456 "source": [
"def get_avg_cat(k, v):\n",
"    \"\"\"Same-assay average track, discretized into 3 categories.\n",
"    \n",
"    Delegates the averaging to get_avg() (defined above) instead of\n",
"    duplicating its loop, then converts the result with vals2cat().\n",
"    Returns None exactly when get_avg() does (no other cell shares\n",
"    this assay).\n",
"    \"\"\"\n",
"    avg = get_avg(k, v)\n",
"    if avg is None:\n",
"        return None\n",
"    return {k: vals2cat(avg[k])}"
479 ]
480 },
481 {
482 "cell_type": "code",
483 "execution_count": null,
484 "metadata": {},
485 "outputs": [],
486 "source": [
"def get_all_cat(k, v):\n",
"    \"\"\"Discretize a single track into 3 categories, keyed by k.\"\"\"\n",
"    return {k: vals2cat(v)}"
489 ]
490 },
491 {
492 "cell_type": "markdown",
493 "metadata": {},
494 "source": [
495 "### Calculate Average"
496 ]
497 },
498 {
499 "cell_type": "code",
500 "execution_count": null,
501 "metadata": {},
502 "outputs": [],
503 "source": [
504 "pool = Pool(n_cpu) # use 5 for large genomes\n",
505 "\n",
506 "result = pool.map(lambda x: get_avg(x[0], x[1]), data.items())\n",
507 "\n",
508 "pool.close()\n",
509 "pool.join()"
510 ]
511 },
512 {
513 "cell_type": "code",
514 "execution_count": null,
515 "metadata": {},
516 "outputs": [],
517 "source": [
"# collect the averaged tracks; workers return None for assays with\n",
"# no other cell, so skip those (loop var `d` is deleted in a later cell)\n",
"data_avg = {}\n",
"for d in result:\n",
"    if d is not None:\n",
"        data_avg.update(d)"
522 ]
523 },
524 {
525 "cell_type": "code",
526 "execution_count": null,
527 "metadata": {},
528 "outputs": [],
529 "source": [
530 "outpath = \"../../data/model_v3_data/\"\n",
531 "outname_avg = \"dict_avg_\" + chrom + \"_arcsinh.pickle\"\n",
532 "\n",
533 "with open(outpath + outname_avg, 'wb') as handle1:\n",
534 " pickle.dump(data_avg, handle1, \n",
535 " protocol=pickle.HIGHEST_PROTOCOL)\n",
536 "\n",
537 "del result, data_avg, pool, d\n",
538 "gc.collect()"
539 ]
540 },
541 {
542 "cell_type": "markdown",
543 "metadata": {},
544 "source": [
545 "### Calculate Average-3 Categories"
546 ]
547 },
548 {
549 "cell_type": "code",
550 "execution_count": null,
551 "metadata": {},
552 "outputs": [],
553 "source": [
554 "pool = Pool(n_cpu) # use 5 for large genomes\n",
555 "\n",
556 "result = pool.map(lambda x: get_avg_cat(x[0], x[1]), data.items())\n",
557 "\n",
558 "pool.close()\n",
559 "pool.join()"
560 ]
561 },
562 {
563 "cell_type": "code",
564 "execution_count": null,
565 "metadata": {},
566 "outputs": [],
567 "source": [
"# collect the categorized average tracks; None entries are assays\n",
"# with no other cell (loop var `d` is deleted in a later cell)\n",
"data_avg_cat = {}\n",
"for d in result:\n",
"    if d is not None:\n",
"        data_avg_cat.update(d)"
572 ]
573 },
574 {
575 "cell_type": "code",
576 "execution_count": null,
577 "metadata": {},
578 "outputs": [],
579 "source": [
580 "outpath = \"../../data/model_v3_data/\"\n",
581 "outname_avg_cat = \"dict_avg3cat_\" + chrom + \"_arcsinh.pickle\"\n",
582 " \n",
583 "with open(outpath + outname_avg_cat, 'wb') as handle2:\n",
584 " pickle.dump(data_avg_cat, handle2, \n",
585 " protocol=pickle.HIGHEST_PROTOCOL)\n",
586 "\n",
587 "del result, data_avg_cat, pool, d\n",
588 "gc.collect()"
589 ]
590 },
591 {
592 "cell_type": "markdown",
593 "metadata": {},
594 "source": [
595 "### Convert All to 3 Categories"
596 ]
597 },
598 {
599 "cell_type": "code",
600 "execution_count": null,
601 "metadata": {},
602 "outputs": [],
603 "source": [
604 "pool = Pool(n_cpu) # use 5 for large genomes\n",
605 "\n",
606 "result = pool.map(lambda x: get_all_cat(x[0], x[1]), data.items())\n",
607 "\n",
608 "pool.close()\n",
609 "pool.join()"
610 ]
611 },
612 {
613 "cell_type": "code",
614 "execution_count": null,
615 "metadata": {},
616 "outputs": [],
617 "source": [
"# collect all per-track categorized dicts; get_all_cat never returns\n",
"# None, the guard just mirrors the other merge cells\n",
"data_all_cat = {}\n",
"for d in result:\n",
"    if d is not None:\n",
"        data_all_cat.update(d)"
622 ]
623 },
624 {
625 "cell_type": "code",
626 "execution_count": null,
627 "metadata": {},
628 "outputs": [],
629 "source": [
630 "len(data_all_cat)"
631 ]
632 },
633 {
634 "cell_type": "code",
635 "execution_count": null,
636 "metadata": {},
637 "outputs": [],
638 "source": [
639 "outpath = \"../../data/model_v3_data/\"\n",
640 "outname_all_cat = \"dict_3cat_\" + chrom + \"_arcsinh.pickle\"\n",
641 " \n",
642 "with open(outpath + outname_all_cat, 'wb') as handle3:\n",
643 " pickle.dump(data_all_cat, handle3, \n",
644 " protocol=pickle.HIGHEST_PROTOCOL)\n",
645 "\n",
646 "del result, data_all_cat, pool, d\n",
647 "gc.collect()"
648 ]
649 },
650 {
651 "cell_type": "markdown",
652 "metadata": {},
653 "source": [
654 "## 3. Seq Motif\n",
655 "\n",
656 "2019/07/09 - switch to cismotif instead of cistrome"
657 ]
658 },
659 {
660 "cell_type": "raw",
661 "metadata": {},
662 "source": [
663 "import os\n",
664 "\n",
665 "#seq_path = \"../../data/sequence_data/cistrome_hg38/hg38_cistrome/\"\n",
666 "seq_path = \"../../data/sequence_data/cistrome_hg38/hg38_cismotifs/\"\n",
667 "fnames = os.listdir(seq_path)"
668 ]
669 },
670 {
671 "cell_type": "raw",
672 "metadata": {},
673 "source": [
"# build one record per motif file, then construct the frame once:\n",
"# DataFrame.append in a loop is quadratic and deprecated (removed in\n",
"# pandas >= 2.0)\n",
"records = []\n",
"for f in fnames:\n",
"\n",
"    name = f.split(\".\")\n",
"    records.append({\"motif\": name[0].split(\"_\")[0],\n",
"                    \"quality\": name[1],\n",
"                    \"filename\": f})\n",
"\n",
"df_cist = pd.DataFrame(records, columns=[\"motif\", \"quality\", \"filename\"])"
686 ]
687 },
688 {
689 "cell_type": "raw",
690 "metadata": {},
691 "source": [
692 "df_cist.sort_values([\"motif\", \"quality\"], inplace=True)\n",
693 "df_cist_unique = df_cist.groupby(\"motif\").first()\n",
694 "df_cist_unique.reset_index(inplace=True)\n",
695 "\n",
696 "print(df_cist_unique.shape)\n",
697 "df_cist_unique.head()"
698 ]
699 },
700 {
701 "cell_type": "markdown",
702 "metadata": {},
703 "source": [
704 "create a 2D array for sequence motif, \n",
705 "shape = (nbins, 599):\n",
706 "\n",
707 " bin1 = [0, 0, 0, 1, 1, 0, ...] (599 cols)\n",
708 " bin2 = [...]"
709 ]
710 },
711 {
712 "cell_type": "raw",
713 "metadata": {},
714 "source": [
715 "bw = pyBigWig.open(filepath_train + \"C01M16.bigwig\")\n",
716 "end = bw.chroms(chrom)\n",
717 "nbins = (-(-end // 25)) ## round-up\n",
718 "\n",
719 "bw.close()"
720 ]
721 },
722 {
723 "cell_type": "raw",
724 "metadata": {},
725 "source": [
"def get_motif(filename, chrom, nbins):\n",
"    \"\"\"For a given cistrome, get an array of flags (1) \n",
"    where motif exists. \n",
"    \n",
"    Params:\n",
"    ========\n",
"    filename: str\n",
"        cistrome filename (filepath is fixed)\n",
"    chrom: str. e.g. 'chr19'\n",
"    nbins: int. number of bins for that chrom\n",
"    \n",
"    Return:\n",
"    ========\n",
"    {motif_name: np.array (nbins,) of int8 flags}\n",
"    \"\"\"\n",
"    \n",
"    motif = filename.split(\".\")[0].split(\"_\")[0]\n",
"    \n",
"    df_motif = pd.read_csv(seq_path+filename, \n",
"                           sep = \"\\t\", \n",
"                           header = None,\n",
"                           names = [\"chr\", \"start\", \"end\", \"confidence\"])\n",
"    # NOTE: no sort needed -- flags are set idempotently per interval;\n",
"    # the original sort_values(inplace=True) on this filtered slice also\n",
"    # triggered SettingWithCopyWarning\n",
"    df_motif = df_motif[df_motif.chr == chrom]\n",
"    \n",
"    motif_bin = np.zeros(nbins, dtype=\"int8\")\n",
"\n",
"    # conservative range: flag only bins fully WITHIN the motif [s, e).\n",
"    # Bin i spans [25*i, 25*i + 25), so the contained bins are\n",
"    # i in [ceil(s/25), (e-25)//25]. Direct slicing is O(1) per motif,\n",
"    # replacing the original O(n_motifs * nbins) boolean masks.\n",
"    for s, e in zip(df_motif[\"start\"].values, df_motif[\"end\"].values):\n",
"        lo = max(-(-s // 25), 0)\n",
"        hi = min((e - 25) // 25, nbins - 1)\n",
"        if hi >= lo:\n",
"            motif_bin[lo:hi + 1] = 1\n",
"\n",
"    print(sum(motif_bin == 1), \"bins have motif -\", filename)\n",
"    \n",
"    return {motif: motif_bin}"
777 ]
778 },
779 {
780 "cell_type": "raw",
781 "metadata": {},
782 "source": [
783 "# choose only quality 'A' motifs\n",
784 "idx_A = df_cist_unique.quality == \"A\"\n",
785 "\n",
786 "motif_files = df_cist_unique.filename.values\n",
787 "motif_files = motif_files[idx_A]"
788 ]
789 },
790 {
791 "cell_type": "raw",
792 "metadata": {},
793 "source": [
794 "len(motif_files)"
795 ]
796 },
797 {
798 "cell_type": "raw",
799 "metadata": {},
800 "source": [
801 "from multiprocess import Pool"
802 ]
803 },
804 {
805 "cell_type": "raw",
806 "metadata": {
807 "scrolled": true
808 },
809 "source": [
810 "pool = Pool(10)\n",
811 "\n",
812 "result = pool.map(lambda x: get_motif(x, chrom, nbins), motif_files)\n",
813 "\n",
814 "pool.close()\n",
815 "pool.join()"
816 ]
817 },
818 {
819 "cell_type": "raw",
820 "metadata": {},
821 "source": [
822 "data_motif = {}\n",
823 "for d in result:\n",
824 " data_motif.update(d)\n",
825 " \n",
826 "data_motif = pd.DataFrame(data_motif, dtype='int8')\n",
827 "data_motif.head()"
828 ]
829 },
830 {
831 "cell_type": "raw",
832 "metadata": {},
833 "source": [
834 "import pickle\n",
835 "\n",
836 "outpath = \"../../data/model_v3_data/\"\n",
837 "outname = \"arry_seq_\" + chrom + \"_cismotif_A.pickle\"\n",
838 "\n",
839 "with open(outpath + outname, 'wb') as handle:\n",
840 " pickle.dump(data_motif, handle, \n",
841 " protocol=pickle.HIGHEST_PROTOCOL)"
842 ]
843 },
844 {
845 "cell_type": "raw",
846 "metadata": {
847 "scrolled": true
848 },
849 "source": [
850 "#data_motif[data_motif.AP2A.values == 1]"
851 ]
852 },
853 {
854 "cell_type": "markdown",
855 "metadata": {},
856 "source": [
857 "## 4. Identify bins with no signals"
858 ]
859 },
860 {
861 "cell_type": "raw",
862 "metadata": {},
863 "source": [
864 "import pickle\n",
865 "\n",
866 "outpath = \"../../data/model_v3_data/\"\n",
867 "outname = \"dict_all_\" + chrom + \"_arcsinh.pickle\"\n",
868 "\n",
869 "with open(outpath + outname, 'rb') as handle:\n",
870 " data = pickle.load(handle)"
871 ]
872 },
873 {
874 "cell_type": "raw",
875 "metadata": {},
876 "source": [
877 "tracks = list(data.values())\n",
878 "tracks = np.array(tracks)\n",
879 "tracks_var = tracks.var(axis=0)"
880 ]
881 },
882 {
883 "cell_type": "raw",
884 "metadata": {},
885 "source": [
886 "min_cutoff = 0.025\n",
887 "idx_masked = tracks_var < min_cutoff\n",
888 "\n",
889 "print(f\"average track has med var: {np.median(tracks_var):1.4}\")\n",
890 "print(f\"average track has min var: {tracks_var.min():1.4}\")\n",
891 "print(f\"average track has max var: {tracks_var.max():1.4}\\n\")\n",
892 "print(f\"variance lower than {min_cutoff:0.4} will be removed\")\n",
893 "print(f\"{sum(idx_masked):,}/{len(tracks_var):,} bins will be removed.\")"
894 ]
895 },
896 {
897 "cell_type": "raw",
898 "metadata": {},
899 "source": [
900 "outpath = \"../../data/model_v3_data/\"\n",
901 "outname = f\"arry_seq_{chrom}_mask_{str(min_cutoff)}.pickle\"\n",
902 "\n",
903 "with open(outpath + outname, 'wb') as handle:\n",
904 " pickle.dump(idx_masked, handle, \n",
905 " protocol=pickle.HIGHEST_PROTOCOL)"
906 ]
907 },
908 {
909 "cell_type": "code",
910 "execution_count": null,
911 "metadata": {},
912 "outputs": [],
913 "source": []
914 }
915 ],
916 "metadata": {
917 "kernelspec": {
918 "display_name": "Python 3",
919 "language": "python",
920 "name": "python3"
921 },
922 "language_info": {
923 "codemirror_mode": {
924 "name": "ipython",
925 "version": 3
926 },
927 "file_extension": ".py",
928 "mimetype": "text/x-python",
929 "name": "python",
930 "nbconvert_exporter": "python",
931 "pygments_lexer": "ipython3",
932 "version": "3.6.8"
933 }
934 },
935 "nbformat": 4,
936 "nbformat_minor": 2
937}