1{
2 "cells": [
3 {
4 "cell_type": "markdown",
5 "metadata": {},
6 "source": [
7 "## Data Generation for Model v3"
8 ]
9 },
10 {
11 "cell_type": "code",
12 "execution_count": 1,
13 "metadata": {},
14 "outputs": [],
15 "source": [
16 "import pyBigWig\n",
17 "import pandas as pd\n",
18 "import numpy as np\n",
19 "import gc"
20 ]
21 },
22 {
23 "cell_type": "code",
24 "execution_count": 2,
25 "metadata": {},
26 "outputs": [
27 {
28 "data": {
29 "text/html": [
30 "<div>\n",
31 "<style scoped>\n",
32 " .dataframe tbody tr th:only-of-type {\n",
33 " vertical-align: middle;\n",
34 " }\n",
35 "\n",
36 " .dataframe tbody tr th {\n",
37 " vertical-align: top;\n",
38 " }\n",
39 "\n",
40 " .dataframe thead th {\n",
41 " text-align: right;\n",
42 " }\n",
43 "</style>\n",
44 "<table border=\"1\" class=\"dataframe\">\n",
45 " <thead>\n",
46 " <tr style=\"text-align: right;\">\n",
47 " <th></th>\n",
48 " <th>Cell_ID</th>\n",
49 " <th>Mark_ID</th>\n",
50 " <th>CellType</th>\n",
51 " <th>Assay</th>\n",
52 " <th>DataType</th>\n",
53 " <th>fileName</th>\n",
54 " </tr>\n",
55 " </thead>\n",
56 " <tbody>\n",
57 " <tr>\n",
58 " <th>0</th>\n",
59 " <td>C01</td>\n",
60 " <td>M16</td>\n",
61 " <td>adipose_tissue</td>\n",
62 " <td>H3K27ac</td>\n",
63 " <td>T</td>\n",
64 " <td>C01M16.bigwig</td>\n",
65 " </tr>\n",
66 " <tr>\n",
67 " <th>1</th>\n",
68 " <td>C02</td>\n",
69 " <td>M01</td>\n",
70 " <td>adrenal_gland</td>\n",
71 " <td>ATAC-seq</td>\n",
72 " <td>T</td>\n",
73 " <td>C02M01.bigwig</td>\n",
74 " </tr>\n",
75 " <tr>\n",
76 " <th>2</th>\n",
77 " <td>C02</td>\n",
78 " <td>M02</td>\n",
79 " <td>adrenal_gland</td>\n",
80 " <td>DNase-seq</td>\n",
81 " <td>T</td>\n",
82 " <td>C02M02.bigwig</td>\n",
83 " </tr>\n",
84 " <tr>\n",
85 " <th>3</th>\n",
86 " <td>C02</td>\n",
87 " <td>M16</td>\n",
88 " <td>adrenal_gland</td>\n",
89 " <td>H3K27ac</td>\n",
90 " <td>T</td>\n",
91 " <td>C02M16.bigwig</td>\n",
92 " </tr>\n",
93 " <tr>\n",
94 " <th>4</th>\n",
95 " <td>C02</td>\n",
96 " <td>M18</td>\n",
97 " <td>adrenal_gland</td>\n",
98 " <td>H3K36me3</td>\n",
99 " <td>T</td>\n",
100 " <td>C02M18.bigwig</td>\n",
101 " </tr>\n",
102 " <tr>\n",
103 " <th>5</th>\n",
104 " <td>C02</td>\n",
105 " <td>M20</td>\n",
106 " <td>adrenal_gland</td>\n",
107 " <td>H3K4me1</td>\n",
108 " <td>T</td>\n",
109 " <td>C02M20.bigwig</td>\n",
110 " </tr>\n",
111 " <tr>\n",
112 " <th>6</th>\n",
113 " <td>C02</td>\n",
114 " <td>M22</td>\n",
115 " <td>adrenal_gland</td>\n",
116 " <td>H3K4me3</td>\n",
117 " <td>V</td>\n",
118 " <td>C02M22.bigwig</td>\n",
119 " </tr>\n",
120 " <tr>\n",
121 " <th>7</th>\n",
122 " <td>C02</td>\n",
123 " <td>M29</td>\n",
124 " <td>adrenal_gland</td>\n",
125 " <td>H3K9me3</td>\n",
126 " <td>T</td>\n",
127 " <td>C02M29.bigwig</td>\n",
128 " </tr>\n",
129 " <tr>\n",
130 " <th>8</th>\n",
131 " <td>C03</td>\n",
132 " <td>M02</td>\n",
133 " <td>adrenal_gland (embryonic)</td>\n",
134 " <td>DNase-seq</td>\n",
135 " <td>V</td>\n",
136 " <td>C03M02.bigwig</td>\n",
137 " </tr>\n",
138 " <tr>\n",
139 " <th>9</th>\n",
140 " <td>C03</td>\n",
141 " <td>M16</td>\n",
142 " <td>adrenal_gland (embryonic)</td>\n",
143 " <td>H3K27ac</td>\n",
144 " <td>T</td>\n",
145 " <td>C03M16.bigwig</td>\n",
146 " </tr>\n",
147 " </tbody>\n",
148 "</table>\n",
149 "</div>"
150 ],
151 "text/plain": [
152 " Cell_ID Mark_ID CellType Assay DataType \\\n",
153 "0 C01 M16 adipose_tissue H3K27ac T \n",
154 "1 C02 M01 adrenal_gland ATAC-seq T \n",
155 "2 C02 M02 adrenal_gland DNase-seq T \n",
156 "3 C02 M16 adrenal_gland H3K27ac T \n",
157 "4 C02 M18 adrenal_gland H3K36me3 T \n",
158 "5 C02 M20 adrenal_gland H3K4me1 T \n",
159 "6 C02 M22 adrenal_gland H3K4me3 V \n",
160 "7 C02 M29 adrenal_gland H3K9me3 T \n",
161 "8 C03 M02 adrenal_gland (embryonic) DNase-seq V \n",
162 "9 C03 M16 adrenal_gland (embryonic) H3K27ac T \n",
163 "\n",
164 " fileName \n",
165 "0 C01M16.bigwig \n",
166 "1 C02M01.bigwig \n",
167 "2 C02M02.bigwig \n",
168 "3 C02M16.bigwig \n",
169 "4 C02M18.bigwig \n",
170 "5 C02M20.bigwig \n",
171 "6 C02M22.bigwig \n",
172 "7 C02M29.bigwig \n",
173 "8 C03M02.bigwig \n",
174 "9 C03M16.bigwig "
175 ]
176 },
177 "execution_count": 2,
178 "metadata": {},
179 "output_type": "execute_result"
180 }
181 ],
182 "source": [
183 "filepath = \"../../data/\"\n",
184 "\n",
185 "df_meta = pd.read_csv(filepath + \"Encode_meta.tsv\", sep=\"\\t\")\n",
186 "df_meta.rename(columns={\"Training(T),Validation(V),Blind-test(B)\":\"DataType\"}, \n",
187 " inplace=True)\n",
188 "df_meta.head(10)"
189 ]
190 },
191 {
192 "cell_type": "raw",
193 "metadata": {},
194 "source": [
195 "# load blacklist\n",
196 "# TODO: this is hard to deal with. ignore for now\n",
197 "df_black = pd.read_csv(filepath + \"hg38.blacklist.bed.gz\", \n",
198 " sep = \"\\t\", header = None,\n",
199 " names = [\"chr\", \"start\", \"end\"])\n",
200 "df_black.head()"
201 ]
202 },
203 {
204 "cell_type": "code",
205 "execution_count": 3,
206 "metadata": {},
207 "outputs": [],
208 "source": [
209 "filepath_train = \"../../data/training_data/\"\n",
210 "filepath_valid = \"../../data/validation_data/\""
211 ]
212 },
213 {
214 "cell_type": "code",
215 "execution_count": 4,
216 "metadata": {},
217 "outputs": [],
218 "source": [
219 "chrom = \"chr16\"\n",
220 "n_cpu = 5"
221 ]
222 },
223 {
224 "cell_type": "markdown",
225 "metadata": {},
226 "source": [
227 "### 1. Prepare Avocado Input Dict\n",
228 "\n",
229 "25 bin and arcsinh"
230 ]
231 },
232 {
233 "cell_type": "code",
234 "execution_count": 5,
235 "metadata": {},
236 "outputs": [],
237 "source": [
"# filepath = fpth + fname\n",
"def get_binvals(filepath, chrom):\n",
"    \"\"\"Get 25-bp bin values of a given chromosome.\n",
"    Take arcsinh first, then bin (mean).\n",
"    \n",
"    Params:\n",
"    ========\n",
"    filepath: str. path to the bigwig file\n",
"    chrom: str. specify which chromosome to process\n",
"    \n",
"    Return:\n",
"    ========\n",
"    vals: np.array('float32') with length of nbins\n",
"    \"\"\"\n",
"    bw = pyBigWig.open(filepath)\n",
"    # close the handle even if any read below raises, otherwise the\n",
"    # file descriptor leaks inside long-running pool workers\n",
"    try:\n",
"        end = bw.chroms(chrom)\n",
"        nbins = -(-end // 25)  ## round-up (ceil) division\n",
"\n",
"        vals = np.array(bw.values(chrom, 0, end))\n",
"        # bigwig gaps come back as NaN; treat them as zero signal\n",
"        vals[np.isnan(vals)] = 0\n",
"\n",
"        # variance-stabilizing transform BEFORE binning\n",
"        vals = np.arcsinh(vals)\n",
"\n",
"        # pad with zeros so the length is an exact multiple of 25\n",
"        vals = np.append(vals, np.zeros(nbins*25 - end))\n",
"\n",
"        # mean over each 25-bp bin\n",
"        vals = vals.reshape(nbins, 25).mean(axis=1)\n",
"        vals = vals.astype('float32')\n",
"    finally:\n",
"        bw.close()\n",
"\n",
"    return vals"
275 ]
276 },
277 {
278 "cell_type": "code",
279 "execution_count": 6,
280 "metadata": {},
281 "outputs": [],
282 "source": [
"def preproc(chrom, filepath):\n",
"    \"\"\"Bin one bigwig track and key it by (Cell_ID, Mark_ID).\n",
"\n",
"    IDs are parsed from the file name, e.g. 'C01M16.bigwig' -> ('C01', 'M16').\n",
"    \"\"\"\n",
"    binned = get_binvals(filepath, chrom)\n",
"\n",
"    basename = filepath.split(\"/\")[-1]\n",
"    cell_id, mark_id = basename[0:3], basename[3:6]\n",
"\n",
"    return {(cell_id, mark_id): binned}"
292 ]
293 },
294 {
295 "cell_type": "code",
296 "execution_count": 7,
297 "metadata": {},
298 "outputs": [],
299 "source": [
"# resolve the on-disk path for each training (T) / validation (V) track;\n",
"# any other DataType (e.g. blind-test 'B') is skipped\n",
"path_by_type = {\"T\": filepath_train, \"V\": filepath_valid}\n",
"\n",
"fnames = []\n",
"for _, row in df_meta.iterrows():\n",
"    base = path_by_type.get(row.DataType)\n",
"    if base is None:\n",
"        continue\n",
"    fnames.append(base + row.fileName)"
313 ]
314 },
315 {
316 "cell_type": "code",
317 "execution_count": null,
318 "metadata": {},
319 "outputs": [],
320 "source": [
"from multiprocess import Pool\n",
"\n",
"# bin every bigwig track in parallel; each worker returns a one-entry\n",
"# dict {(Cell_ID, Mark_ID): binned_values} merged in the next cell\n",
"pool = Pool(n_cpu) # use 5 for large genomes\n",
"\n",
"result = pool.map(lambda x: preproc(chrom, x), fnames)\n",
"\n",
"pool.close()\n",
"pool.join()"
328 ]
329 },
330 {
331 "cell_type": "code",
332 "execution_count": null,
333 "metadata": {},
334 "outputs": [],
335 "source": [
"# merge the per-file one-entry dicts into a single lookup table\n",
"# NOTE: a later cell does `del ... d`, so the loop variable must stay `d`\n",
"data = {}\n",
"for d in result:\n",
"    data.update(d)"
339 ]
340 },
341 {
342 "cell_type": "code",
343 "execution_count": null,
344 "metadata": {},
345 "outputs": [],
346 "source": [
347 "len(data)"
348 ]
349 },
350 {
351 "cell_type": "code",
352 "execution_count": null,
353 "metadata": {},
354 "outputs": [],
355 "source": [
356 "import pickle\n",
357 "\n",
358 "outpath = \"../../data/model_v3_data/\"\n",
359 "outname = \"dict_all_\" + chrom + \"_arcsinh.pickle\"\n",
360 "\n",
361 "with open(outpath + outname, 'wb') as handle:\n",
362 " pickle.dump(data, handle, \n",
363 " protocol=pickle.HIGHEST_PROTOCOL)\n",
364 " \n",
365 "del result, data, pool, d\n",
366 "gc.collect()"
367 ]
368 },
369 {
370 "cell_type": "markdown",
371 "metadata": {},
372 "source": [
373 "### 2. Average Assay"
374 ]
375 },
376 {
377 "cell_type": "code",
378 "execution_count": null,
379 "metadata": {},
380 "outputs": [],
381 "source": [
382 "from multiprocess import Pool\n",
383 "import pickle\n",
384 "\n",
385 "outpath = \"../../data/model_v3_data/\"\n",
386 "outname = \"dict_all_\" + chrom + \"_arcsinh.pickle\"\n",
387 "\n",
388 "with open(outpath + outname, 'rb') as handle:\n",
389 " data = pickle.load(handle)"
390 ]
391 },
392 {
393 "cell_type": "code",
394 "execution_count": null,
395 "metadata": {},
396 "outputs": [],
397 "source": [
398 "## get data_avg for same assay (for each cell type, the avg assay is the same)\n",
399 "indices = np.array([[celltype for celltype, _ in data.keys()],\n",
400 " [assay for _, assay in data.keys()]])\n",
401 "tracks = list(data.values())"
402 ]
403 },
404 {
405 "cell_type": "code",
406 "execution_count": null,
407 "metadata": {},
408 "outputs": [],
409 "source": [
"def vals2cat(v):\n",
"    \"\"\"Rank-transform values into 3 equal-size categories (0, 1, 2).\n",
"    \n",
"    Params:\n",
"    ========\n",
"    v: array-like of numeric values\n",
"    \n",
"    Return:\n",
"    ========\n",
"    np.array('int8') of category labels, same length as v\n",
"    \"\"\"\n",
"    vals = pd.Series(v)\n",
"    # method='first' breaks ties by position so qcut always gets unique edges\n",
"    vals = vals.rank(method='first') \n",
"    vals = pd.qcut(vals, 3, labels=[0, 1, 2])\n",
"    vals = np.array(vals)\n",
"    # BUG FIX: astype returns a NEW array -- the original discarded it,\n",
"    # so the int8 conversion never took effect\n",
"    vals = vals.astype('int8')\n",
"    \n",
"    return vals"
419 ]
420 },
421 {
422 "cell_type": "code",
423 "execution_count": null,
424 "metadata": {},
425 "outputs": [],
426 "source": [
"def get_avg(k, v):\n",
"    \"\"\"Average all tracks that share this track's assay.\n",
"    \n",
"    Params:\n",
"    ========\n",
"    k: tuple (Cell_ID, Mark_ID) key of the track\n",
"    v: unused; kept so pool.map can pass data.items() uniformly\n",
"    \n",
"    Return:\n",
"    ========\n",
"    {k: averaged_track} if at least one OTHER cell has this assay,\n",
"    otherwise None (the average would just be the track itself)\n",
"    \"\"\"\n",
"    a_id = k[1]\n",
"    \n",
"    # boolean mask over all tracks with the same assay (includes k itself)\n",
"    same_assay = (indices[1,:] == a_id)\n",
"    track_avg = np.zeros(len(tracks[0]), dtype=np.float32)\n",
"    n_cells = 0\n",
"    \n",
"    # accumulate sequentially (keeps float32 summation order deterministic)\n",
"    for j, is_same in enumerate(same_assay):\n",
"        if is_same:\n",
"            track_avg += tracks[j]\n",
"            n_cells += 1\n",
"    \n",
"    if n_cells > 1:\n",
"        return {k: track_avg / n_cells}\n",
"    return None"
449 ]
450 },
451 {
452 "cell_type": "code",
453 "execution_count": null,
454 "metadata": {},
455 "outputs": [],
456 "source": [
"def get_avg_cat(k, v):\n",
"    \"\"\"Same-assay average track, discretized into 3 categories.\n",
"    \n",
"    Delegates the averaging to get_avg() (defined above) instead of\n",
"    duplicating its loop, then converts the result with vals2cat().\n",
"    Returns None exactly when get_avg() does (no other cell shares\n",
"    this assay).\n",
"    \"\"\"\n",
"    avg = get_avg(k, v)\n",
"    if avg is None:\n",
"        return None\n",
"    return {k: vals2cat(avg[k])}"
479 ]
480 },
481 {
482 "cell_type": "code",
483 "execution_count": null,
484 "metadata": {},
485 "outputs": [],
486 "source": [
"def get_all_cat(k, v):\n",
"    \"\"\"Discretize a single track into 3 categories, keyed by k.\"\"\"\n",
"    return {k: vals2cat(v)}"
489 ]
490 },
491 {
492 "cell_type": "markdown",
493 "metadata": {},
494 "source": [
495 "### Calculate Average"
496 ]
497 },
498 {
499 "cell_type": "code",
500 "execution_count": null,
501 "metadata": {},
502 "outputs": [],
503 "source": [
504 "pool = Pool(n_cpu) # use 5 for large genomes\n",
505 "\n",
506 "result = pool.map(lambda x: get_avg(x[0], x[1]), data.items())\n",
507 "\n",
508 "pool.close()\n",
509 "pool.join()"
510 ]
511 },
512 {
513 "cell_type": "code",
514 "execution_count": null,
515 "metadata": {},
516 "outputs": [],
517 "source": [
"# collect the averaged tracks; workers return None for assays with\n",
"# no other cell, so skip those (loop var `d` is deleted in a later cell)\n",
"data_avg = {}\n",
"for d in result:\n",
"    if d is not None:\n",
"        data_avg.update(d)"
522 ]
523 },
524 {
525 "cell_type": "code",
526 "execution_count": null,
527 "metadata": {},
528 "outputs": [],
529 "source": [
530 "outpath = \"../../data/model_v3_data/\"\n",
531 "outname_avg = \"dict_avg_\" + chrom + \"_arcsinh.pickle\"\n",
532 "\n",
533 "with open(outpath + outname_avg, 'wb') as handle1:\n",
534 " pickle.dump(data_avg, handle1, \n",
535 " protocol=pickle.HIGHEST_PROTOCOL)\n",
536 "\n",
537 "del result, data_avg, pool, d\n",
538 "gc.collect()"
539 ]
540 },
541 {
542 "cell_type": "markdown",
543 "metadata": {},
544 "source": [
545 "### Calculate Average-3 Categories"
546 ]
547 },
548 {
549 "cell_type": "code",
550 "execution_count": null,
551 "metadata": {},
552 "outputs": [],
553 "source": [
554 "pool = Pool(n_cpu) # use 5 for large genomes\n",
555 "\n",
556 "result = pool.map(lambda x: get_avg_cat(x[0], x[1]), data.items())\n",
557 "\n",
558 "pool.close()\n",
559 "pool.join()"
560 ]
561 },
562 {
563 "cell_type": "code",
564 "execution_count": null,
565 "metadata": {},
566 "outputs": [],
567 "source": [
"# collect the categorized average tracks; None entries are assays\n",
"# with no other cell (loop var `d` is deleted in a later cell)\n",
"data_avg_cat = {}\n",
"for d in result:\n",
"    if d is not None:\n",
"        data_avg_cat.update(d)"
572 ]
573 },
574 {
575 "cell_type": "code",
576 "execution_count": null,
577 "metadata": {},
578 "outputs": [],
579 "source": [
580 "outpath = \"../../data/model_v3_data/\"\n",
581 "outname_avg_cat = \"dict_avg3cat_\" + chrom + \"_arcsinh.pickle\"\n",
582 " \n",
583 "with open(outpath + outname_avg_cat, 'wb') as handle2:\n",
584 " pickle.dump(data_avg_cat, handle2, \n",
585 " protocol=pickle.HIGHEST_PROTOCOL)\n",
586 "\n",
587 "del result, data_avg_cat, pool, d\n",
588 "gc.collect()"
589 ]
590 },
591 {
592 "cell_type": "markdown",
593 "metadata": {},
594 "source": [
595 "### Convert All to 3 Categories"
596 ]
597 },
598 {
599 "cell_type": "code",
600 "execution_count": null,
601 "metadata": {},
602 "outputs": [],
603 "source": [
604 "pool = Pool(n_cpu) # use 5 for large genomes\n",
605 "\n",
606 "result = pool.map(lambda x: get_all_cat(x[0], x[1]), data.items())\n",
607 "\n",
608 "pool.close()\n",
609 "pool.join()"
610 ]
611 },
612 {
613 "cell_type": "code",
614 "execution_count": null,
615 "metadata": {},
616 "outputs": [],
617 "source": [
"# collect all per-track categorized dicts; get_all_cat never returns\n",
"# None, the guard just mirrors the other merge cells\n",
"data_all_cat = {}\n",
"for d in result:\n",
"    if d is not None:\n",
"        data_all_cat.update(d)"
622 ]
623 },
624 {
625 "cell_type": "code",
626 "execution_count": null,
627 "metadata": {},
628 "outputs": [],
629 "source": [
630 "len(data_all_cat)"
631 ]
632 },
633 {
634 "cell_type": "code",
635 "execution_count": null,
636 "metadata": {},
637 "outputs": [],
638 "source": [
639 "outpath = \"../../data/model_v3_data/\"\n",
640 "outname_all_cat = \"dict_3cat_\" + chrom + \"_arcsinh.pickle\"\n",
641 " \n",
642 "with open(outpath + outname_all_cat, 'wb') as handle3:\n",
643 " pickle.dump(data_all_cat, handle3, \n",
644 " protocol=pickle.HIGHEST_PROTOCOL)\n",
645 "\n",
646 "del result, data_all_cat, pool, d\n",
647 "gc.collect()"
648 ]
649 },
650 {
651 "cell_type": "markdown",
652 "metadata": {},
653 "source": [
654 "## 3. Seq Motif\n",
655 "\n",
656 "2019/07/09 - switch to cismotif instead of cistrome"
657 ]
658 },
659 {
660 "cell_type": "raw",
661 "metadata": {},
662 "source": [
663 "import os\n",
664 "\n",
665 "#seq_path = \"../../data/sequence_data/cistrome_hg38/hg38_cistrome/\"\n",
666 "seq_path = \"../../data/sequence_data/cistrome_hg38/hg38_cismotifs/\"\n",
667 "fnames = os.listdir(seq_path)"
668 ]
669 },
670 {
671 "cell_type": "raw",
672 "metadata": {},
673 "source": [
"# build one record per motif file, then construct the frame once:\n",
"# DataFrame.append in a loop is quadratic and deprecated (removed in\n",
"# pandas >= 2.0)\n",
"records = []\n",
"for f in fnames:\n",
"\n",
"    name = f.split(\".\")\n",
"    records.append({\"motif\": name[0].split(\"_\")[0],\n",
"                    \"quality\": name[1],\n",
"                    \"filename\": f})\n",
"\n",
"df_cist = pd.DataFrame(records, columns=[\"motif\", \"quality\", \"filename\"])"
686 ]
687 },
688 {
689 "cell_type": "raw",
690 "metadata": {},
691 "source": [
692 "df_cist.sort_values([\"motif\", \"quality\"], inplace=True)\n",
693 "df_cist_unique = df_cist.groupby(\"motif\").first()\n",
694 "df_cist_unique.reset_index(inplace=True)\n",
695 "\n",
696 "print(df_cist_unique.shape)\n",
697 "df_cist_unique.head()"
698 ]
699 },
700 {
701 "cell_type": "markdown",
702 "metadata": {},
703 "source": [
704 "create a 2D array for sequence motif, \n",
705 "shape = (nbins, 599):\n",
706 "\n",
707 " bin1 = [0, 0, 0, 1, 1, 0, ...] (599 cols)\n",
708 " bin2 = [...]"
709 ]
710 },
711 {
712 "cell_type": "raw",
713 "metadata": {},
714 "source": [
715 "bw = pyBigWig.open(filepath_train + \"C01M16.bigwig\")\n",
716 "end = bw.chroms(chrom)\n",
717 "nbins = (-(-end // 25)) ## round-up\n",
718 "\n",
719 "bw.close()"
720 ]
721 },
722 {
723 "cell_type": "raw",
724 "metadata": {},
725 "source": [
"def get_motif(filename, chrom, nbins):\n",
"    \"\"\"For a given cistrome, get an array of flags (1) \n",
"    where motif exists. \n",
"    \n",
"    Params:\n",
"    ========\n",
"    filename: str\n",
"        cistrome filename (filepath is fixed)\n",
"    chrom: str. e.g. 'chr19'\n",
"    nbins: int. number of bins for that chrom\n",
"    \n",
"    Return:\n",
"    ========\n",
"    {motif_name: np.array (nbins,) of int8 flags}\n",
"    \"\"\"\n",
"    \n",
"    motif = filename.split(\".\")[0].split(\"_\")[0]\n",
"    \n",
"    df_motif = pd.read_csv(seq_path+filename, \n",
"                           sep = \"\\t\", \n",
"                           header = None,\n",
"                           names = [\"chr\", \"start\", \"end\", \"confidence\"])\n",
"    # NOTE: no sort needed -- flags are set idempotently per interval;\n",
"    # the original sort_values(inplace=True) on this filtered slice also\n",
"    # triggered SettingWithCopyWarning\n",
"    df_motif = df_motif[df_motif.chr == chrom]\n",
"    \n",
"    motif_bin = np.zeros(nbins, dtype=\"int8\")\n",
"\n",
"    # conservative range: flag only bins fully WITHIN the motif [s, e).\n",
"    # Bin i spans [25*i, 25*i + 25), so the contained bins are\n",
"    # i in [ceil(s/25), (e-25)//25]. Direct slicing is O(1) per motif,\n",
"    # replacing the original O(n_motifs * nbins) boolean masks.\n",
"    for s, e in zip(df_motif[\"start\"].values, df_motif[\"end\"].values):\n",
"        lo = max(-(-s // 25), 0)\n",
"        hi = min((e - 25) // 25, nbins - 1)\n",
"        if hi >= lo:\n",
"            motif_bin[lo:hi + 1] = 1\n",
"\n",
"    print(sum(motif_bin == 1), \"bins have motif -\", filename)\n",
"    \n",
"    return {motif: motif_bin}"
777 ]
778 },
779 {
780 "cell_type": "raw",
781 "metadata": {},
782 "source": [
783 "# choose only quality 'A' motifs\n",
784 "idx_A = df_cist_unique.quality == \"A\"\n",
785 "\n",
786 "motif_files = df_cist_unique.filename.values\n",
787 "motif_files = motif_files[idx_A]"
788 ]
789 },
790 {
791 "cell_type": "raw",
792 "metadata": {},
793 "source": [
794 "len(motif_files)"
795 ]
796 },
797 {
798 "cell_type": "raw",
799 "metadata": {},
800 "source": [
801 "from multiprocess import Pool"
802 ]
803 },
804 {
805 "cell_type": "raw",
806 "metadata": {
807 "scrolled": true
808 },
809 "source": [
810 "pool = Pool(10)\n",
811 "\n",
812 "result = pool.map(lambda x: get_motif(x, chrom, nbins), motif_files)\n",
813 "\n",
814 "pool.close()\n",
815 "pool.join()"
816 ]
817 },
818 {
819 "cell_type": "raw",
820 "metadata": {},
821 "source": [
822 "data_motif = {}\n",
823 "for d in result:\n",
824 " data_motif.update(d)\n",
825 " \n",
826 "data_motif = pd.DataFrame(data_motif, dtype='int8')\n",
827 "data_motif.head()"
828 ]
829 },
830 {
831 "cell_type": "raw",
832 "metadata": {},
833 "source": [
834 "import pickle\n",
835 "\n",
836 "outpath = \"../../data/model_v3_data/\"\n",
837 "outname = \"arry_seq_\" + chrom + \"_cismotif_A.pickle\"\n",
838 "\n",
839 "with open(outpath + outname, 'wb') as handle:\n",
840 " pickle.dump(data_motif, handle, \n",
841 " protocol=pickle.HIGHEST_PROTOCOL)"
842 ]
843 },
844 {
845 "cell_type": "raw",
846 "metadata": {
847 "scrolled": true
848 },
849 "source": [
850 "#data_motif[data_motif.AP2A.values == 1]"
851 ]
852 },
853 {
854 "cell_type": "markdown",
855 "metadata": {},
856 "source": [
857 "## 4. Identify bins with no signals"
858 ]
859 },
860 {
861 "cell_type": "raw",
862 "metadata": {},
863 "source": [
864 "import pickle\n",
865 "\n",
866 "outpath = \"../../data/model_v3_data/\"\n",
867 "outname = \"dict_all_\" + chrom + \"_arcsinh.pickle\"\n",
868 "\n",
869 "with open(outpath + outname, 'rb') as handle:\n",
870 " data = pickle.load(handle)"
871 ]
872 },
873 {
874 "cell_type": "raw",
875 "metadata": {},
876 "source": [
877 "tracks = list(data.values())\n",
878 "tracks = np.array(tracks)\n",
879 "tracks_var = tracks.var(axis=0)"
880 ]
881 },
882 {
883 "cell_type": "raw",
884 "metadata": {},
885 "source": [
886 "min_cutoff = 0.025\n",
887 "idx_masked = tracks_var < min_cutoff\n",
888 "\n",
889 "print(f\"average track has med var: {np.median(tracks_var):1.4}\")\n",
890 "print(f\"average track has min var: {tracks_var.min():1.4}\")\n",
891 "print(f\"average track has max var: {tracks_var.max():1.4}\\n\")\n",
892 "print(f\"variance lower than {min_cutoff:0.4} will be removed\")\n",
893 "print(f\"{sum(idx_masked):,}/{len(tracks_var):,} bins will be removed.\")"
894 ]
895 },
896 {
897 "cell_type": "raw",
898 "metadata": {},
899 "source": [
900 "outpath = \"../../data/model_v3_data/\"\n",
901 "outname = f\"arry_seq_{chrom}_mask_{str(min_cutoff)}.pickle\"\n",
902 "\n",
903 "with open(outpath + outname, 'wb') as handle:\n",
904 " pickle.dump(idx_masked, handle, \n",
905 " protocol=pickle.HIGHEST_PROTOCOL)"
906 ]
907 },
908 {
909 "cell_type": "code",
910 "execution_count": null,
911 "metadata": {},
912 "outputs": [],
913 "source": []
914 }
915 ],
916 "metadata": {
917 "kernelspec": {
918 "display_name": "Python 3",
919 "language": "python",
920 "name": "python3"
921 },
922 "language_info": {
923 "codemirror_mode": {
924 "name": "ipython",
925 "version": 3
926 },
927 "file_extension": ".py",
928 "mimetype": "text/x-python",
929 "name": "python",
930 "nbconvert_exporter": "python",
931 "pygments_lexer": "ipython3",
932 "version": "3.6.8"
933 }
934 },
935 "nbformat": 4,
936 "nbformat_minor": 2
937}