· 6 years ago · Aug 23, 2019, 09:42 AM
1#!/usr/bin/env python3
2'''
3gtf2bed.py converts GTF file to BED file.
4Usage: gtf2bed.py {OPTIONS} [.GTF file]
5
6History
7 Nov.5th 2012:
8 1. Allow conversion from general GTF files (instead of only Cufflinks supports).
9 2. If multiple identical transcript_id exist, transcript_id will be appended a string like "_DUP#" to separate.
10'''
11
12import sys;
13import re;
14
15if len(sys.argv)<2:
16 print('This script converts .GTF into .BED annotations.\n');
17 print('Usage: gtf2bed {OPTIONS} [.GTF file]\n');
18 print('Options:');
19 print('-c color\tSpecify the color of the track. This is a RGB value represented as "r,g,b". Default 255,0,0 (red)');
20 print('\nNote:');
21 print('1\tOnly "exon" and "transcript" are recognized in the feature field (3rd field).');
22 print('2\tIn the attribute list of .GTF file, the script tries to find "gene_id", "transcript_id" and "FPKM" attribute, and convert them as name and score field in .BED file.');
23 print('Author: Wei Li (li.david.wei AT gmail.com)');
24 sys.exit();
25
26color='255,0,0'
27
28
29for i in range(len(sys.argv)):
30 if sys.argv[i]=='-c':
31 color=sys.argv[i+1];
32
33
34allids={};
35
36def printbedline(estart,eend,field,nline):
37 try:
38 estp=estart[0]-1;
39 eedp=eend[-1];
40 # use regular expression to get transcript_id, gene_id and expression level
41 geneid=re.findall(r'gene_id \"([\w\.]+)\"',field[8])
42 transid=re.findall(r'transcript_id \"([\w\.]+)\"',field[8])
43 fpkmval=re.findall(r'FPKM \"([\d\.]+)\"',field[8])
44 if len(geneid)==0:
45 print('Warning: no gene_id field ',file=sys.stderr);
46 else:
47 geneid=geneid[0];
48 if len(transid)==0:
49 print('Warning: no transcript_id field',file=sys.stderr);
50 transid='Trans_'+str(nline);
51 else:
52 transid=transid[0];
53 if transid in allids.keys():
54 transid2=transid+'_DUP'+str(allids[transid]);
55 allids[transid]=allids[transid]+1;
56 transid=transid2;
57 else:
58 allids[transid]=1;
59 if len(fpkmval)==0:
60 #print('Warning: no FPKM field',file=sys.stderr);
61 fpkmval='100';
62 else:
63 fpkmval=fpkmval[0];
64 fpkmint=round(float(fpkmval));
65 print(field[0]+'\t'+str(estp)+'\t'+str(eedp)+'\t'+transid+'\t'+str(fpkmint)+'\t'+field[6]+'\t'+str(estp)+'\t'+str(eedp)+'\t'+color+'\t'+str(len(estart))+'\t',end='');
66 seglen=[eend[i]-estart[i]+1 for i in range(len(estart))];
67 segstart=[estart[i]-estart[0] for i in range(len(estart))];
68 strl=str(seglen[0]);
69 for i in range(1,len(seglen)):
70 strl+=','+str(seglen[i]);
71 strs=str(segstart[0]);
72 for i in range(1,len(segstart)):
73 strs+=','+str(segstart[i]);
74 print(strl+'\t'+strs);
75 except ValueError:
76 print('Error: non-number fields at line '+str(nline),file=sys.stderr);
77
78
79
80
81
82
83estart=[];
84eend=[];
85# read lines one to one
86nline=0;
87prevfield=[];
88prevtransid='';
89for lines in open(sys.argv[-1]):
90 field=lines.strip().split('\t');
91 nline=nline+1;
92 if len(field)<9:
93 print('Error: the GTF should has at least 9 fields at line '+str(nline),file=sys.stderr);
94 continue;
95 if field[1]!='Cufflinks':
96 pass;
97 #print('Warning: the second field is expected to be \'Cufflinks\' at line '+str(nline),file=sys.stderr);
98 if field[2]!='exon' and field[2] !='transcript':
99 #print('Error: the third filed is expected to be \'exon\' or \'transcript\' at line '+str(nline),file=sys.stderr);
100 continue;
101 transid=re.findall(r'transcript_id \"([\w\.]+)\"',field[8]);
102 if len(transid)>0:
103 transid=transid[0];
104 else:
105 transid='';
106 if field[2]=='transcript' or (prevtransid != '' and transid!='' and transid != prevtransid):
107 #print('prev:'+prevtransid+', current:'+transid);
108 # A new transcript record, write
109 if len(estart)!=0:
110 printbedline(estart,eend,prevfield,nline);
111 estart=[];
112 eend=[];
113 prevfield=field;
114 prevtransid=transid;
115 if field[2]=='exon':
116 try:
117 est=int(field[3]);
118 eed=int(field[4]);
119 estart+=[est];
120 eend+=[eed];
121 except ValueError:
122 print('Error: non-number fields at line '+str(nline),file=sys.stderr);
123# the last record
124if len(estart)!=0:
125 printbedline(estart,eend,field,nline);