-
Notifications
You must be signed in to change notification settings - Fork 0
/
process_seq.py
134 lines (122 loc) · 4.51 KB
/
process_seq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
def revcomp(seq):
comps = {'A': 'T', 'G':'C', 'T':'A', 'C':'G'}
return ''.join(comps[char] for char in seq[::-1])
def get_seq(gff):
gff = open(gff, 'r')
line = gff.readline()
while line[0] != '>':
line = gff.readline()
chrom = line[1:].strip()
seq = {chrom:''}
for line in gff:
line = line.strip()
if line[0] == '>':
chrom = line[1:]
seq[chrom] = ''
else:
seq[chrom] += line.upper()
gff.close()
return seq
def get_splice_sites(gff):
gff = open(gff, 'r')
line = gff.readline()
while line[0] == '#': line = gff.readline()
line = line.strip().split()
fives, threes = [], []
while len(line) > 7:
chrom, source, feature, start, end, a, strand = line[:7]
if feature == 'intron':
if strand == '+':
five, three = int(start)-1, int(end)
else:
three, five = int(start)-1, int(end)
fives += [(chrom, five, strand)]
threes += [(chrom, three, strand)]
line = gff.readline().strip().split()
gff.close()
return fives, threes
def get_splice_site_seqs(ss, up_len, down_len, seq):
seqs = []
for chrom, site, strand in ss:
if strand == '+':
seqs += [seq[chrom][site - up_len: site + down_len]]
else:
seqs += [revcomp(seq[chrom][site - down_len: site + up_len])]
return seqs
def get_pwm(seqs):
counts = [{'A':0, 'T':0, 'C':0, 'G':0} for _ in range(len(seqs[0]))]
for seq in seqs:
assert len(seq) == len(counts)
for i, char in enumerate(seq):
counts[i][char] += 1
for d in range(len(counts)):
total = float(sum(counts[d].values()))
for key, val in counts[d].items():
counts[d][key] = val / total
return counts
def pwms(f_up, f_down, t_up, t_down):
gff = 'saccharomyces_cerevisiae.gff'
seq = get_seq(gff)
fives, threes = get_splice_sites(gff)
five_seqs = get_splice_site_seqs(fives, f_up, f_down, seq)
three_seqs = get_splice_site_seqs(threes, t_up, t_down, seq)
five_seqs = filter(lambda x: x[3:5] == 'GT', five_seqs)
three_seqs = filter(lambda x: x[13:15] == 'AG', three_seqs)
five_pwm = get_pwm(five_seqs)
three_pwm = get_pwm(three_seqs)
return five_pwm, three_pwm
def get_introns(gff):
gff = open(gff, 'r')
line = gff.readline()
while line[0] == '#': line = gff.readline()
line = line.strip().split()
introns = {}
while len(line) > 7:
chrom, source, feature, start, end, a, strand = line[:7]
if feature == 'intron':
if chrom not in introns: introns[chrom] = []
introns[chrom] += [(start, end, strand, 0, float('inf'))]
line = gff.readline().strip().split()
gff.close()
return introns
def get_flanks(introns, gff):
gff = open(gff, 'r')
line = gff.readline()
while line[0] == '#': line = gff.readline()
line = line.strip().split()
while len(line) > 7:
chrom, source, feature, start, end, a, strand = line[:7]
if feature == 'CDS':
for i, intron in enumerate(introns[chrom]):
if strand == intron[2]:
if int(start) == int(intron[1])+1:
introns[chrom][i] = (intron[0], intron[1], intron[2], intron[3], min(intron[4], int(end)))
elif int(end) == int(intron[0])-1:
introns[chrom][i] = (intron[0], intron[1], intron[2], max(intron[3], int(start)), intron[4])
line = gff.readline().strip().split()
gff.close()
return introns
def intron_seqs(introns, seqs):
out = []
for chrom in introns:
for intron in introns[chrom]:
if intron[3] == 0 or intron[4] == float('inf'): continue
start, end, strand, begin, stop = intron
start, end, begin, stop = int(start), int(end), int(begin), int(stop)
if strand == '+':
seq = seqs[chrom][begin:stop]
five = start - begin - 1
three = end - begin
else:
seq = revcomp(seqs[chrom][begin: stop])
five = stop - end
three = stop - start + 1
if seq[five: five+2] != 'GT' or seq[three-2: three] != 'AG': continue
out += [(seq, five, three)]
return out
def get_genes():
gff = 'saccharomyces_cerevisiae.gff'
seq = get_seq(gff)
introns = get_introns(gff)
introns = get_flanks(introns, gff)
return intron_seqs(introns, seq)