-
Notifications
You must be signed in to change notification settings - Fork 1
/
RM_TRIPS.R
241 lines (156 loc) · 9.09 KB
/
RM_TRIPS.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
#RM_TRIPS
#Author = Chris Butler
#Email = c.butler@uea.ac.uk
#Date = 22/08/19
rm(list = ls())
### set up inputs
i <- '/Users/chrisbutler/Documents/technical_paper/RepeatMasker_Final/no_asterisk' #directory where .out file is located
j <- "noasteriskSample2.fasta.out" #set name of file
k <- '/Users/chrisbutler/Documents/technical_paper' #directory where the repeatmasker library is found (.lib/fasta file)
l <- 'RepeatMasker.lib' #set name of .lib file
#LOAD PACKAGES - ensure these are installed beforehand
library(dplyr)
library(ggplot2)
library(biomartr)
library(seqinr)
#SET UP FUNCTIONS
read_rm1 <- function(file) {
rm_file <- readr::read_lines(file = file, skip = 2) ##### changed from skip = 3 as this removed the first entry
rm_file <- lapply(rm_file, function(x) {
str.res <- unlist(stringr::str_split(x, "\\s+"))[-1]
str.res <- str.res[1:14]
return(str.res)
})
rm_file <- tibble::as_tibble(do.call(rbind, rm_file))
colnames(rm_file) <- c(
"sw_score",
"perc_div",
"perc_del",
"perc_insert",
"qry_id",
"qry_start",
"qry_end",
"qry_left",
"matching_repeat",
"repeat_id",
"matching_class",
"no_bp_in_complement",
"in_repeat_start",
"in_repeat_end"
)
qry_end <- qry_start <- NULL
nrow_before_filtering <- nrow(rm_file)
suppressWarnings(rm_file <- dplyr::mutate(rm_file,
qry_start = as.integer(qry_start),
qry_end = as.integer(qry_end)))
rm_file <-
dplyr::filter(
rm_file,
!is.na(qry_start),
!is.na(qry_end)
)
rm_file <-
dplyr::mutate(
rm_file,
qry_width = as.integer(qry_end - qry_start + 1L))
nrow_after_filtering <- nrow(rm_file)
if ((nrow_before_filtering - nrow_after_filtering) > 0)
message((nrow_before_filtering - nrow_after_filtering) + 1 , " out of ",nrow_before_filtering," rows ~ ", round(((nrow_before_filtering - nrow_after_filtering) + 1) / nrow_before_filtering, 3) , "% were removed from the imported RepeatMasker file, ",
"because they contained 'NA' values in either 'qry_start' or 'qry_end'.")
return(rm_file)
}
###load input files
setwd(i)
RM <- read_rm1(j)
setwd(k)
TEsequence <- read.fasta(file = l, seqtype = c("DNA"))
##########
#FILTERING STEPS
##########
# 1) Remove simple repeats
noisonosim <- RM
noisonosim <- noisonosim %>% filter(matching_class != "Simple_repeat") #no simple repeats
noisonosim <- noisonosim %>% filter(matching_class != "Low_complexity") #no low complexity repeats
noisonosim <- noisonosim %>% filter(matching_class != "Satellite") #no satellites
noisonosim <- noisonosim %>% filter(matching_class != "rRNA") #no rRNA
noisonosim <- noisonosim %>% filter(matching_class != "snRNA") #no snRNA
noisonosim <- noisonosim %>% filter(matching_class != "tRNA") #no tRNA
noisonosim <- noisonosim %>% filter(matching_class != "ARTEFACT") #no artefacts
# 2) Merge fragments
# Extract the length of every reference sequence and store the results in a vector.
seq_length <- rep(NA,length(TEsequence))
for (f in 1:length(TEsequence)) {
seq_length[[f]] <- summary(TEsequence[[f]])$length
}
seq_length <- as.data.frame(seq_length)
seq_length$repeat_id <- attr(TEsequence, "name")
seq_length$repeat_id <- gsub('\\#.*','', seq_length$repeat_id) #tidies up repbase naming conventions
#merge
test <- merge(noisonosim, seq_length, by = "repeat_id")
test <- dplyr::rename(test, reference_length = seq_length)
test <- test %>% mutate(., Query_Length = ((qry_end - qry_start) + 1)) #overall length of fragment
test <- test %>% group_by(repeat_id, matching_repeat, qry_id) %>% mutate(., lowextremety = min(qry_start)) #the earliest the fragment appears in the scaffold
test <- test %>% group_by(repeat_id, qry_id, matching_repeat) %>% mutate(., highextremety = max(qry_end)) #the latest the fragment appears in the scaffold
test <- mutate(test, mergedfraglength = ((highextremety - lowextremety)+1)) #overall merged fragment length
test$length_check <- ifelse(test$mergedfraglength <= test$reference_length, "YES", "NO")
test1 <- filter(test, length_check == "YES") #put all cases where the merged fragment length is less than the refbase entry into one database
test2 <- filter(test, length_check == "NO") #put all cases where the merged fragment length is longer than the refbase entry into one database
#####test2#####
#next snippet of code keeps merged elements that should have been merged seperate
test2 <- test2 %>% dplyr::select(-c('lowextremety', 'highextremety', 'mergedfraglength'))
test2 <- rename(test2, lowextremety = qry_start)
test2 <- rename(test2, highextremety = qry_end)
test2 <- rename(test2, mergedfraglength = qry_width)
##########################
noisonosim <- rbind(test1, test2) #merge
noisonosim <- noisonosim %>% dplyr::select(-c(Query_Length, qry_width, in_repeat_start, in_repeat_end, no_bp_in_complement, qry_start, qry_end, qry_left, sw_score)) #remove all these columns
noisonosim$perc_del <- as.numeric(noisonosim$perc_del) #ensure these column are numeric
noisonosim$perc_div <- as.numeric(noisonosim$perc_div)
noisonosim$perc_insert <- as.numeric(noisonosim$perc_insert)
#calculate a new mean perc del across the novel merged fragments copy
noisonosim <- noisonosim %>% group_by(qry_id, repeat_id, matching_repeat, mergedfraglength) %>% mutate(., Copy_perc_div = mean(perc_div))
noisonosim <- noisonosim %>% group_by(qry_id, repeat_id, matching_repeat, mergedfraglength) %>% mutate(., Copy_perc_del = mean(perc_del))
noisonosim <- noisonosim %>% group_by(qry_id, repeat_id, matching_repeat, mergedfraglength) %>% mutate(., Copy_perc_insert = mean(perc_insert))
#remove old 'perc del' values
noisonosim <- noisonosim %>% dplyr::select(-c(perc_div, perc_del, perc_insert))
noisonosim <- dplyr::rename(noisonosim, perc_div = Copy_perc_div) #rename
noisonosim <- dplyr::rename(noisonosim, perc_del = Copy_perc_del)
noisonosim <- dplyr::rename(noisonosim, perc_insert = Copy_perc_insert)
noisonosim <- unique(noisonosim) #this step ensures that what was two fragments now merged is only represented once
# 3)Remove TEs found in different isoforms of the same "gene"
#########NOTE - this step is not needed when working with genome data##############
noisonosim$Gene <- noisonosim$qry_id #make duplicated transcript file
noisonosim$Gene <- sub("_[^_]+$","", noisonosim$Gene) #removes isoform identifier
noisonosim$isoform <- noisonosim$qry_id #make another duplicated transcript file
noisonosim$isoform <- gsub(".*i","", noisonosim$isoform) #removes isoform identifier
#for a given TE how many isoforms is it found on
noisonosim <- noisonosim %>% group_by(Gene, repeat_id, matching_repeat) %>% mutate(isoform_number = n_distinct(isoform))
check <- filter(noisonosim, isoform_number == 1) #if TE is only found in one isoform then no worry
#What about TEs found in multiple isoforms?
check2 <- filter(noisonosim, isoform_number > 1)
#only keep isoform which has the most TE content
check2 <- check2 %>% group_by(repeat_id, qry_id, matching_repeat) %>% mutate(totalTElength_pertranscript = sum(mergedfraglength)) #total TE content per element per transcript (ie if fragments were merged)
check2 <- check2 %>% ungroup() %>% group_by(repeat_id, Gene, matching_repeat) %>% filter(.,totalTElength_pertranscript ==max(totalTElength_pertranscript))
#however - this will correctly keep two TE fragments on the same isoform but incorrectly keep TEs on different isoforms if they have the same merged length (common)
check2 <- check2 %>% group_by(Gene, repeat_id, matching_repeat) %>% mutate(isoform_number_two = n_distinct(isoform)) #are TEs on same isoform or not?
check3 <- check2 %>% ungroup() %>% filter(isoform_number_two > 1) #check 3 contains those incorrect - found on two different isoforms
check2 <- check2 %>% filter(isoform_number_two == 1) #check 2 contains those correct - fragments found on same isoform
check2 <- check2 %>% dplyr::select(-c(isoform_number_two, totalTElength_pertranscript)) #tidy
check3 <- check3 %>% distinct(Gene, repeat_id, matching_repeat, .keep_all = TRUE) #only keep one instance (at random) if TEs on different isoforms have same merged frag length
#distinct (only keep one entry) if sample, gene, repeat and matching repeat all identical
check3 <- check3 %>% dplyr::select(-c(isoform_number_two, totalTElength_pertranscript)) #tidy
#time to merge those TEs found in only one isoform (check), largest TEs containing isoform if found in multiple isoforms (keeping both fragments on same isoform if necessary) (check2), or if no largest TE containing isoform one entry kept at random
check <-as.data.frame(check)
check2 <- as.data.frame(check2)
check3 <- as.data.frame(check3)
finalcheck <- rbind(check, check2, check3)
noisonosim <- as.data.frame(finalcheck)
# 4) remove fragments less than 80bp
noisonosim <- filter(noisonosim, mergedfraglength >= 80)
# 5) Write output file
#quick clean up
noisonosim <- noisonosim %>% dplyr::select(-c(length_check, isoform_number))
noisonosim <- rename(noisonosim, merged_qrystart = lowextremety)
noisonosim <- rename(noisonosim, merged_qryend = highextremety)
setwd(i)
write.csv(noisonosim, file = paste0(j, "_RM_TRIPS.csv"))