-
Notifications
You must be signed in to change notification settings - Fork 7
/
create_pdb.pl
executable file
·374 lines (323 loc) · 10 KB
/
create_pdb.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
#!/usr/bin/perl
## Pombert Lab 2020
## Script identity; these three values are interpolated into the usage text
## below, and $name/$version are also written to the run log.
my $version = '0.3a';
my $name = 'create_pdb.pl';
my $updated = '2021-07-23';
use strict;
use warnings;
use Getopt::Long qw(GetOptions);
use File::Basename;
use threads;
use threads::shared;
## Usage definition: an interpolating heredoc, printed verbatim when the
## script is started without any command line argument.
my $USAGE = <<"OPTIONS";
NAME ${name}
VERSION ${version}
UPDATED ${updated}
SYNOPSIS Creates .pdb files with trRosetta from .npz files
REQUIREMENTS trRosetta scripts - https://yanglab.nankai.edu.cn/trRosetta/download/ ## trRosetta package (28M)
COMMAND ${name} \\
-c 10 \\
-n NPZ/ \\
-o PDB/ \\
-f FASTA_OL/ \\
-t /opt/trRosetta_scripts
NOTE: The -t option is not required if the environment variable TRROSETTA_SCRIPTS is set, e.g.:
export TRROSETTA_SCRIPTS=/opt/trRosetta_scripts
OPTIONS:
-c (--cpu) Number of cpu threads to use [Default: 10] ## i.e. runs n processes in parallel
-m (--memory) Memory available (in Gb) to threads [Default: 16]
-n (--npz) Folder containing .npz files
-o (--output) Output folder [Default: ./]
-f (--fasta) Folder containing the oneliner fasta files
-t (--trrosetta) trRosetta scripts directory (TRROSETTA_SCRIPTS)
-p (--python) Preferred Python interpreter [Default: python]
OPTIONS
## No arguments at all: show the usage text and stop.
die "\n$USAGE\n" unless @ARGV;
## Keep a copy of the raw arguments so the exact invocation can be written to
## the log later on (option parsing consumes the parsed entries of @ARGV).
my @commands = @ARGV;
## Defining options
## Defaults mirror the usage text; -n and -f have no default and are required.
my $npz_dir;
my $out = './';
my $trrosetta_scripts;
my $fasta;
my $threads = 10;
my $python = 'python';
my $memory = 16;

## GetOptions returns false when it meets an unknown option or a value of the
## wrong type; stop with the usage text instead of running half-configured.
GetOptions(
    'n|npz=s'       => \$npz_dir,
    'o|output=s'    => \$out,
    't|trrosetta=s' => \$trrosetta_scripts,
    'f|fasta=s'     => \$fasta,
    'c|cpu=i'       => \$threads,
    'p|python=s'    => \$python,
    'm|memory=i'    => \$memory
) or die "\n$USAGE\n";

## Without -n there is nothing to fold; without -f every trRosetta call would
## be handed a fasta path built from an undefined folder.
die "ERROR: the -n (--npz) option is required\n\n$USAGE\n" unless defined $npz_dir;
die "ERROR: the -f (--fasta) option is required\n\n$USAGE\n" unless defined $fasta;

## The thread pool and the memory budget are both derived from these numbers;
## zero or negative values would leave the script with no worker threads or
## with a budget that no file can ever fit in.
die "ERROR: -c (--cpu) must be at least 1\n" unless $threads >= 1;
die "ERROR: -m (--memory) must be greater than 0\n" unless $memory > 0;
### Checking for trRosetta scripts; environment variables in Perl are loaded in %ENV
## The -t option wins; otherwise fall back on $TRROSETTA_SCRIPTS.
if (!defined $trrosetta_scripts) {
    if (exists $ENV{'TRROSETTA_SCRIPTS'}) {
        $trrosetta_scripts = $ENV{'TRROSETTA_SCRIPTS'};
    }
    else {
        print "WARNING: The trRosetta scripts directory is not set as an environment variable (\$TRROSETTA_SCRIPTS) and the -t option was not entered.\n";
        print "Please check if trRosetta scripts were installed properly\n\n";
        ## Nothing can be folded without the scripts: report failure to the caller
        exit 1;
    }
}

## Validate the directory whichever way it was obtained (option or environment)
unless (-d $trrosetta_scripts) {
    die "WARNING: Can't find trRosetta scripts installation folder: $trrosetta_scripts. Please check the -t option or \$TRROSETTA_SCRIPTS\n\n";
}
## Load npz files into an array
## Every directory entry ending in .npz is queued as "$npz_dir/<entry>".
my @npz;
die "ERROR: no folder containing .npz files was provided (-n)\n" unless defined $npz_dir;

## Lexical directory handle, closed as soon as the listing has been read
opendir(my $npz_dh, $npz_dir) or die("Can't open $npz_dir: $!\n");
while (my $file = readdir($npz_dh)) {
    if ($file =~ /\.npz$/) {
        push(@npz, "$npz_dir/$file");
    }
}
closedir($npz_dh);
## Make sure the output folder exists before anything is written to it
if (! -d $out) {
    mkdir($out, 0755) or die "Can't create folder $out: $!\n";
}

## Open the run log (global LOG handle, also written to by the folding
## subroutines) and record the start time and the exact invocation
open(LOG, ">", "$out/create_pdb.log") or die "Can't create create_pdb.log in $out: $!\n";
my $time = `date`;
print LOG "$name version $version started on $time\n",
          "COMMANDS:\n",
          "$name @commands\n";
## One placeholder slot per requested worker thread; each slot is replaced by
## a real thread object when the workers are created
my @threads = initThreads();

## Work queues, shared between threads (use threads::shared;)
## @files       : npz files waiting for a multi-threaded fold
## @large_files : npz files set aside for the single-threaded pass
my @files :shared = @npz;
my @large_files :shared;

## Memory budget, in bytes: 0.0078125 (= 1/128) of the memory given with -m,
## used to cap the npz file sizes folded at the same time
my $max_file_memory :shared = ($memory * (10**9)) * 0.0078125;
## Same budget divided by 10**6, for display
my $max_file_memory_p = $max_file_memory / (10**6);
## Budget still available; starts out full
my $file_memory :shared = $max_file_memory;

## Progress bookkeeping read by the printout threads
my $total_files :shared = scalar(@files);    ## files in the current pass
my %running_processes :shared;               ## thread id => latest status line
my $running_threads :shared = 0;             ## workers that have not exited yet
my $folding_threads :shared = 0;             ## workers currently running trRosetta
my $completed :shared = 0;                   ## workers that have finished
my $start :shared;                           ## start time of the current single-threaded fold
my @output_pdb;

## Horizontal rule used in the log and on screen
my $buffer = '-' x 100;
## Create threads that run the exe subroutine
## Each placeholder in @threads is replaced by the worker thread object.
for my $thread (@threads) {
    $thread = threads->create(\&mt_exe);
}
## Companion thread that redraws the progress screen while the workers run
my $print_thread = threads->create(\&mt_po);

## Run until threads are done
for my $thread (@threads) {
    $thread->join();
}

## The printout thread stops once $completed equals $threads. Set that value
## explicitly (a worker that died never incremented it) and wait for the
## thread to finish BEFORE $completed is reused below, so that it can neither
## miss its stop condition nor be left running when the script exits.
{
    lock($completed);
    $completed = $threads;
}
$print_thread->join();

## Second pass: files set aside as too large are folded one at a time
$total_files = scalar(@large_files);
$completed = 0;
if (@large_files) {
    my $thr1 = threads->create(\&st_exe);
    my $thr2 = threads->create(\&st_po);
    $thr1->join();
    ## st_po polls for $completed == 1; make sure it sees it even if st_exe
    ## ended abnormally, otherwise the join below would never return
    {
        lock($completed);
        $completed = 1;
    }
    $thr2->join();
}

## End time
my $end = `date`;
print LOG "$name ended on $end\n";
close(LOG) or warn "Can't close create_pdb.log in $out: $!\n";
## Subroutines
sub initThreads {
    ## Build one placeholder per requested thread (the numbers 1 .. $threads).
    ## The list is empty when $threads is lower than 1.
    my @slots = (1 .. $threads);
    return @slots;
}
### Multi-thread folding function
## Worker body run by every thread of the pool. Repeatedly takes one npz file
## from the shared @files queue and, depending on its size and on the memory
## budget still available, folds it with trRosetta, puts it back in the queue,
## or hands it over to the single-threaded queue (@large_files).
##
## Locking rule: this sub and its helpers never hold more than one lock at a
## time, so they cannot deadlock against the printout thread whatever order it
## takes its own locks in.
##
## NOTE(review): a file whose fold fails is put back in the queue with no retry
## limit, and every worker redirects stderr to the same trRosetta.ERROR.log in
## the current directory; both are kept as they were.
sub mt_exe {
    ## Two-digit thread id, used as the key of this worker's status line
    my $id = sprintf("%02d", threads->tid());

    {
        lock($running_threads);
        $running_threads++;
    }

    ## While files remain to be folded
    PROCESS: while (1) {
        ## Take the next file. The emptiness test and the shift happen under
        ## the same lock: testing first and locking afterwards lets two
        ## workers race for the last file, and the loser shifts undef.
        my $npz;
        {
            lock(@files);
            $npz = shift(@files);
        }
        last PROCESS unless defined $npz;

        my ($npz_name) = fileparse($npz);
        my ($prefix, $evalue) = $npz_name =~ /^(\S+)\.(\S+)\.(\w+)$/;

        ## Names that do not look like <prefix>.<evalue>.<ext> cannot be mapped
        ## to a fasta file or to an output name; skip them instead of calling
        ## trRosetta with undefined path components
        unless (defined $prefix) {
            _mt_set_status($id, "Thread $id: Skipped $npz_name (unexpected file name) on ".localtime()."\n");
            print LOG "Can't parse prefix and evalue from $npz_name, moving to next npz...\n";
            next PROCESS;
        }

        my $pdb = "$out/$prefix.$evalue.pdb";
        if (-e $pdb) {
            print LOG "$pdb already exists, moving to next npz...\n";
            next PROCESS;
        }

        my $npz_size = (-s $npz) || 0;

        if ($npz_size >= $max_file_memory) {
            ## The file alone exceeds the whole budget: fold it in the single-threaded pass
            _mt_send_to_single_queue($id, $npz_name, $npz);
        }
        else {
            ## Reserve the file size in the shared budget. The test and the
            ## update are done under the same lock so that two workers cannot
            ## both conclude that the same remaining memory is theirs.
            my $reserved = 0;
            {
                lock($file_memory);
                if (($file_memory - $npz_size) > 0) {
                    $file_memory -= $npz_size;
                    $reserved = 1;
                }
            }

            if ($reserved) {
                _mt_set_status($id, "Thread $id: Folding $npz_name started on ".localtime()."\n");
                {
                    lock($folding_threads);
                    $folding_threads++;
                }

                ## Get start time, run process, and get stop time
                my $starttime = `date`;
                system "$python $trrosetta_scripts/trRosetta.py -npz $npz -fasta $fasta/$prefix.fasta -o $pdb 1> /dev/null 2> trRosetta.ERROR.log";
                my $endtime = `date`;
                my $folded = -e $pdb;

                {
                    lock($folding_threads);
                    $folding_threads--;
                }
                ## Give the reservation back before the file can be picked up again
                {
                    lock($file_memory);
                    $file_memory += $npz_size;
                }

                if ($folded) {
                    _mt_set_status($id, "Thread $id: Folding on $npz_name has completed.\n");
                    print LOG "\n$buffer\nFile $npz_name:\nStarted $starttime\nCompleted $endtime\n$buffer\n\n";
                }
                else {
                    ## If the file did not fold, push it back into the queue to try again
                    _mt_set_status($id, "Thread $id: Failed to fold $npz_name. Placing back in the queue on ".localtime()."\n");
                    _mt_requeue($npz);
                }
            }
            elsif ($npz_size > .5 * $max_file_memory) {
                ## Needs more than half of the total budget and does not fit
                ## right now: hand it over to the single-threaded pass
                _mt_send_to_single_queue($id, $npz_name, $npz);
            }
            else {
                _mt_set_status($id, "Thread $id: Not enough memory clearance to fold $npz_name. Placed back in the queue on ".localtime()."\n");
                _mt_requeue($npz);
            }
        }
        sleep(5);
    }

    ## The status line is written first and $completed last, because
    ## $completed is what tells the printout thread that it may stop
    _mt_set_status($id, "Thread $id: No more jobs to run. Exited on ".localtime()."\n");
    {
        lock($running_threads);
        $running_threads--;
    }
    {
        lock($completed);
        $completed++;
    }
    threads->exit();
}

## Replace the status line shown for worker $id on the progress screen.
sub _mt_set_status {
    my ($id, $message) = @_;
    lock(%running_processes);
    $running_processes{$id} = $message;
    return;
}

## Put an npz file back at the end of the multi-threaded queue.
sub _mt_requeue {
    my ($npz) = @_;
    lock(@files);
    push(@files, $npz);
    return;
}

## Move an npz file to the single-threaded queue and remove it from the total
## of the multi-threaded pass, so that both "too large" paths are accounted
## for in the same way by the progress display.
sub _mt_send_to_single_queue {
    my ($id, $npz_name, $npz) = @_;
    {
        lock($total_files);
        $total_files -= 1;
    }
    _mt_set_status($id, "Thread $id: $npz_name is too large for Multi-threading. Sent to Single-threaded queue on ".localtime()."\n");
    {
        lock(@large_files);
        push(@large_files, $npz);
    }
    return;
}
### Multi-thread printout
## Redraws the progress screen about every two seconds until the number of
## completed workers ($completed) equals the size of the pool ($threads).
##
## The shared state is copied one variable at a time, each under its own
## short-lived lock, and nothing is locked while drawing or sleeping: the
## workers are therefore never stalled by the display, and no lock ordering
## problem with them is possible.
sub mt_po {
    until ($completed == $threads) {
        ## Snapshot of the shared state
        my $queued;
        {
            lock(@files);
            $queued = scalar(@files);
        }
        my @status_lines;
        {
            lock(%running_processes);
            @status_lines = map { $running_processes{$_} } sort(keys(%running_processes));
        }
        chomp(@status_lines);
        my $running;
        {
            lock($running_threads);
            $running = $running_threads;
        }
        my $free_memory;
        {
            lock($file_memory);
            $free_memory = $file_memory;
        }
        my $folding = $folding_threads;
        my $total = $total_files;

        ## Percentage of the bar still pending. $total is 0 when there is no
        ## npz file at all, or once every file has been moved to the
        ## single-threaded queue: draw a full bar instead of dividing by zero.
        my $pending = $total > 0 ? int(($queued / $total) * 100) : 0;
        $pending = 100 if $pending > 100;
        $pending = 0 if $pending < 0;
        my $status = "[".("|" x (100 - $pending)).("." x $pending)."]";

        system "clear";
        print "\nFolding Proteins with Multi-threading with trRosetta - https://github.com/gjoni/trRosetta\n";
        print "\n\t$status\t".($total - $queued)."/$total";
        print "\n\n\tThreads Running:\t$running/$threads\n";
        print "\tThreads Folding:\t$folding/$threads\n";
        print "\tAvailable Memory:\t".sprintf("%.2f", ($free_memory / 1000000))."/".($max_file_memory / 1000000)." Mb\n\n\n";
        print "Thread Status:\n$buffer\n";
        foreach my $line (@status_lines) {
            print("$line\n");
        }
        print("\n\n");

        sleep(2);
    }
}
### Single-thread folding
## Folds the files of @large_files one after the other, logs the outcome of
## each fold, then sets $completed to 1 and exits the thread.
sub st_exe {
    while (my $large_npz = shift(@large_files)) {
        my ($file_name, $file_dir) = fileparse($large_npz);
        my ($prefix, $evalue) = $file_name =~ /^(\S+)\.(\S+)\.(\w+)$/;
        my $rule = "-" x 100;
        my $pdb = "$out/$prefix.$evalue.pdb";

        ## $0 holds the program name, so this test is always true here; the
        ## block only limits how long the lock on $start is held
        if ($0) {
            lock($start);
            $start = localtime();
        }

        ## Same command text as a multi-line string literal: every part is
        ## followed by a space, a backslash and a newline
        my $command = join(" \\\n",
            $python,
            "$trrosetta_scripts/trRosetta.py",
            "-npz $large_npz",
            "-fasta $fasta/$prefix.fasta",
            "-o $pdb",
            "1> /dev/null",
            "2> trRosetta.ERROR.log"
        );

        my $starttime = `date`;
        system $command;
        my $endtime = `date`;

        if (-e $pdb) {
            print LOG "\n$rule\nFile $file_name:\nStarted $starttime\nCompleted $endtime\n$rule\n\n";
        }
        else {
            print LOG "$pdb";
            print LOG "\n$rule\nMain thread has failed to fold file $file_name\n$rule\n\n";
        }
    }

    ## Tell the single-thread printout that the pass is over
    lock($completed);
    $completed = 1;
    threads->exit();
}
### Single-thread printout
## Redraws the progress bar of the single-threaded pass every two seconds
## until $completed has been set to 1, then exits the thread.
sub st_po {
    until ($completed == 1) {
        system "clear";

        ## Share of the bar (out of 100 characters) that is still pending
        my $pending = int((scalar(@large_files) / $total_files) * 100);
        my $status = "[".("|" x (100 - $pending)).("." x $pending)."]";

        print("Folding Proteins with Single-threading started on $start\n");
        print("\n\t$status\t".($total_files - scalar(@large_files))."/$total_files\n");

        sleep(2);
    }
    threads->exit();
}