line |
bran |
sub |
pod |
code |
1
|
|
|
|
package Sanger::CGP::TraFiC::Implement; |
2
|
|
|
|
|
3
|
|
|
|
##########LICENCE########## |
4
|
|
|
|
# Copyright (c) 2015 Genome Research Ltd. |
5
|
|
|
|
# |
6
|
|
|
|
# Author: Cancer Genome Project cgpit@sanger.ac.uk |
7
|
|
|
|
# |
8
|
|
|
|
# This file is part of TraFiC. |
9
|
|
|
|
# |
10
|
|
|
|
# TraFiC is free software: you can redistribute it and/or modify it under |
11
|
|
|
|
# the terms of the GNU Affero General Public License as published by the Free |
12
|
|
|
|
# Software Foundation; either version 3 of the License, or (at your option) any |
13
|
|
|
|
# later version. |
14
|
|
|
|
# |
15
|
|
|
|
# This program is distributed in the hope that it will be useful, but WITHOUT |
16
|
|
|
|
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
17
|
|
|
|
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
18
|
|
|
|
# details. |
19
|
|
|
|
# |
20
|
|
|
|
# You should have received a copy of the GNU Affero General Public License |
21
|
|
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>. |
22
|
|
|
|
# |
23
|
|
|
|
# 1. The usage of a range of years within a copyright statement contained within |
24
|
|
|
|
# this distribution should be interpreted as being equivalent to a list of years |
25
|
|
|
|
# including the first and last year specified and all consecutive years between |
26
|
|
|
|
# them. For example, a copyright statement that reads 'Copyright (c) 2005, 2007- |
27
|
|
|
|
# 2009, 2011-2012' should be interpreted as being identical to a statement that |
28
|
|
|
|
# reads 'Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012' and a copyright |
29
|
|
|
|
# statement that reads "Copyright (c) 2005-2012' should be interpreted as being |
30
|
|
|
|
# identical to a statement that reads 'Copyright (c) 2005, 2006, 2007, 2008, |
31
|
|
|
|
# 2009, 2010, 2011, 2012'." |
32
|
|
|
|
##########LICENCE########## |
33
|
|
|
|
|
34
|
|
1
|
|
use strict; |
35
|
|
1
|
|
use warnings FATAL => 'all'; |
36
|
|
1
|
|
use English qw( -no_match_vars ); |
37
|
|
1
|
|
use autodie qw(:all); |
38
|
|
1
|
|
use Const::Fast qw(const); |
39
|
|
1
|
|
use File::Which qw(which); |
40
|
|
1
|
|
use File::Path qw(make_path remove_tree); |
41
|
|
1
|
|
use File::Temp qw(tempfile); |
42
|
|
1
|
|
use FindBin qw($Bin); |
43
|
|
1
|
|
use File::Basename qw(fileparse); |
44
|
|
|
|
|
45
|
|
1
|
|
use Sanger::CGP::TraFiC; |
46
|
|
|
|
|
47
|
|
1
|
|
use PCAP::Threaded; |
48
|
|
1
|
|
use PCAP::Bam; |
49
|
|
|
|
|
50
|
|
|
|
const my $CAND_READS => q{ -c %s -o %s -i %s}; |
51
|
|
|
|
const my $CAND_ORPH => q{ -i %s}; |
52
|
|
|
|
const my $CAND_SPLIT => q{ -o %s -i %s}; |
53
|
|
|
|
const my $CLUSTER => q{ -r %s/rm/%%/%s.%%.out -f %s/split/%s.%% -m %s -o %s}; |
54
|
|
|
|
|
55
|
|
|
|
### the core work |
56
|
|
|
|
|
57
|
|
|
|
sub select_candidate_reads { |
58
|
|
0
|
0
|
my ($index, $options) = @_; |
59
|
0
|
|
|
return 1 if(exists $options->{'index'} && $index != $options->{'index'}); |
60
|
|
|
|
|
61
|
|
|
|
my $tmp = $options->{'tmp'}; |
62
|
0
|
|
|
return 1 if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), $index); |
63
|
|
|
|
|
64
|
|
|
|
my @inputs = ($options->{'tumour'}, $options->{'normal'}); |
65
|
|
|
|
my $iter = 1; |
66
|
|
|
|
for my $input(@inputs) { |
67
|
|
|
|
for my $chr(@{$options->{'ordered_seqs'}}) { |
68
|
0
|
|
|
next if($iter++ != $index); # skip to the relevant input in the list |
69
|
|
|
|
|
70
|
|
|
|
## build command for this index |
71
|
|
|
|
# |
72
|
|
|
|
|
73
|
|
|
|
my $sample = sanitised_sample_from_bam($input); |
74
|
|
|
|
my $output = File::Spec->catdir($tmp, 'candidates', $sample); |
75
|
0
|
|
|
make_path($output) unless(-e $output); |
76
|
|
|
|
|
77
|
|
|
|
my $command = "$^X "; |
78
|
|
|
|
$command .= _which('TraFiC_candReads.pl'); |
79
|
|
|
|
$command .= sprintf $CAND_READS, $chr, $output, $input; |
80
|
0
|
|
|
if(exists $options->{'badloci'}) { |
81
|
|
|
|
$command .= ' -e '.$options->{'badloci'}; |
82
|
|
|
|
} |
83
|
|
|
|
|
84
|
|
|
|
PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), $command, $index); |
85
|
|
|
|
|
86
|
|
|
|
# |
87
|
|
|
|
## The rest is auto-magical |
88
|
|
|
|
PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), $index); |
89
|
|
|
|
} |
90
|
|
|
|
} |
91
|
|
|
|
return 1; |
92
|
|
|
|
} |
93
|
|
|
|
|
94
|
|
|
|
sub merge_select_orphans { |
95
|
|
0
|
0
|
my ($index, $options) = @_; |
96
|
0
|
|
|
return 1 if(exists $options->{'index'} && $index != $options->{'index'}); |
97
|
|
|
|
|
98
|
|
|
|
my $tmp = $options->{'tmp'}; |
99
|
0
|
|
|
return 1 if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), $index); |
100
|
|
|
|
|
101
|
|
|
|
my @inputs = ($options->{'tumour'}, $options->{'normal'}); |
102
|
|
|
|
my $iter = 1; |
103
|
|
|
|
for my $input(@inputs) { |
104
|
0
|
|
|
next if($iter++ != $index); # skip to the relevant input in the list |
105
|
|
|
|
|
106
|
|
|
|
## build command for this index |
107
|
|
|
|
# |
108
|
|
|
|
|
109
|
|
|
|
my $sample = sanitised_sample_from_bam($input); |
110
|
|
|
|
my $inputdir = File::Spec->catdir($tmp, 'candidates', $sample); |
111
|
|
|
|
|
112
|
|
|
|
my $command = "$^X "; |
113
|
|
|
|
$command .= _which('TraFiC_candOrph.pl'); |
114
|
|
|
|
$command .= sprintf $CAND_ORPH, $inputdir; |
115
|
|
|
|
|
116
|
|
|
|
PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), $command, $index); |
117
|
|
|
|
|
118
|
|
|
|
# |
119
|
|
|
|
## The rest is auto-magical |
120
|
|
|
|
PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), $index); |
121
|
|
|
|
} |
122
|
|
|
|
return 1; |
123
|
|
|
|
} |
124
|
|
|
|
|
125
|
|
|
|
sub split_candidate_reads { |
126
|
|
0
|
0
|
my ($index, $options) = @_; |
127
|
0
|
|
|
return 1 if(exists $options->{'index'} && $index != $options->{'index'}); |
128
|
|
|
|
|
129
|
|
|
|
my $tmp = $options->{'tmp'}; |
130
|
0
|
|
|
return 1 if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), $index); |
131
|
|
|
|
|
132
|
|
|
|
my @inputs = ($options->{'tumour'}, $options->{'normal'}); |
133
|
|
|
|
my $iter = 1; |
134
|
|
|
|
for my $input(@inputs) { |
135
|
0
|
|
|
next if($iter++ != $index); # skip to the relevant input in the list |
136
|
|
|
|
|
137
|
|
|
|
## build command for this index |
138
|
|
|
|
# |
139
|
|
|
|
|
140
|
|
|
|
my $sample = sanitised_sample_from_bam($input); |
141
|
|
|
|
my $inputdir = File::Spec->catdir($tmp, 'candidates', $sample); |
142
|
|
|
|
my $outdir = File::Spec->catdir($tmp, 'split'); |
143
|
0
|
|
|
make_path($outdir) unless(-e $outdir); |
144
|
|
|
|
|
145
|
|
|
|
my $command = "$^X "; |
146
|
|
|
|
$command .= _which('TraFiC_candSplit.pl'); |
147
|
|
|
|
$command .= sprintf $CAND_SPLIT, File::Spec->catfile($outdir, $sample.'.'), $inputdir; |
148
|
|
|
|
|
149
|
|
|
|
PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), $command, $index); |
150
|
|
|
|
|
151
|
|
|
|
# |
152
|
|
|
|
## The rest is auto-magical |
153
|
|
|
|
PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), $index); |
154
|
|
|
|
} |
155
|
|
|
|
return 1; |
156
|
|
|
|
} |
157
|
|
|
|
|
158
|
|
|
|
sub repeat_mask_te { |
159
|
|
0
|
0
|
my ($index, $options) = @_; |
160
|
0
|
|
|
return 1 if(exists $options->{'index'} && $index != $options->{'index'}); |
161
|
|
|
|
my $tmp = $options->{'tmp'}; |
162
|
0
|
|
|
return 1 if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), $index); |
163
|
|
|
|
|
164
|
|
|
|
my $infile = $options->{'split_files'}->[$index-1]; |
165
|
|
|
|
|
166
|
|
|
|
# build command |
167
|
|
|
|
|
168
|
|
|
|
my $rm_outdir = File::Spec->catdir($tmp, 'rm', $index); |
169
|
0
|
|
|
if(-e $rm_outdir){ |
170
|
|
|
|
remove_tree($rm_outdir); # always cleanup any failed attempt first |
171
|
|
|
|
} |
172
|
|
|
|
make_path($rm_outdir); |
173
|
|
|
|
|
174
|
|
|
|
my $n_name = $options->{'normal_name'}; |
175
|
|
|
|
my $t_name = $options->{'tumour_name'}; |
176
|
|
|
|
my $file_sample = fileparse($infile); |
177
|
|
|
|
|
178
|
|
|
|
my $divergence; |
179
|
|
|
|
# files are name sorted so index won't line up |
180
|
0
|
|
|
if($file_sample =~ m/^$n_name[.][[:digit:]]+$/) { |
|
0
|
|
|
|
181
|
|
|
|
$divergence = $options->{'ndiv'}; |
182
|
|
|
|
} |
183
|
|
|
|
elsif($file_sample =~ m/^$t_name[.][[:digit:]]+$/) { |
184
|
|
|
|
$divergence = $options->{'tdiv'}; |
185
|
|
|
|
} |
186
|
|
|
|
else { |
187
|
|
|
|
die "Unexpected file name: $file_sample\n"; |
188
|
|
|
|
} |
189
|
|
|
|
|
190
|
|
|
|
my $repeatmasker; |
191
|
0
|
|
|
if(defined $options->{'rm'}) { |
192
|
|
|
|
$repeatmasker = $options->{'rm'}; |
193
|
|
|
|
} |
194
|
|
|
|
else { |
195
|
|
|
|
$repeatmasker = _which('RepeatMasker'); |
196
|
|
|
|
} |
197
|
0
|
|
|
die "Unable to find RepeatMasker binary, try setting '-rm'" unless(defined $repeatmasker); |
198
|
|
|
|
|
199
|
|
|
|
my $command = "cd $rm_outdir;"; # you can set -dir, but tmp files written to cwd |
200
|
|
|
|
$command .= $repeatmasker; |
201
|
|
|
|
$command .= " -no_is -nolow"; |
202
|
|
|
|
$command .= " -lib $options->{maskdb}"; |
203
|
|
|
|
$command .= " -e $options->{engine}"; |
204
|
|
|
|
$command .= " -norna $options->{accuracy}"; |
205
|
|
|
|
$command .= " -frag 2500000"; |
206
|
|
|
|
$command .= " -div $divergence"; # should this be configurable? |
207
|
|
|
|
$command .= " -dir $rm_outdir"; |
208
|
|
|
|
$command .= " $infile"; |
209
|
|
|
|
|
210
|
|
|
|
PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), $command, $index); |
211
|
|
|
|
PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), $index); |
212
|
|
|
|
|
213
|
|
|
|
my $basefile = "$rm_outdir/$file_sample.*"; |
214
|
|
|
|
system('rm', '-f', "$basefile.cat.gz", "$basefile.log" , "$basefile.masked", "$basefile.tbl"); |
215
|
|
|
|
|
216
|
|
|
|
return 1; |
217
|
|
|
|
} |
218
|
|
|
|
|
219
|
|
|
|
sub cluster_te { |
220
|
|
0
|
0
|
my ($index, $options) = @_; |
221
|
0
|
|
|
return 1 if(exists $options->{'index'} && $index != $options->{'index'}); |
222
|
|
|
|
my $tmp = $options->{'tmp'}; |
223
|
0
|
|
|
return 1 if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), $index); |
224
|
|
|
|
|
225
|
|
|
|
my $sample; |
226
|
0
|
|
|
if($index == 1) { |
|
0
|
|
|
|
227
|
|
|
|
$sample = $options->{'tumour_name'}; |
228
|
|
|
|
} |
229
|
|
|
|
elsif($index == 2) { |
230
|
|
|
|
$sample = $options->{'normal_name'}; |
231
|
|
|
|
} |
232
|
|
|
|
|
233
|
|
|
|
my $cluster_outdir = File::Spec->catdir($tmp, 'clusters', $sample); |
234
|
0
|
|
|
if(-e $cluster_outdir){ |
235
|
|
|
|
remove_tree($cluster_outdir); # always cleanup any failed attempt first |
236
|
|
|
|
} |
237
|
|
|
|
make_path($cluster_outdir); |
238
|
|
|
|
|
239
|
|
|
|
my $command = "$^X "; |
240
|
|
|
|
$command .= _which('TraFiC_cluster.pl'); |
241
|
|
|
|
$command .= sprintf $CLUSTER, |
242
|
|
|
|
$tmp, $sample, # repeatmaster.out |
243
|
|
|
|
$tmp, $sample, # repeatmasker in modified fasta |
244
|
|
|
|
$options->{'support'}, |
245
|
|
|
|
$cluster_outdir; |
246
|
0
|
|
|
$command .= ' -p' if($sample eq $options->{'tumour_name'}); |
247
|
|
|
|
|
248
|
|
|
|
|
249
|
|
|
|
PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), $command, $index); |
250
|
|
|
|
PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), $index); |
251
|
|
|
|
return 1; |
252
|
|
|
|
} |
253
|
|
|
|
|
254
|
|
|
|
sub filter_results { |
255
|
|
0
|
0
|
my $options = shift; |
256
|
|
|
|
my $tmp = $options->{'tmp'}; |
257
|
0
|
|
|
return 1 if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), 0); |
258
|
|
|
|
|
259
|
|
|
|
my $clusters_tumour = File::Spec->catdir($tmp, 'clusters', $options->{'tumour_name'}); |
260
|
|
|
|
my $clusters_normal = File::Spec->catdir($tmp, 'clusters', $options->{'normal_name'}); |
261
|
|
|
|
|
262
|
|
|
|
my $np; |
263
|
0
|
|
|
if(exists $options->{'germline'}) { |
264
|
|
|
|
$np = $options->{'germline'} |
265
|
|
|
|
} |
266
|
|
|
|
else { |
267
|
|
|
|
$np = File::Spec->catfile($tmp, 'faked_np.txt'); |
268
|
|
|
|
open my $NPFH, '>', $np; |
269
|
|
|
|
print $NPFH "#\n"; |
270
|
|
|
|
close $NPFH; |
271
|
|
|
|
} |
272
|
|
|
|
|
273
|
|
|
|
my $command = "$^X "; |
274
|
|
|
|
$command .= _which('TraFiC_filter.pl'); |
275
|
|
|
|
$command .= " -o $options->{outdir}"; |
276
|
|
|
|
$command .= ' -i '.File::Spec->catfile($clusters_tumour, 'reciprocal_clusters.txt'); |
277
|
|
|
|
$command .= ' -f '.File::Spec->catfile($clusters_normal, 'pos_clusters.txt'); |
278
|
|
|
|
$command .= ' -f '.File::Spec->catfile($clusters_normal, 'neg_clusters.txt'); |
279
|
|
|
|
$command .= " -g $np"; |
280
|
0
|
|
|
if(exists $options->{'known'}) { |
281
|
|
|
|
$command .= " -t $options->{known}"; |
282
|
|
|
|
$command .= " -d $options->{div}"; |
283
|
|
|
|
} |
284
|
|
|
|
$command .= " -fm $options->{filtmin}"; |
285
|
|
|
|
$command .= " -gm $options->{germmin}"; |
286
|
|
|
|
|
287
|
|
|
|
PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), $command, 0); |
288
|
|
|
|
|
289
|
0
|
|
|
unless(exists $options->{'germline'}) { |
290
|
|
|
|
unlink $np; |
291
|
|
|
|
} |
292
|
|
|
|
|
293
|
|
|
|
PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), 0); |
294
|
|
|
|
return 1; |
295
|
|
|
|
} |
296
|
|
|
|
|
297
|
|
|
|
### Utility methods |
298
|
|
|
|
|
299
|
|
|
|
sub prepare { |
300
|
|
0
|
0
|
my $options = shift; |
301
|
|
|
|
$options->{'tumour_name'} = (PCAP::Bam::sample_name($options->{'tumour'}))[0]; |
302
|
|
|
|
$options->{'normal_name'} = (PCAP::Bam::sample_name($options->{'normal'}))[0]; |
303
|
|
|
|
return 1; |
304
|
|
|
|
} |
305
|
|
|
|
|
306
|
|
|
|
sub sanitised_sample_from_bam { |
307
|
|
0
|
0
|
my $sample = (PCAP::Bam::sample_name(shift))[0]; |
308
|
|
|
|
$sample =~ s/[^a-z0-9_-]/_/ig; # sanitise sample name |
309
|
|
|
|
return $sample; |
310
|
|
|
|
} |
311
|
|
|
|
|
312
|
|
|
|
sub _which { |
313
|
|
0
|
|
my $prog = shift; |
314
|
|
|
|
my $l_bin = $Bin; |
315
|
|
|
|
my $path = File::Spec->catfile($l_bin, $prog); |
316
|
0
|
|
|
$path = which($prog) unless(-e $path); |
317
|
|
|
|
return $path; |
318
|
|
|
|
} |
319
|
|
|
|
|
320
|
|
|
|
sub get_sequences { |
321
|
|
0
|
0
|
my $bam_file = shift; |
322
|
|
|
|
my $bam = PCAP::Bam->new($bam_file); |
323
|
|
|
|
my @ordered_sq = sort @{$bam->header_sq}; |
324
|
|
|
|
return \@ordered_sq; |
325
|
|
|
|
} |
326
|
|
|
|
|
327
|
|
|
|
sub split_files { |
328
|
|
0
|
0
|
my $tmp_folder = shift; |
329
|
|
|
|
my @files; |
330
|
|
|
|
my $indir = File::Spec->catdir($tmp_folder, 'split'); |
331
|
0
|
|
|
if(-e $indir) { |
332
|
0
|
|
|
opendir(my $dh, $indir) || die "Unable to read from $indir: $OS_ERROR"; |
333
|
|
|
|
while(my $thing = readdir $dh) { |
334
|
0
|
|
|
next if($thing =~ m/^[.]/); |
335
|
0
|
|
|
push @files, "$indir/$thing" if($thing =~ m/.*[.][[:digit:]]+$/); |
336
|
|
|
|
} |
337
|
|
|
|
closedir($dh); |
338
|
|
|
|
} |
339
|
|
|
|
@files = sort @files; # make sure these are always in same order |
340
|
|
|
|
return \@files; |
341
|
|
|
|
} |
342
|
|
|
|
|
343
|
|
|
|
1; |