line |
bran |
sub |
pod |
code |
1
|
|
|
|
package Sanger::CGP::TraFiC::Parser; |
2
|
|
|
|
|
3
|
|
|
|
##########LICENCE########## |
4
|
|
|
|
# Copyright (c) 2015 Genome Research Ltd. |
5
|
|
|
|
# |
6
|
|
|
|
# Author: Cancer Genome Project cgpit@sanger.ac.uk |
7
|
|
|
|
# |
8
|
|
|
|
# This file is part of TraFiC. |
9
|
|
|
|
# |
10
|
|
|
|
# TraFiC is free software: you can redistribute it and/or modify it under |
11
|
|
|
|
# the terms of the GNU Affero General Public License as published by the Free |
12
|
|
|
|
# Software Foundation; either version 3 of the License, or (at your option) any |
13
|
|
|
|
# later version. |
14
|
|
|
|
# |
15
|
|
|
|
# This program is distributed in the hope that it will be useful, but WITHOUT |
16
|
|
|
|
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
17
|
|
|
|
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
18
|
|
|
|
# details. |
19
|
|
|
|
# |
20
|
|
|
|
# You should have received a copy of the GNU Affero General Public License |
21
|
|
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>. |
22
|
|
|
|
# |
23
|
|
|
|
# 1. The usage of a range of years within a copyright statement contained within |
24
|
|
|
|
# this distribution should be interpreted as being equivalent to a list of years |
25
|
|
|
|
# including the first and last year specified and all consecutive years between |
26
|
|
|
|
# them. For example, a copyright statement that reads 'Copyright (c) 2005, 2007- |
27
|
|
|
|
# 2009, 2011-2012' should be interpreted as being identical to a statement that |
28
|
|
|
|
# reads 'Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012' and a copyright |
29
|
|
|
|
# statement that reads "Copyright (c) 2005-2012' should be interpreted as being |
30
|
|
|
|
# identical to a statement that reads 'Copyright (c) 2005, 2006, 2007, 2008, |
31
|
|
|
|
# 2009, 2010, 2011, 2012'." |
32
|
|
|
|
##########LICENCE########## |
33
|
|
|
|
|
34
|
|
|
|
|
35
|
|
2
|
|
use strict; |
36
|
|
2
|
|
use autodie qw(:all); |
37
|
|
2
|
|
use English qw( -no_match_vars ); |
38
|
|
2
|
|
use warnings FATAL => 'all'; |
39
|
|
2
|
|
use Carp qw(croak); |
40
|
|
2
|
|
use List::Util qw(first); |
41
|
|
2
|
|
use File::Which qw(which); |
42
|
|
2
|
|
use Capture::Tiny qw(capture); |
43
|
|
|
|
|
44
|
|
2
|
|
use Const::Fast qw(const); |
45
|
|
|
|
|
46
|
|
2
|
|
use Sanger::CGP::TraFiC; |
47
|
|
|
|
|
48
|
|
|
|
const my $SINGLE_END => 'SINGLE_END'; |
49
|
|
|
|
const my $INTER_CHROM => 'INTER_CHROM'; # between 2 chromosomes |
50
|
|
|
|
const my $ABERRANT => 'ABERRANT'; # within a chromosome INTRA (not using as name as easy to miss similarity) |
51
|
|
|
|
|
52
|
|
|
|
# sam field indexes |
53
|
|
|
|
const my $QNAME => 0; |
54
|
|
|
|
const my $FLAG => 1; |
55
|
|
|
|
const my $RNAME => 2; |
56
|
|
|
|
const my $POS => 3; |
57
|
|
|
|
const my $MAPQ => 4; |
58
|
|
|
|
const my $RNEXT => 6; |
59
|
|
|
|
const my $SEQ => 9; |
60
|
|
|
|
|
61
|
|
|
|
const my $N_LIMIT => q{NNNNN}; |
62
|
|
|
|
|
63
|
|
|
|
# sam flag values |
64
|
|
|
|
const my $PROPER_PAIR => 0x0002; |
65
|
|
|
|
const my $READ_UNMAPPED => 0x0004; |
66
|
|
|
|
const my $MATE_UNMAPPED => 0x0008; |
67
|
|
|
|
const my $READ_REVCOMP => 0x0010; # will only be testing the mapped read |
68
|
|
|
|
const my $READ_ONE => 0x0040; # assuming paired reads only |
69
|
|
|
|
|
70
|
|
|
|
sub new { |
71
|
|
2
|
0
|
my ($class) = @_; |
72
|
|
|
|
my $self = { }; |
73
|
|
|
|
bless $self, $class; |
74
|
|
|
|
|
75
|
|
|
|
return $self; |
76
|
|
|
|
} |
77
|
|
|
|
|
78
|
|
|
|
sub grab_bam_header { |
79
|
|
0
|
0
|
my $bam = shift; |
80
|
|
|
|
my $command = which('samtools'); |
81
|
0
|
|
|
die "Can't find 'samtools' in path\n" unless($command); |
82
|
|
|
|
$command .= ' view -H '.$bam; |
83
|
|
0
|
|
my ($sti,$sto,$err) = capture { system($command); }; |
84
|
|
|
|
my $cleaned = q{}; |
85
|
|
|
|
for my $line(split /\n/, $sti) { |
86
|
0
|
|
|
$cleaned .= $line."\n" if($line =~ m/^[@](HD|SQ)/); |
87
|
|
|
|
} |
88
|
|
|
|
return $cleaned; |
89
|
|
|
|
} |
90
|
|
|
|
|
91
|
|
|
|
sub set_output { |
92
|
|
0
|
1
|
my ($self, $root_path, $bam, $chr) = @_; |
93
|
0
|
|
|
croak "$root_path does not exist." unless(-e $root_path); |
94
|
0
|
|
|
croak "$root_path is not a directory." unless(-d $root_path); |
95
|
0
|
|
|
croak "$root_path cannot be written to." unless(-w $root_path); |
96
|
|
|
|
my $outfile; |
97
|
|
|
|
my $tmpfile; |
98
|
0
|
|
|
if(defined $chr) { |
99
|
0
|
|
|
if($chr eq '*') { |
100
|
|
|
|
$outfile = "$root_path/candidates.orphans.fa"; |
101
|
|
|
|
$tmpfile = "$root_path/biobambamtmp.orphans.tmp"; |
102
|
|
|
|
$self->{'merged_orph'} = "$root_path/merged_orphans.bam"; |
103
|
|
|
|
} |
104
|
|
|
|
else { |
105
|
|
|
|
$outfile = "$root_path/$chr.candidates.fa"; |
106
|
|
|
|
$tmpfile = "$root_path/$chr.biobambamtmp.tmp"; |
107
|
|
|
|
} |
108
|
0
|
|
|
if(defined $bam) { |
109
|
|
|
|
my $orphans = "$root_path/$chr.o.sam"; |
110
|
|
|
|
open my $orph, '>', $orphans; |
111
|
|
|
|
print $orph grab_bam_header($bam); |
112
|
|
|
|
$self->{'ORPHAN_FH'} = $orph; |
113
|
|
|
|
} |
114
|
|
|
|
} |
115
|
|
|
|
else { |
116
|
|
|
|
$outfile = "$root_path/candidates.fa"; |
117
|
|
|
|
$tmpfile = "$root_path/biobambamtmp.tmp"; |
118
|
|
|
|
} |
119
|
|
|
|
open my $se, '>', $outfile; |
120
|
|
|
|
$self->{'CANDIDATE_FH'} = $se; |
121
|
|
|
|
|
122
|
|
|
|
$self->{'biobambam_tmpfile'} = $tmpfile; |
123
|
|
|
|
return 1; |
124
|
|
|
|
} |
125
|
|
|
|
|
126
|
|
|
|
sub process_orphans { |
127
|
|
0
|
0
|
my ($self, $indir) = @_; |
128
|
|
|
|
my $file_list = $self->find_orphan_files($indir); |
129
|
|
|
|
my $command = sprintf 'bamsort level=1 disablevalidation=0 inputformat=sam tmpfile=%s O=%s I=', |
130
|
|
|
|
$self->{'biobambam_tmpfile'}, |
131
|
|
|
|
$self->{'merged_orph'}; |
132
|
|
|
|
my $full_command = $command.join(q{ I=}, @{$file_list} ); |
133
|
0
|
|
|
system($full_command) == 0 or die "Processing failed: $full_command\n"; |
134
|
|
|
|
|
135
|
|
|
|
$self->process_files([$self->{'merged_orph'}]); |
136
|
|
|
|
unlink $self->{'merged_orph'}; # contained within this process so clean it up |
137
|
|
|
|
return 1; |
138
|
|
|
|
} |
139
|
|
|
|
|
140
|
|
|
|
sub find_orphan_files { |
141
|
|
0
|
0
|
my ($self, $indir) = @_; |
142
|
|
|
|
my @orphans; |
143
|
0
|
|
|
opendir(my $dh, $indir) || die "Unable to read from $indir: $OS_ERROR"; |
144
|
|
|
|
while(my $thing = readdir $dh) { |
145
|
0
|
|
|
next if($thing =~ m/^[.]/); |
146
|
0
|
|
|
push @orphans, "$indir/$thing" if($thing =~ m/.*[.]o[.]sam$/); |
147
|
|
|
|
} |
148
|
|
|
|
closedir($dh); |
149
|
|
|
|
return \@orphans; |
150
|
|
|
|
} |
151
|
|
|
|
|
152
|
|
|
|
sub process_files { |
153
|
|
0
|
1
|
my ($self, $file_list, $chr, $exclude) = @_; |
154
|
|
|
|
my $out_fh = $self->{'CANDIDATE_FH'}; |
155
|
|
|
|
for my $in(@{$file_list}) { |
156
|
|
|
|
warn "Processing: $in\n"; |
157
|
|
|
|
|
158
|
|
|
|
my $command; |
159
|
0
|
|
|
if(defined $exclude) { |
160
|
|
|
|
$command = '('; |
161
|
|
|
|
my $in_file = $in; |
162
|
0
|
|
|
if(defined $chr) { |
163
|
|
|
|
$command .= sprintf 'samtools view -u %s %s | ', $in, $chr; |
164
|
|
|
|
$in_file = '-'; |
165
|
|
|
|
} |
166
|
|
|
|
$command .= sprintf 'bedtools intersect -v -ubam -abam %s -b %s | ', |
167
|
|
|
|
$in_file, |
168
|
|
|
|
$exclude; |
169
|
|
|
|
|
170
|
0
|
|
|
$command .= sprintf 'bamcollate2 exclude=SECONDARY,QCFAIL,DUP,SUPPLEMENTARY collate=1 outputformat=sam T=%s inputformat=bam classes=F,F2%s', |
171
|
|
|
|
$self->{'biobambam_tmpfile'}, |
172
|
|
|
|
((defined $chr) ? ',O,O2' : q{}); |
173
|
|
|
|
|
174
|
|
|
|
$command .= ')'; |
175
|
|
|
|
} |
176
|
|
|
|
else { |
177
|
|
|
|
my ($file_type) = $in =~ m/[.](sam|bam|cram)$/; |
178
|
0
|
|
|
$command = sprintf 'bamcollate2 exclude=SECONDARY,QCFAIL,DUP,SUPPLEMENTARY collate=1 outputformat=sam T=%s inputformat=%s filename=%s classes=F,F2%s%s', |
|
0
|
|
|
|
179
|
|
|
|
$self->{'biobambam_tmpfile'}, |
180
|
|
|
|
$file_type, |
181
|
|
|
|
$in, |
182
|
|
|
|
((defined $chr) ? ',O,O2' : q{}), |
183
|
|
|
|
((defined $chr) ? " ranges=$chr" : q{}); |
184
|
|
|
|
} |
185
|
|
|
|
|
186
|
|
|
|
open my $process, '-|', $command; |
187
|
|
|
|
my ($read1, $read2); |
188
|
|
|
|
MAIN: while($read1 = <$process>) { |
189
|
0
|
|
|
next if((substr $read1,0,1) eq '@'); |
190
|
0
|
|
|
unless($read2 = <$process>) { |
191
|
|
|
|
print {$self->{'ORPHAN_FH'}} $read1; # not chomped |
192
|
|
|
|
last MAIN; |
193
|
|
|
|
} |
194
|
|
|
|
chomp ($read1, $read2); |
195
|
|
|
|
my @r1 = split /\t/, $read1; |
196
|
|
|
|
my @r2 = split /\t/, $read2; |
197
|
|
|
|
while($r1[0] ne $r2[0]) { |
198
|
|
|
|
$read1 =~ s/\tPG:Z[^\t]+//; |
199
|
|
|
|
print {$self->{'ORPHAN_FH'}} $read1,"\n"; |
200
|
|
|
|
$read1 = $read2; |
201
|
|
|
|
@r1 = @r2; |
202
|
|
|
|
$read2 = <$process>; |
203
|
0
|
|
|
unless($read2) { |
204
|
|
|
|
# the old read2 has not been output yet just moved to $read1 |
205
|
|
|
|
$read1 =~ s/\tPG:Z[^\t]+//; |
206
|
|
|
|
print {$self->{'ORPHAN_FH'}} $read1,"\n"; |
207
|
|
|
|
last MAIN; |
208
|
|
|
|
} |
209
|
|
|
|
chomp $read2; |
210
|
|
|
|
@r2 = split /\t/, $read2; |
211
|
|
|
|
} |
212
|
|
|
|
my ($class, $fasta) = @{$self->pair_to_candidate(\@r1, \@r2)}; |
213
|
0
|
|
|
next unless(defined $class); |
214
|
|
|
|
print $out_fh ${$fasta}; |
215
|
|
|
|
# last if($. > 1_000_000); |
216
|
|
|
|
} |
217
|
|
|
|
close $process; |
218
|
|
|
|
} |
219
|
|
|
|
return 1; |
220
|
|
|
|
} |
221
|
|
|
|
|
222
|
|
|
|
sub pair_to_candidate{ |
223
|
|
10
|
1
|
my ($self, $read1, $read2) = @_; |
224
|
|
|
|
# make assumptions that flags are correct |
225
|
|
|
|
my @r1 = @{$read1}; |
226
|
|
|
|
my @r2 = @{$read2}; |
227
|
|
|
|
# both ends unmapped discard |
228
|
100
|
|
|
return [] if(($r1[$FLAG] & $READ_UNMAPPED) && ($r2[$FLAG] & $READ_UNMAPPED)); |
229
|
|
|
|
# if proper pair return, paranoid version so check each end is mapped |
230
|
50
|
|
|
return [] if(($r1[$FLAG] & $PROPER_PAIR) && !($r1[$FLAG] & $READ_UNMAPPED) && !($r2[$FLAG] & $READ_UNMAPPED)); |
231
|
|
|
|
|
232
|
|
|
|
my $swap = 0; # want r1 to always be the mapped read for ease of use |
233
|
|
|
|
my $class; |
234
|
100
|
|
|
if((!($r1[$FLAG] & $READ_UNMAPPED) && ($r2[$FLAG] & $READ_UNMAPPED)) || (!($r2[$FLAG] & $READ_UNMAPPED) && ($r1[$FLAG] & $READ_UNMAPPED)) ) { |
235
|
|
|
|
$class = $SINGLE_END; |
236
|
50
|
|
|
$swap = 1 if($r1[$FLAG] & $READ_UNMAPPED); |
237
|
|
|
|
} |
238
|
|
|
|
else { # both ends mapped |
239
|
50
|
|
|
return [] if(($r1[$MAPQ] == 0 && $r2[$MAPQ] == 0) || $r1[$MAPQ] == $r2[$MAPQ]); # can't determine which end should be anchor |
240
|
100
|
|
|
$swap = 1 if($r1[$MAPQ] < $r2[$MAPQ]); |
241
|
100
|
|
|
if($r1[$RNAME] eq $r2[$RNAME]) { |
242
|
|
|
|
$class = $ABERRANT; |
243
|
|
|
|
} |
244
|
|
|
|
else { |
245
|
|
|
|
$class = $INTER_CHROM; |
246
|
|
|
|
} |
247
|
|
|
|
} |
248
|
|
|
|
|
249
|
|
|
|
my $fasta = $self->record_out(\@r1, \@r2, $swap, $class); |
250
|
50
|
|
|
return [] unless(defined $fasta); |
251
|
|
|
|
return [$class, $fasta]; |
252
|
|
|
|
} |
253
|
|
|
|
|
254
|
|
|
|
sub record_out { |
255
|
|
8
|
0
|
my ($self, $r1, $r2, $swap, $class) = @_; |
256
|
|
|
|
# returns undef on fail and scalar ref on pass |
257
|
100
|
|
|
($r2, $r1) = ($r1,$r2) if($swap == 1); |
258
|
|
|
|
my $fasta; |
259
|
50
|
|
|
return $fasta if($r1->[$MAPQ] == 0); # reads with non-specific anchors are of little use |
260
|
50
|
|
|
return $fasta unless(index($r2->[$SEQ], $N_LIMIT) == -1); |
261
|
|
|
|
|
262
|
100
|
|
|
$fasta = join "\t", ( ">$r1->[$QNAME]", |
263
|
|
|
|
$r1->[$RNAME], |
264
|
|
|
|
$r1->[$POS], |
265
|
|
|
|
$class, |
266
|
|
|
|
($r1->[$FLAG] & $READ_REVCOMP) ? '-' : '+'); |
267
|
|
|
|
$fasta .= "$RS$r2->[$SEQ]$RS"; |
268
|
|
|
|
return \$fasta; |
269
|
|
|
|
} |
270
|
|
|
|
|
271
|
|
|
|
sub DESTROY { |
272
|
|
2
|
|
my $self = shift; |
273
|
50
|
|
|
close $self->{'CANDIDATE_FH'} if(defined $self->{'CANDIDATE_FH'}); |
274
|
50
|
|
|
close $self->{'ORPHAN_FH'} if(defined $self->{'ORPHAN_FH'}); |
275
|
|
|
|
return 1; |
276
|
|
|
|
} |
277
|
|
|
|
|
278
|
|
|
|
1; |
279
|
|
|
|
|
280
|
|
|
|
__END__ |
281
|
|
|
|
|
282
|
|
|
|
=head1 NAME |
283
|
|
|
|
|
284
|
|
|
|
Sanger::CGP::TraFiC::Parser |
285
|
|
|
|
|
286
|
|
|
|
=head1 SYNOPSIS |
287
|
|
|
|
|
288
|
|
|
|
use Sanger::CGP::TraFiC::Parser; |
289
|
|
|
|
my $parser = Sanger::CGP::TraFiC::Parser->new; |
290
|
|
|
|
$parser->set_output('output/folder); |
291
|
|
|
|
$parser->process_files($list_of_bams); |
292
|
|
|
|
|
293
|
|
|
|
=head2 Constants |
294
|
|
|
|
|
295
|
|
|
|
=over |
296
|
|
|
|
|
297
|
|
|
|
=item SINGLE_END |
298
|
|
|
|
|
299
|
|
|
|
Only one end of the pair was mapped |
300
|
|
|
|
|
301
|
|
|
|
=item INTER_CHROM |
302
|
|
|
|
|
303
|
|
|
|
Both ends of pair where mapped to different chromosomes |
304
|
|
|
|
|
305
|
|
|
|
=item ABERRANT |
306
|
|
|
|
|
307
|
|
|
|
Both ends mapped to same chromosome but not as a proper pair (i.e. intRA chromosomal) |
308
|
|
|
|
|
309
|
|
|
|
=back |
310
|
|
|
|
|
311
|
|
|
|
=head1 METHODS |
312
|
|
|
|
|
313
|
|
|
|
=head3 set_output |
314
|
|
|
|
|
315
|
|
|
|
=over |
316
|
|
|
|
|
317
|
|
|
|
=item Description |
318
|
|
|
|
|
319
|
|
|
|
Set the root path that output should be sent to. Generates file-handles |
320
|
|
|
|
to the 6 files required for processing. |
321
|
|
|
|
|
322
|
|
|
|
$parser->set_output('/some/existing/path'); |
323
|
|
|
|
|
324
|
|
|
|
=back |
325
|
|
|
|
|
326
|
|
|
|
=head3 process_files |
327
|
|
|
|
|
328
|
|
|
|
=over |
329
|
|
|
|
|
330
|
|
|
|
=item Description |
331
|
|
|
|
|
332
|
|
|
|
Find and output candidate reads in the supplied files. |
333
|
|
|
|
|
334
|
|
|
|
$parser->process_files([filelist, ...]); |
335
|
|
|
|
or |
336
|
|
|
|
$parser->process_files([singlefile], readgroup_id); |
337
|
|
|
|
|
338
|
|
|
|
=item Notes |
339
|
|
|
|
|
340
|
|
|
|
When I<readgroup_id> is supplied only reads within the group will be |
341
|
|
|
|
handled. Using this inappropriately will result in no output for files |
342
|
|
|
|
that don't have the specified group. There will be no warnings when this occurs. |
343
|
|
|
|
|
344
|
|
|
|
=back |
345
|
|
|
|
|
346
|
|
|
|
=head3 pair_to_candidate |
347
|
|
|
|
|
348
|
|
|
|
=over |
349
|
|
|
|
|
350
|
|
|
|
=item Description |
351
|
|
|
|
|
352
|
|
|
|
Takes a pair of reads and determines if they are suitable |
353
|
|
|
|
for processing as aberrant. |
354
|
|
|
|
|
355
|
|
|
|
my ($class, $detail, $fasta) = $parser->pair_to_candidate($sam_r1, $sam_r2); |
356
|
|
|
|
next unless(defined $class); |
357
|
|
|
|
|
358
|
|
|
|
=item Returns |
359
|
|
|
|
|
360
|
|
|
|
\@ of the following values |
361
|
|
|
|
|
362
|
|
|
|
$class : Type of aberrant read FastA record (see L</Constants>) |
363
|
|
|
|
$fasta : FastA record ready for printing |
364
|
|
|
|
|
365
|
|
|
|
=item Notes |
366
|
|
|
|
|
367
|
|
|
|
When the pair of reads input into this function are not found to be candidates |
368
|
|
|
|
an empty array ref is returned. |
369
|
|
|
|
|
370
|
|
|
|
=back |