File Coverage

lib/Sanger/CGP/TraFiC/Parser.pm
Criterion Covered Total %
branch 20 70 28.5
subroutine 14 20 70.0
pod 3 8 37.5
total 37 98 37.7


line bran sub pod code
1       package Sanger::CGP::TraFiC::Parser;
2        
3       ##########LICENCE##########
4       # Copyright (c) 2015 Genome Research Ltd.
5       #
6       # Author: Cancer Genome Project cgpit@sanger.ac.uk
7       #
8       # This file is part of TraFiC.
9       #
10       # TraFiC is free software: you can redistribute it and/or modify it under
11       # the terms of the GNU Affero General Public License as published by the Free
12       # Software Foundation; either version 3 of the License, or (at your option) any
13       # later version.
14       #
15       # This program is distributed in the hope that it will be useful, but WITHOUT
16       # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17       # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
18       # details.
19       #
20       # You should have received a copy of the GNU Affero General Public License
21       # along with this program. If not, see <http://www.gnu.org/licenses/>.
22       #
23       # 1. The usage of a range of years within a copyright statement contained within
24       # this distribution should be interpreted as being equivalent to a list of years
25       # including the first and last year specified and all consecutive years between
26       # them. For example, a copyright statement that reads 'Copyright (c) 2005, 2007-
27       # 2009, 2011-2012' should be interpreted as being identical to a statement that
28       # reads 'Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012' and a copyright
29       # statement that reads "Copyright (c) 2005-2012' should be interpreted as being
30       # identical to a statement that reads 'Copyright (c) 2005, 2006, 2007, 2008,
31       # 2009, 2010, 2011, 2012'."
32       ##########LICENCE##########
33        
34        
35   2   use strict;
36   2   use autodie qw(:all);
37   2   use English qw( -no_match_vars );
38   2   use warnings FATAL => 'all';
39   2   use Carp qw(croak);
40   2   use List::Util qw(first);
41   2   use File::Which qw(which);
42   2   use Capture::Tiny qw(capture);
43        
44   2   use Const::Fast qw(const);
45        
46   2   use Sanger::CGP::TraFiC;
47        
48       const my $SINGLE_END => 'SINGLE_END';
49       const my $INTER_CHROM => 'INTER_CHROM'; # between 2 chromosomes
50       const my $ABERRANT => 'ABERRANT'; # within a chromosome INTRA (not using as name as easy to miss similarity)
51        
52       # sam field indexes
53       const my $QNAME => 0;
54       const my $FLAG => 1;
55       const my $RNAME => 2;
56       const my $POS => 3;
57       const my $MAPQ => 4;
58       const my $RNEXT => 6;
59       const my $SEQ => 9;
60        
61       const my $N_LIMIT => q{NNNNN};
62        
63       # sam flag values
64       const my $PROPER_PAIR => 0x0002;
65       const my $READ_UNMAPPED => 0x0004;
66       const my $MATE_UNMAPPED => 0x0008;
67       const my $READ_REVCOMP => 0x0010; # will only be testing the mapped read
68       const my $READ_ONE => 0x0040; # assuming paired reads only
69        
70       sub new {
71   2 0 my ($class) = @_;
72       my $self = { };
73       bless $self, $class;
74        
75       return $self;
76       }
77        
78       sub grab_bam_header {
79   0 0 my $bam = shift;
80       my $command = which('samtools');
81 0     die "Can't find 'samtools' in path\n" unless($command);
82       $command .= ' view -H '.$bam;
83   0   my ($sti,$sto,$err) = capture { system($command); };
84       my $cleaned = q{};
85       for my $line(split /\n/, $sti) {
86 0     $cleaned .= $line."\n" if($line =~ m/^[@](HD|SQ)/);
87       }
88       return $cleaned;
89       }
90        
91       sub set_output {
92   0 1 my ($self, $root_path, $bam, $chr) = @_;
93 0     croak "$root_path does not exist." unless(-e $root_path);
94 0     croak "$root_path is not a directory." unless(-d $root_path);
95 0     croak "$root_path cannot be written to." unless(-w $root_path);
96       my $outfile;
97       my $tmpfile;
98 0     if(defined $chr) {
99 0     if($chr eq '*') {
100       $outfile = "$root_path/candidates.orphans.fa";
101       $tmpfile = "$root_path/biobambamtmp.orphans.tmp";
102       $self->{'merged_orph'} = "$root_path/merged_orphans.bam";
103       }
104       else {
105       $outfile = "$root_path/$chr.candidates.fa";
106       $tmpfile = "$root_path/$chr.biobambamtmp.tmp";
107       }
108 0     if(defined $bam) {
109       my $orphans = "$root_path/$chr.o.sam";
110       open my $orph, '>', $orphans;
111       print $orph grab_bam_header($bam);
112       $self->{'ORPHAN_FH'} = $orph;
113       }
114       }
115       else {
116       $outfile = "$root_path/candidates.fa";
117       $tmpfile = "$root_path/biobambamtmp.tmp";
118       }
119       open my $se, '>', $outfile;
120       $self->{'CANDIDATE_FH'} = $se;
121        
122       $self->{'biobambam_tmpfile'} = $tmpfile;
123       return 1;
124       }
125        
126       sub process_orphans {
127   0 0 my ($self, $indir) = @_;
128       my $file_list = $self->find_orphan_files($indir);
129       my $command = sprintf 'bamsort level=1 disablevalidation=0 inputformat=sam tmpfile=%s O=%s I=',
130       $self->{'biobambam_tmpfile'},
131       $self->{'merged_orph'};
132       my $full_command = $command.join(q{ I=}, @{$file_list} );
133 0     system($full_command) == 0 or die "Processing failed: $full_command\n";
134        
135       $self->process_files([$self->{'merged_orph'}]);
136       unlink $self->{'merged_orph'}; # contained within this process so clean it up
137       return 1;
138       }
139        
140       sub find_orphan_files {
141   0 0 my ($self, $indir) = @_;
142       my @orphans;
143 0     opendir(my $dh, $indir) || die "Unable to read from $indir: $OS_ERROR";
144       while(my $thing = readdir $dh) {
145 0     next if($thing =~ m/^[.]/);
146 0     push @orphans, "$indir/$thing" if($thing =~ m/.*[.]o[.]sam$/);
147       }
148       closedir($dh);
149       return \@orphans;
150       }
151        
152       sub process_files {
153   0 1 my ($self, $file_list, $chr, $exclude) = @_;
154       my $out_fh = $self->{'CANDIDATE_FH'};
155       for my $in(@{$file_list}) {
156       warn "Processing: $in\n";
157        
158       my $command;
159 0     if(defined $exclude) {
160       $command = '(';
161       my $in_file = $in;
162 0     if(defined $chr) {
163       $command .= sprintf 'samtools view -u %s %s | ', $in, $chr;
164       $in_file = '-';
165       }
166       $command .= sprintf 'bedtools intersect -v -ubam -abam %s -b %s | ',
167       $in_file,
168       $exclude;
169        
170 0     $command .= sprintf 'bamcollate2 exclude=SECONDARY,QCFAIL,DUP,SUPPLEMENTARY collate=1 outputformat=sam T=%s inputformat=bam classes=F,F2%s',
171       $self->{'biobambam_tmpfile'},
172       ((defined $chr) ? ',O,O2' : q{});
173        
174       $command .= ')';
175       }
176       else {
177       my ($file_type) = $in =~ m/[.](sam|bam|cram)$/;
178 0     $command = sprintf 'bamcollate2 exclude=SECONDARY,QCFAIL,DUP,SUPPLEMENTARY collate=1 outputformat=sam T=%s inputformat=%s filename=%s classes=F,F2%s%s',
  0      
179       $self->{'biobambam_tmpfile'},
180       $file_type,
181       $in,
182       ((defined $chr) ? ',O,O2' : q{}),
183       ((defined $chr) ? " ranges=$chr" : q{});
184       }
185        
186       open my $process, '-|', $command;
187       my ($read1, $read2);
188       MAIN: while($read1 = <$process>) {
189 0     next if((substr $read1,0,1) eq '@');
190 0     unless($read2 = <$process>) {
191       print {$self->{'ORPHAN_FH'}} $read1; # not chomped
192       last MAIN;
193       }
194       chomp ($read1, $read2);
195       my @r1 = split /\t/, $read1;
196       my @r2 = split /\t/, $read2;
197       while($r1[0] ne $r2[0]) {
198       $read1 =~ s/\tPG:Z[^\t]+//;
199       print {$self->{'ORPHAN_FH'}} $read1,"\n";
200       $read1 = $read2;
201       @r1 = @r2;
202       $read2 = <$process>;
203 0     unless($read2) {
204       # the old read2 has not been output yet just moved to $read1
205       $read1 =~ s/\tPG:Z[^\t]+//;
206       print {$self->{'ORPHAN_FH'}} $read1,"\n";
207       last MAIN;
208       }
209       chomp $read2;
210       @r2 = split /\t/, $read2;
211       }
212       my ($class, $fasta) = @{$self->pair_to_candidate(\@r1, \@r2)};
213 0     next unless(defined $class);
214       print $out_fh ${$fasta};
215       # last if($. > 1_000_000);
216       }
217       close $process;
218       }
219       return 1;
220       }
221        
222       sub pair_to_candidate{
223   10 1 my ($self, $read1, $read2) = @_;
224       # make assumptions that flags are correct
225       my @r1 = @{$read1};
226       my @r2 = @{$read2};
227       # both ends unmapped discard
228 100     return [] if(($r1[$FLAG] & $READ_UNMAPPED) && ($r2[$FLAG] & $READ_UNMAPPED));
229       # if proper pair return, paranoid version so check each end is mapped
230 50     return [] if(($r1[$FLAG] & $PROPER_PAIR) && !($r1[$FLAG] & $READ_UNMAPPED) && !($r2[$FLAG] & $READ_UNMAPPED));
231        
232       my $swap = 0; # want r1 to always be the mapped read for ease of use
233       my $class;
234 100     if((!($r1[$FLAG] & $READ_UNMAPPED) && ($r2[$FLAG] & $READ_UNMAPPED)) || (!($r2[$FLAG] & $READ_UNMAPPED) && ($r1[$FLAG] & $READ_UNMAPPED)) ) {
235       $class = $SINGLE_END;
236 50     $swap = 1 if($r1[$FLAG] & $READ_UNMAPPED);
237       }
238       else { # both ends mapped
239 50     return [] if(($r1[$MAPQ] == 0 && $r2[$MAPQ] == 0) || $r1[$MAPQ] == $r2[$MAPQ]); # can't determine which end should be anchor
240 100     $swap = 1 if($r1[$MAPQ] < $r2[$MAPQ]);
241 100     if($r1[$RNAME] eq $r2[$RNAME]) {
242       $class = $ABERRANT;
243       }
244       else {
245       $class = $INTER_CHROM;
246       }
247       }
248        
249       my $fasta = $self->record_out(\@r1, \@r2, $swap, $class);
250 50     return [] unless(defined $fasta);
251       return [$class, $fasta];
252       }
253        
254       sub record_out {
255   8 0 my ($self, $r1, $r2, $swap, $class) = @_;
256       # returns undef on fail and scalar ref on pass
257 100     ($r2, $r1) = ($r1,$r2) if($swap == 1);
258       my $fasta;
259 50     return $fasta if($r1->[$MAPQ] == 0); # reads with non-specific anchors are of little use
260 50     return $fasta unless(index($r2->[$SEQ], $N_LIMIT) == -1);
261        
262 100     $fasta = join "\t", ( ">$r1->[$QNAME]",
263       $r1->[$RNAME],
264       $r1->[$POS],
265       $class,
266       ($r1->[$FLAG] & $READ_REVCOMP) ? '-' : '+');
267       $fasta .= "$RS$r2->[$SEQ]$RS";
268       return \$fasta;
269       }
270        
271       sub DESTROY {
272   2   my $self = shift;
273 50     close $self->{'CANDIDATE_FH'} if(defined $self->{'CANDIDATE_FH'});
274 50     close $self->{'ORPHAN_FH'} if(defined $self->{'ORPHAN_FH'});
275       return 1;
276       }
277        
278       1;
279        
280       __END__
281        
282       =head1 NAME
283        
284       Sanger::CGP::TraFiC::Parser
285        
286       =head1 SYNOPSIS
287        
288       use Sanger::CGP::TraFiC::Parser;
289       my $parser = Sanger::CGP::TraFiC::Parser->new;
290       $parser->set_output('output/folder);
291       $parser->process_files($list_of_bams);
292        
293       =head2 Constants
294        
295       =over
296        
297       =item SINGLE_END
298        
299       Only one end of the pair was mapped
300        
301       =item INTER_CHROM
302        
303       Both ends of pair where mapped to different chromosomes
304        
305       =item ABERRANT
306        
307       Both ends mapped to same chromosome but not as a proper pair (i.e. intRA chromosomal)
308        
309       =back
310        
311       =head1 METHODS
312        
313       =head3 set_output
314        
315       =over
316        
317       =item Description
318        
319       Set the root path that output should be sent to. Generates file-handles
320       to the 6 files required for processing.
321        
322       $parser->set_output('/some/existing/path');
323        
324       =back
325        
326       =head3 process_files
327        
328       =over
329        
330       =item Description
331        
332       Find and output candidate reads in the supplied files.
333        
334       $parser->process_files([filelist, ...]);
335       or
336       $parser->process_files([singlefile], readgroup_id);
337        
338       =item Notes
339        
340       When I<readgroup_id> is supplied only reads within the group will be
341       handled. Using this inappropriately will result in no output for files
342       that don't have the specified group. There will be no warnings when this occurs.
343        
344       =back
345        
346       =head3 pair_to_candidate
347        
348       =over
349        
350       =item Description
351        
352       Takes a pair of reads and determines if they are suitable
353       for processing as aberrant.
354        
355       my ($class, $detail, $fasta) = $parser->pair_to_candidate($sam_r1, $sam_r2);
356       next unless(defined $class);
357        
358       =item Returns
359        
360       \@ of the following values
361        
362       $class : Type of aberrant read FastA record (see L</Constants>)
363       $fasta : FastA record ready for printing
364        
365       =item Notes
366        
367       When the pair of reads input into this function are not found to be candidates
368       an empty array ref is returned.
369        
370       =back