File Coverage

lib/Sanger/CGP/TraFiC/Filter.pm
Criterion    Covered  Total      %
branch             3     82    3.6
subroutine        13     28   46.4
pod                5     13   38.4
total             21    123   17.0


line  branch  sub  pod  code
1       package Sanger::CGP::TraFiC::Filter;
2        
3       ##########LICENCE##########
4       # Copyright (c) 2015 Genome Research Ltd.
5       #
6       # Author: Cancer Genome Project cgpit@sanger.ac.uk
7       #
8       # This file is part of TraFiC.
9       #
10       # TraFiC is free software: you can redistribute it and/or modify it under
11       # the terms of the GNU Affero General Public License as published by the Free
12       # Software Foundation; either version 3 of the License, or (at your option) any
13       # later version.
14       #
15       # This program is distributed in the hope that it will be useful, but WITHOUT
16       # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17       # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
18       # details.
19       #
20       # You should have received a copy of the GNU Affero General Public License
21       # along with this program. If not, see <http://www.gnu.org/licenses/>.
22       #
23       # 1. The usage of a range of years within a copyright statement contained within
24       # this distribution should be interpreted as being equivalent to a list of years
25       # including the first and last year specified and all consecutive years between
26       # them. For example, a copyright statement that reads 'Copyright (c) 2005, 2007-
27       # 2009, 2011-2012' should be interpreted as being identical to a statement that
28       # reads 'Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012' and a copyright
29       # statement that reads "Copyright (c) 2005-2012' should be interpreted as being
30       # identical to a statement that reads 'Copyright (c) 2005, 2006, 2007, 2008,
31       # 2009, 2010, 2011, 2012'."
32       ##########LICENCE##########
33        
34        
35   2   use strict;
36   2   use autodie qw(:all);
37   2   use English qw( -no_match_vars );
38   2   use warnings FATAL => 'all';
39   2   use Carp qw(croak);
40   2   use File::Path qw(make_path);
41   2   use File::Temp;
42   2   use List::Util qw(min max);
43   2   use File::Which qw(which);
44        
45   2   use Const::Fast qw(const);
46        
47   2   use Sanger::CGP::TraFiC;
48        
49       const my $TAB => "\t";
50        
51       const my $POS_L_POS => 0;
52       const my $POS_R_POS => 1;
53       const my $NEG_L_POS => 9;
54       const my $NEG_R_POS => 10;
55        
56       sub new {
57   3 0 my ($class) = @_;
58       my $self = { };
59       bless $self, $class;
60 100     croak "Unable to find standard unix 'sort' in path." unless(defined which('sort'));
61       $self->{'min_filter_reads'} = 2;
62       $self->{'min_germline_reads'} = 5;
63       $self->{'divergence'} = 200;
64       $self->{'readlength'} = 100;
65       $self->{'filters'} = [];
66       $self->{'assume_sorted'} = 0;
67       return $self;
68       }
69        
70       sub assume_sorted {
71   0 0 shift->{'assume_sorted'} = 1;
72       }
73        
74       sub set_output {
75   1 1 my ($self, $root_dir) = @_;
76 50     make_path($root_dir) unless(-e $root_dir);
77       $self->{'out_loc'} = $root_dir;
78       return 1;
79       }
80        
81       sub set_clusters {
82   0 1 my ($self, $clusters) = @_;
83        
84 0     croak "$clusters is not a valid file\n" unless(-e $clusters && -f $clusters);
85        
86       $self->{'clusters'} = $clusters;
87        
88       # grab the header line for output purposes
89       open my $CL, '<', $clusters;
90       $self->{'header'} = <$CL>;
91       close $CL;
92        
93       return 1;
94       }
95        
96       sub filter {
97   0 1 my ($self) = @_;
98       my $cluster_set = $self->_load_clusters($self->{'clusters'});
99        
100 0     if($cluster_set->{'cluster_count'} > 0) {
101       for my $filter(@{$self->{'filters'}}){
102       $self->_filter( $cluster_set,
103       $self->_load_clusters($filter, $self->{'min_filter_reads'}, $cluster_set->{'clusters'}));
104       }
105       }
106       else {
107       warn "No clusters in tumour input file, expect only a header line in output file.\n";
108       }
109        
110 0     if($cluster_set->{'cluster_count'} > 0 && defined $self->{'germline'}) {
111       $self->_filter($cluster_set,
112       $self->_load_clusters($self->{'germline'}, $self->{'min_germline_reads'}, $cluster_set->{'clusters'}));
113       }
114        
115 0     if($cluster_set->{'cluster_count'} > 0 && defined $self->{'trans_elements'}) {
116       $self->_filter($cluster_set,
117       $self->_load_clusters($self->{'trans_elements'}, $self->{'divergence'}, $cluster_set->{'clusters'}, 'divergence'),
118       1);
119       }
120        
121       $self->output($cluster_set->{'clusters'});
122       return 1;
123       }
124        
125       sub output {
126   0 1 my ($self, $cluster_set) = @_;
127       open my $OUT, '>', "$self->{out_loc}/filtered.txt.tmp";
128        
129        
130       for my $family(keys %{$cluster_set}) {
131       for my $chr(keys %{$cluster_set->{$family}}) {
132        
133       for my $cluster(@{$cluster_set->{$family}->{$chr}}) {
134       print $OUT join $TAB, $chr,
135       $family,
136       @{$cluster};
137       print $OUT $RS;
138       }
139       }
140       }
141       close $OUT;
142        
143       my $sort_command = sprintf "sort -k 1,1 -k 3,4n -k 2,2 -T %s -S 200M %s",
144       $self->{'out_loc'},
145       "$self->{out_loc}/filtered.txt.tmp";
146        
147       open my $FINAL, '>', "$self->{out_loc}/filtered.txt";
148       print $FINAL $self->{'header'};
149        
150       open my $proc, '-|', $sort_command;
151       while(my $line = <$proc>) {
152       print $FINAL $line;
153       }
154       close $proc;
155        
156       close $FINAL;
157        
158       unlink "$self->{out_loc}/filtered.txt.tmp";
159       return 1;
160       }
161        
162       sub _filter {
163   0   my ($self, $clusters, $filters, $is_te) = @_;
164       my $query = $clusters->{'clusters'}; # we want these
165       my $filter = $filters->{'clusters'}; # we want to remove these
166       my @families = sort keys %{$query};
167        
168       my $started_with = $clusters->{'cluster_count'};
169        
170       PRIMARY: for my $family(@families) {
171       # if this family doesn't exist in the filter skip it
172 0     next unless(exists $filter->{$family});
173       my @chrs = sort keys %{$query->{$family}};
174       for my $chr(@chrs) {
175       # skip this chromosome if not in filters
176 0     next unless(exists $filter->{$family}->{$chr});
177        
178       my $initial_clusters = scalar @{$query->{$family}->{$chr}};
179        
180        
181 0     if($is_te) {
182       $query->{$family}->{$chr} = $self->_filter_tes($query->{$family}->{$chr}, $filter->{$family}->{$chr});
183       }
184       else {
185       $query->{$family}->{$chr} = $self->_filter_set($query->{$family}->{$chr}, $filter->{$family}->{$chr});
186       }
187       my $remaining_set_clusters = scalar @{$query->{$family}->{$chr}};
188       $clusters->{'cluster_count'} -= $initial_clusters - $remaining_set_clusters;
189        
190       # as filter data is only loaded if there are corresponding
191       # family/chrs to filter against, ensure that empty keys
192       # are removed from the structure
193 0     if($remaining_set_clusters == 0) {
194       delete $query->{$family}->{$chr};
195       my @remaining_keys = keys %{$query->{$family}};
196 0     delete $query->{$family} if(scalar @remaining_keys == 0);
197       }
198        
199       # no point filtering when nothing left
200 0     last PRIMARY if($clusters->{'cluster_count'} == 0);
201        
202 0     croak "Cluster count has become negative, this suggests a programming error" if($clusters->{'cluster_count'} < 0);
203       }
204       }
205        
206       my $removed = $started_with - $clusters->{'cluster_count'};
207        
208       print "$removed clusters removed by $filters->{file}\n";
209       return 1;
210       }
211        
212       sub _filter_tes {
213   0   my ($self, $queries, $tes) = @_;
214       my $filter_tot = scalar @{$tes};
215       my $filter_i = 0;
216        
217       my $read_len = $self->{'readlength'};
218        
219       my @l_queries = @{$queries};
220        
221       my @new_clusters;
222       PRIMARY: while(scalar @l_queries > 0) {
223 0     last if($filter_i == $filter_tot);
224       my $query = shift @l_queries;
225        
226       my $q_l_pos = (min $query->[$POS_R_POS], $query->[$NEG_R_POS]) + $read_len;
227       my $q_r_pos = max $query->[$POS_L_POS], $query->[$NEG_L_POS];
228        
229       while($filter_i < $filter_tot) {
230 0     if($tes->[$filter_i]->[$POS_R_POS] < $q_l_pos) {
231       # filter data is prior to query data so catch up
232       $filter_i++;
233       next;
234       }
235 0     if($tes->[$filter_i]->[$POS_L_POS] > $q_r_pos) {
236       # filter is ahead of query so query must be valid
237       push @new_clusters, $query;
238       # move on to next query
239       next PRIMARY;
240       }
241       # if we get here the filter must overlap the query
242       next PRIMARY;
243       }
244       push @new_clusters, $query;
245       }
246 0     push @new_clusters, @l_queries if(scalar @l_queries > 0);
247        
248       return \@new_clusters;
249       }
250        
251       sub _filter_set {
252   0   my ($self, $queries, $filters) = @_;
253        
254       my $filter_tot = scalar @{$filters};
255       my $filter_i = 0;
256        
257       my $double_read_len = $self->{'readlength'} * 2;
258        
259       my @l_queries = @{$queries};
260        
261       my @new_clusters;
262       PRIMARY: while(scalar @l_queries > 0) {
263 0     last if($filter_i == $filter_tot);
264       my $query = shift @l_queries;
265        
266       my $q_l_pos = $query->[$POS_L_POS];
267       my $q_r_pos = $query->[$POS_R_POS];
268        
269       while($filter_i < $filter_tot) {
270 0     if($filters->[$filter_i]->[$POS_R_POS]+$double_read_len < $q_l_pos) {
271       # filter data is prior to query data so catch up
272       $filter_i++;
273       next;
274       }
275 0     if($filters->[$filter_i]->[$POS_L_POS]-$double_read_len > $q_r_pos) {
276       # filter is ahead of query so query must be valid
277       push @new_clusters, $query;
278       # move on to next query
279       next PRIMARY;
280       }
281       # if we get here the filter must overlap the query
282       next PRIMARY;
283       }
284       push @new_clusters, $query;
285       }
286 0     push @new_clusters, @l_queries if(scalar @l_queries > 0);
287        
288       return \@new_clusters;
289       }
290        
291       sub add_filter_file {
292   0 1 my ($self, $filter) = @_;
293 0     if(defined $filter) {
294 0     if(ref $filter eq 'ARRAY') {
295       for(@{$filter}) {
296 0     croak "$_ is not a valid file\n" unless(-e $_ && -f $_);
297       push @{$self->{'filters'}}, $_;
298       }
299       }
300       else {
301 0     croak "$filter is not a valid file\n" unless(-e $filter && -f $filter);
302       push @{$self->{'filters'}}, $filter;
303       }
304       }
305       return scalar @{$self->{'filters'}};
306       }
307        
308       sub _load_clusters {
309   0   my ($self, $input_set, $limit, $query_clusters, $divergence) = @_;
310        
311       my $cluster_file = $input_set;
312        
313       my $dir;
314        
315 0     unless($self->{'assume_sorted'}) {
316       $dir = File::Temp->newdir('traficFilterXXXX', (DIR => $self->{'out_loc'}));
317       # safest to use unix command as will be fast and lean
318       my $sort_command = sprintf "sort -k 1,1 -k 3,4n -k 2,2 -T %s -S 200M -o %s %s",
319       $dir, # temp area for sorting
320       "$dir/sorted", # output file
321       $input_set; # input file
322 0     croak "Failed to sort file using command: $sort_command" unless(system($sort_command) == 0);
323       $cluster_file = "$dir/sorted";
324       }
325        
326       my %data_set;
327       my $cluster_count = 0;
328       open my $CLUSTERS, '<', $cluster_file;
329       while (my $line = <$CLUSTERS>) {
330 0     next if($line =~ m/^#/);
331       chomp $line;
332        
333       my ($chr, $family, @detail) = split $TAB, $line;
334 0     if(defined $limit) {
335       my $score = $detail[2]; # can be reads or divergence score
336 0     if(defined $divergence) {
337 0     next unless($limit > $score); # keep the record in the set
338       }
339       else {
340       # this is only applied to reciprocal clustered data
341 0     $score += $detail[11] if(scalar @detail > 10);
342 0     next if($limit > $score); # discard the record
343       }
344       }
345        
346 0     if(defined $query_clusters) {
347 0     next unless(exists $query_clusters->{$family} && exists $query_clusters->{$family}->{$chr});
348       }
349        
350       push @{$data_set{$family}{$chr}}, \@detail;
351        
352       $cluster_count++;
353       }
354       close $CLUSTERS;
355        
356       return {'cluster_count' => $cluster_count,
357       'clusters' => \%data_set,
358       'file' => $input_set} ;
359       }
360        
361       sub set_min_filter_reads {
362   0 0 my ($self, $min_reads) = @_;
363 0     $self->{'min_filter_reads'} = $min_reads if(defined $min_reads);
364       return 1;
365       }
366        
367       sub set_min_germline_reads {
368   0 0 my ($self, $min_reads) = @_;
369 0     $self->{'min_germline_reads'} = $min_reads if(defined $min_reads);
370       return 1;
371       }
372        
373       sub set_divergence {
374   0 0 my ($self, $divergence) = @_;
375 0     $self->{'divergence'} = $divergence if(defined $divergence);
376       return 1;
377       }
378        
379       sub set_readlength {
380   0 0 my ($self, $readlength) = @_;
381 0     $self->{'readlength'} = $readlength if(defined $readlength);
382       return 1;
383       }
384        
385       sub set_germline_file {
386   0 0 my ($self, $file) = @_;
387        
388 0     croak "$file is not a valid file\n" unless(-e $file && -f $file);
389        
390       $self->{'germline'} = $file;
391       return 1;
392       }
393        
394       sub set_te_file {
395   0 0 my ($self, $file) = @_;
396 0     croak "$file is not a valid file\n" unless(-e $file && -f $file);
397       $self->{'trans_elements'} = $file;
398       return 1;
399       }
400        
401       1;
402        
403       __END__
404        
405       =head1 NAME
406        
407       Sanger::CGP::TraFiC::Filter
408        
409       =head1 SYNOPSIS
410        
411       use Sanger::CGP::TraFiC::Filter;
412       my $filter = Sanger::CGP::TraFiC::Filter->new;
413       $filter->set_output($output_dir);
414       $filter->set_clusters($sample_clusters);
415        
416       $filter->add_filter_file($filter_on_this);
417       $filter->add_filter_file($filter_on_this_too);
418       ...
419       # OR
420       $filter->add_filter_file(\@list_of_filter_files);
421        
422       $filter->filter;
423        
424       =head1 GENERAL
425        
426       Provides bulk filtering of a set of clusters. This allows further, more arbitrary processing of the
427       data after the main processing has completed.
428        
429       =head1 METHODS
430        
431       =head2 User functions
432        
433       =head3 set_output
434        
435       Set the directory that output files are written to; it is created if it does not already exist.
436        
437       =head3 set_clusters
438        
439       Specify the file containing the clusters of interest.
440        
441       =head3 add_filter_file
442        
443       Add files that are used to filter the content of C<set_clusters>. Accepts either a simple scalar for
444       a single file or an array reference for several files.
445        
446       It is recommended that the first file added is the matched normal sample, as this will remove the
447       most noise from the data.
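
          For example, a minimal call-order sketch (the variable names are illustrative only; both
          calling styles shown are accepted):

            # matched normal first, so it strips the bulk of the noise
            $filter->add_filter_file($matched_normal_clusters);

            # then any further filter sets, e.g. a panel of unmatched normals
            $filter->add_filter_file(\@panel_of_normal_files);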
448        
449       =head3 filter
450        
451       Filter the content of C<set_clusters> against each of the items in C<add_filter_file>. Entries in
452       C<add_filter_file> are only sorted/loaded if data remains in the data set loaded from C<set_clusters>.
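
          A fuller run, also using the optional germline and transposable-element filters, might look
          like the following sketch (paths are illustrative only; each method shown exists on this class):

            my $filter = Sanger::CGP::TraFiC::Filter->new;
            $filter->set_output('results/filtering');           # created if absent
            $filter->set_clusters('tumour_clusters.txt');

            $filter->add_filter_file('normal_panel_clusters.txt');
            $filter->set_germline_file('matched_normal_clusters.txt');
            $filter->set_te_file('reference_te_positions.txt');

            $filter->filter;    # writes filtered.txt to the output directory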
453        
454       =head3 output
455        
456       Output the filtered data, sorted, to C<filtered.txt> in the directory given to C<set_output>; the header line captured by C<set_clusters> is written first.
457        
458       =head3 set_min_filter_reads / set_min_germline_reads
459        
460       Set the minimum number of reads that must support a filtering record for it to be used.
461        
462       Defaults to 2 for files added via C<add_filter_file> and 5 for the germline file.
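
          The thresholds can be adjusted before calling C<filter>; the values below are the built-in
          defaults, shown here purely as an illustration:

            $filter->set_min_filter_reads(2);     # records in add_filter_file inputs
            $filter->set_min_germline_reads(5);   # records in the germline file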
463        
464       =head2 Internal functions
465        
466       =head3 _load_clusters
467        
468       Sort and then load cluster information into a data structure.
469        
470       Sort is invoked, unless C<assume_sorted> has been set, to ensure that the data is consistently ordered.
471       Most of the input should be relatively small.
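
          For reference, the hashref returned by this method has the following shape; C<$family> and
          C<$chr> are placeholder keys and the count shown is illustrative only:

            {
              cluster_count => 42,                 # number of clusters loaded
              clusters      => {
                $family => {
                  $chr => [ \@detail, ... ],       # remaining columns of each record
                },
              },
              file          => $input_set,         # original (unsorted) input path
            }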