line |
bran |
sub |
pod |
code |
1
|
|
|
|
package Sanger::CGP::TraFiC::Filter; |
2
|
|
|
|
|
3
|
|
|
|
##########LICENCE########## |
4
|
|
|
|
# Copyright (c) 2015 Genome Research Ltd. |
5
|
|
|
|
# |
6
|
|
|
|
# Author: Cancer Genome Project cgpit@sanger.ac.uk |
7
|
|
|
|
# |
8
|
|
|
|
# This file is part of TraFiC. |
9
|
|
|
|
# |
10
|
|
|
|
# TraFiC is free software: you can redistribute it and/or modify it under |
11
|
|
|
|
# the terms of the GNU Affero General Public License as published by the Free |
12
|
|
|
|
# Software Foundation; either version 3 of the License, or (at your option) any |
13
|
|
|
|
# later version. |
14
|
|
|
|
# |
15
|
|
|
|
# This program is distributed in the hope that it will be useful, but WITHOUT |
16
|
|
|
|
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
17
|
|
|
|
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
18
|
|
|
|
# details. |
19
|
|
|
|
# |
20
|
|
|
|
# You should have received a copy of the GNU Affero General Public License |
21
|
|
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>. |
22
|
|
|
|
# |
23
|
|
|
|
# 1. The usage of a range of years within a copyright statement contained within |
24
|
|
|
|
# this distribution should be interpreted as being equivalent to a list of years |
25
|
|
|
|
# including the first and last year specified and all consecutive years between |
26
|
|
|
|
# them. For example, a copyright statement that reads 'Copyright (c) 2005, 2007- |
27
|
|
|
|
# 2009, 2011-2012' should be interpreted as being identical to a statement that |
28
|
|
|
|
# reads 'Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012' and a copyright |
29
|
|
|
|
# statement that reads "Copyright (c) 2005-2012' should be interpreted as being |
30
|
|
|
|
# identical to a statement that reads 'Copyright (c) 2005, 2006, 2007, 2008, |
31
|
|
|
|
# 2009, 2010, 2011, 2012'." |
32
|
|
|
|
##########LICENCE########## |
33
|
|
|
|
|
34
|
|
|
|
|
35
|
|
2
|
|
use strict; |
36
|
|
2
|
|
use autodie qw(:all); |
37
|
|
2
|
|
use English qw( -no_match_vars ); |
38
|
|
2
|
|
use warnings FATAL => 'all'; |
39
|
|
2
|
|
use Carp qw(croak); |
40
|
|
2
|
|
use File::Path qw(make_path); |
41
|
|
2
|
|
use File::Temp; |
42
|
|
2
|
|
use List::Util qw(min max); |
43
|
|
2
|
|
use File::Which qw(which); |
44
|
|
|
|
|
45
|
|
2
|
|
use Const::Fast qw(const); |
46
|
|
|
|
|
47
|
|
2
|
|
use Sanger::CGP::TraFiC; |
48
|
|
|
|
|
49
|
|
|
|
# Field separator for the tab-delimited cluster files read and written here.
const my $TAB => "\t";

# Column indices into a cluster record (the @detail array built by
# _load_clusters). POS_* presumably index the positive-strand cluster
# boundaries and NEG_* the negative-strand (reciprocal) cluster
# boundaries — TODO confirm against the TraFiC cluster file format.
const my $POS_L_POS => 0;
const my $POS_R_POS => 1;
const my $NEG_L_POS => 9;
const my $NEG_R_POS => 10;
55
|
|
|
|
|
56
|
|
|
|
sub new {
    my ($class) = @_;

    # The external unix 'sort' binary is a hard requirement — it is used
    # for on-disk sorting of (potentially large) cluster files.
    croak "Unable to find standard unix 'sort' in path." unless(defined which('sort'));

    # Default thresholds; all are adjustable through the set_* methods.
    my %defaults = (
        'min_filter_reads'   => 2,
        'min_germline_reads' => 5,
        'divergence'         => 200,
        'readlength'         => 100,
        'filters'            => [],
        'assume_sorted'      => 0,
    );

    return bless { %defaults }, $class;
}
69
|
|
|
|
|
70
|
|
|
|
# Flag that the input cluster files are already sorted, allowing
# _load_clusters to skip the external sort step. One-way switch: there
# is deliberately no way to unset it. Returns 1 for consistency with
# the other mutators (the original relied on the assignment's value).
sub assume_sorted {
    my ($self) = @_;
    $self->{'assume_sorted'} = 1;
    return 1;
}
73
|
|
|
|
|
74
|
|
|
|
# Record the output directory, creating it first when it does not exist.
sub set_output {
    my ($self, $root_dir) = @_;
    unless(-e $root_dir) {
        make_path($root_dir);
    }
    $self->{'out_loc'} = $root_dir;
    return 1;
}
80
|
|
|
|
|
81
|
|
|
|
# Register the file holding the clusters of interest and capture its
# first line (the header) so output() can reproduce it later.
sub set_clusters {
    my ($self, $clusters) = @_;

    croak "$clusters is not a valid file\n" unless(-e $clusters && -f $clusters);

    $self->{'clusters'} = $clusters;

    open my $fh, '<', $clusters;
    my $header = <$fh>;
    close $fh;
    $self->{'header'} = $header;

    return 1;
}
95
|
|
|
|
|
96
|
|
|
|
# Run the full filtering pipeline: load the query clusters, then apply
# each user-supplied filter file, the germline file (if set) and the
# transposable element file (if set), stopping early once no clusters
# remain, and finally write the sorted result via output().
sub filter {
    my ($self) = @_;
    my $cluster_set = $self->_load_clusters($self->{'clusters'});

    if($cluster_set->{'cluster_count'} == 0) {
        warn "No clusters in tumour input file, expect only a header line in output file.\n";
    }
    else {
        # Apply each filter file in the order it was added.
        for my $filter_file (@{$self->{'filters'}}) {
            my $filter_set = $self->_load_clusters($filter_file, $self->{'min_filter_reads'}, $cluster_set->{'clusters'});
            $self->_filter($cluster_set, $filter_set);
        }
    }

    if(defined $self->{'germline'} && $cluster_set->{'cluster_count'} > 0) {
        my $germline_set = $self->_load_clusters($self->{'germline'}, $self->{'min_germline_reads'}, $cluster_set->{'clusters'});
        $self->_filter($cluster_set, $germline_set);
    }

    if(defined $self->{'trans_elements'} && $cluster_set->{'cluster_count'} > 0) {
        my $te_set = $self->_load_clusters($self->{'trans_elements'}, $self->{'divergence'}, $cluster_set->{'clusters'}, 'divergence');
        $self->_filter($cluster_set, $te_set, 1);
    }

    $self->output($cluster_set->{'clusters'});
    return 1;
}
124
|
|
|
|
|
125
|
|
|
|
# Write the surviving clusters to <out_loc>/filtered.txt: dump them
# unsorted to a temp file, sort with the external unix sort (chromosome,
# then positions numerically, then family), prepend the stored header
# and remove the temp file.
sub output {
    my ($self, $cluster_set) = @_;

    my $tmp_file = "$self->{out_loc}/filtered.txt.tmp";

    open my $tmp_fh, '>', $tmp_file;
    for my $family (keys %{$cluster_set}) {
        for my $chr (keys %{$cluster_set->{$family}}) {
            for my $cluster (@{$cluster_set->{$family}->{$chr}}) {
                print $tmp_fh join($TAB, $chr, $family, @{$cluster});
                print $tmp_fh $RS;
            }
        }
    }
    close $tmp_fh;

    my $sort_command = sprintf "sort -k 1,1 -k 3,4n -k 2,2 -T %s -S 200M %s",
                               $self->{'out_loc'},
                               $tmp_file;

    open my $final_fh, '>', "$self->{out_loc}/filtered.txt";
    print $final_fh $self->{'header'};

    # Stream the sorted records straight from the sort pipe.
    open my $sort_fh, '-|', $sort_command;
    while(my $line = <$sort_fh>) {
        print $final_fh $line;
    }
    close $sort_fh;

    close $final_fh;

    unlink $tmp_file;
    return 1;
}
161
|
|
|
|
|
162
|
|
|
|
# Remove clusters in $clusters that overlap entries in $filters; both
# are structures returned by _load_clusters. When $is_te is true the
# transposable-element overlap rules (_filter_tes) are used, otherwise
# the read-length-padded rules (_filter_set). Family/chr keys emptied by
# filtering are pruned so later _load_clusters calls skip them entirely.
sub _filter {
    my ($self, $clusters, $filters, $is_te) = @_;
    my $query = $clusters->{'clusters'}; # we want these
    my $filter = $filters->{'clusters'}; # we want to remove these
    my @families = sort keys %{$query};

    my $started_with = $clusters->{'cluster_count'};

    PRIMARY: for my $family(@families) {
        # if this family doesn't exist in the filter skip it
        next unless(exists $filter->{$family});
        my @chrs = sort keys %{$query->{$family}};
        for my $chr(@chrs) {
            # skip this chromosome if not in filters
            next unless(exists $filter->{$family}->{$chr});

            my $initial_clusters = scalar @{$query->{$family}->{$chr}};

            # replace the per-chr list with the filtered version
            if($is_te) {
                $query->{$family}->{$chr} = $self->_filter_tes($query->{$family}->{$chr}, $filter->{$family}->{$chr});
            }
            else {
                $query->{$family}->{$chr} = $self->_filter_set($query->{$family}->{$chr}, $filter->{$family}->{$chr});
            }
            my $remaining_set_clusters = scalar @{$query->{$family}->{$chr}};
            # keep the running total in step with what was just removed
            $clusters->{'cluster_count'} -= $initial_clusters - $remaining_set_clusters;

            # as filter data is only loaded if there are corresponding
            # family/chrs to filter against ensure that empty keys
            # are removed from structure
            if($remaining_set_clusters == 0) {
                delete $query->{$family}->{$chr};
                my @remaining_keys = keys %{$query->{$family}};
                delete $query->{$family} if(scalar @remaining_keys == 0);
            }

            # no point filtering when nothing left
            last PRIMARY if($clusters->{'cluster_count'} == 0);

            croak "Cluster count has become negative, this suggests a programming error" if($clusters->{'cluster_count'} < 0);
        }
    }

    my $removed = $started_with - $clusters->{'cluster_count'};

    print "$removed clusters removed by $filters->{file}\n";
    return 1;
}
211
|
|
|
|
|
212
|
|
|
|
# Drop query clusters that overlap a known transposable-element record.
# $queries and $tes are arrayrefs of records for one family/chr; both
# must be position sorted so a single forward sweep suffices ($filter_i
# never rewinds — two-pointer merge style).
# NOTE(review): the query window looks left/right swapped — $q_l_pos is
# built from the *right* read positions (plus one read length) and
# $q_r_pos from the *left* positions. Presumably this brackets the
# putative insertion point lying between the reciprocal +/- clusters;
# confirm against the TraFiC cluster format before changing.
sub _filter_tes {
    my ($self, $queries, $tes) = @_;
    my $filter_tot = scalar @{$tes};
    my $filter_i = 0;

    my $read_len = $self->{'readlength'};

    # work on a copy so records can be shifted off as they are consumed
    my @l_queries = @{$queries};

    my @new_clusters;
    PRIMARY: while(scalar @l_queries > 0) {
        # once the TE list is exhausted every remaining query is kept
        last if($filter_i == $filter_tot);
        my $query = shift @l_queries;

        my $q_l_pos = (min $query->[$POS_R_POS], $query->[$NEG_R_POS]) + $read_len;
        my $q_r_pos = max $query->[$POS_L_POS], $query->[$NEG_L_POS];

        while($filter_i < $filter_tot) {
            if($tes->[$filter_i]->[$POS_R_POS] < $q_l_pos) {
                # filter data is prior to query data so catch up
                $filter_i++;
                next;
            }
            if($tes->[$filter_i]->[$POS_L_POS] > $q_r_pos) {
                # filter is ahead of query so query must be valid
                push @new_clusters, $query;
                # move on to next query
                next PRIMARY;
            }
            # if we get here the filter must overlap the query
            next PRIMARY;
        }
        # TE list exhausted while scanning: this query cannot overlap
        push @new_clusters, $query;
    }
    # queries left over after the TE list ran out are all kept
    push @new_clusters, @l_queries if(scalar @l_queries > 0);

    return \@new_clusters;
}
250
|
|
|
|
|
251
|
|
|
|
# Drop query clusters that fall within two read lengths of a filter
# cluster (e.g. matched normal or germline records). $queries and
# $filters are arrayrefs of records for one family/chr; both must be
# position sorted so a single forward sweep suffices ($filter_i never
# rewinds — two-pointer merge style).
sub _filter_set {
    my ($self, $queries, $filters) = @_;

    my $filter_tot = scalar @{$filters};
    my $filter_i = 0;

    # padding applied to each filter interval on both sides
    my $double_read_len = $self->{'readlength'} * 2;

    # work on a copy so records can be shifted off as they are consumed
    my @l_queries = @{$queries};

    my @new_clusters;
    PRIMARY: while(scalar @l_queries > 0) {
        # once the filter list is exhausted every remaining query is kept
        last if($filter_i == $filter_tot);
        my $query = shift @l_queries;

        my $q_l_pos = $query->[$POS_L_POS];
        my $q_r_pos = $query->[$POS_R_POS];

        while($filter_i < $filter_tot) {
            if($filters->[$filter_i]->[$POS_R_POS]+$double_read_len < $q_l_pos) {
                # filter data is prior to query data so catch up
                $filter_i++;
                next;
            }
            if($filters->[$filter_i]->[$POS_L_POS]-$double_read_len > $q_r_pos) {
                # filter is ahead of query so query must be valid
                push @new_clusters, $query;
                # move on to next query
                next PRIMARY;
            }
            # if we get here the filter must overlap the query
            next PRIMARY;
        }
        # filter list exhausted while scanning: this query cannot overlap
        push @new_clusters, $query;
    }
    # queries left over after the filter list ran out are all kept
    push @new_clusters, @l_queries if(scalar @l_queries > 0);

    return \@new_clusters;
}
290
|
|
|
|
|
291
|
|
|
|
# Register one or more filter files; accepts a single path or an array
# ref of paths. An undef argument is a no-op. Croaks on any path that is
# not an existing plain file. Returns the total number of filter files
# registered so far.
sub add_filter_file {
    my ($self, $filter) = @_;
    if(defined $filter) {
        # Normalise scalar vs array-ref input to a single list.
        my @candidates = (ref $filter eq 'ARRAY') ? @{$filter} : ($filter);
        for my $path (@candidates) {
            croak "$path is not a valid file\n" unless(-e $path && -f $path);
            push @{$self->{'filters'}}, $path;
        }
    }
    return scalar @{$self->{'filters'}};
}
307
|
|
|
|
|
308
|
|
|
|
# Sort (unless assume_sorted is set) and load a cluster file into a
# nested hash: {family}{chr} => [ [detail columns...], ... ].
#   $input_set      - path to the cluster file.
#   $limit          - optional score threshold; meaning depends on $divergence.
#   $query_clusters - optional query structure; records whose family/chr
#                     is absent from it are skipped (nothing to filter).
#   $divergence     - when set, $limit is a maximum: records are kept
#                     only while score < limit. When unset, $limit is a
#                     minimum read count: records below it are discarded.
# Returns { cluster_count => n, clusters => \%data_set, file => $input_set }.
sub _load_clusters {
    my ($self, $input_set, $limit, $query_clusters, $divergence) = @_;

    my $cluster_file = $input_set;

    # File::Temp object kept in scope so the temp dir lives until return
    my $dir;

    unless($self->{'assume_sorted'}) {
        $dir = File::Temp->newdir('traficFilterXXXX', (DIR => $self->{'out_loc'}));
        # safest to use unix command as will be fast and lean
        my $sort_command = sprintf "sort -k 1,1 -k 3,4n -k 2,2 -T %s -S 200M -o %s %s",
                                    $dir, # temp area for sorting
                                    "$dir/sorted", # output file
                                    $input_set; # input file
        croak "Failed to sort file using command: $sort_command" unless(system($sort_command) == 0);
        $cluster_file = "$dir/sorted";
    }

    my %data_set;
    my $cluster_count = 0;
    open my $CLUSTERS, '<', $cluster_file;
    while (my $line = <$CLUSTERS>) {
        # skip header/comment lines
        next if($line =~ m/^#/);
        chomp $line;

        my ($chr, $family, @detail) = split $TAB, $line;
        if(defined $limit) {
            my $score = $detail[2]; # can be reads or divergence score
            if(defined $divergence) {
                next unless($limit > $score); # keep the record in the set
            }
            else {
                # this is only applied to reciprocal clustered data
                # NOTE(review): $detail[11] needs 12+ columns but the
                # guard only requires 11; with exactly 11 columns this
                # adds undef and dies under 'warnings FATAL => all'.
                # Confirm reciprocal rows always carry >= 12 columns.
                $score += $detail[11] if(scalar @detail > 10);
                next if($limit > $score); # discard the record
            }
        }

        # only load filter records that can act on the query set
        if(defined $query_clusters) {
            next unless(exists $query_clusters->{$family} && exists $query_clusters->{$family}->{$chr});
        }

        push @{$data_set{$family}{$chr}}, \@detail;

        $cluster_count++;
    }
    close $CLUSTERS;

    return {'cluster_count' => $cluster_count,
            'clusters' => \%data_set,
            'file' => $input_set} ;
}
360
|
|
|
|
|
361
|
|
|
|
# Override the minimum supporting reads for filter-file records.
sub set_min_filter_reads {
    my ($self, $threshold) = @_;
    # An undef argument leaves the current threshold (default 2) untouched.
    if(defined $threshold) {
        $self->{'min_filter_reads'} = $threshold;
    }
    return 1;
}
366
|
|
|
|
|
367
|
|
|
|
# Override the minimum supporting reads for germline records.
sub set_min_germline_reads {
    my ($self, $threshold) = @_;
    # An undef argument leaves the current threshold (default 5) untouched.
    if(defined $threshold) {
        $self->{'min_germline_reads'} = $threshold;
    }
    return 1;
}
372
|
|
|
|
|
373
|
|
|
|
# Override the maximum divergence score used for TE filtering.
sub set_divergence {
    my ($self, $max_divergence) = @_;
    # An undef argument leaves the current value (default 200) untouched.
    if(defined $max_divergence) {
        $self->{'divergence'} = $max_divergence;
    }
    return 1;
}
378
|
|
|
|
|
379
|
|
|
|
# Override the sequencing read length used to pad filter intervals.
sub set_readlength {
    my ($self, $length) = @_;
    # An undef argument leaves the current value (default 100) untouched.
    if(defined $length) {
        $self->{'readlength'} = $length;
    }
    return 1;
}
384
|
|
|
|
|
385
|
|
|
|
# Register the germline cluster file, validating it first.
sub set_germline_file {
    my ($self, $file) = @_;
    if(-e $file && -f $file) {
        $self->{'germline'} = $file;
        return 1;
    }
    croak "$file is not a valid file\n";
}
393
|
|
|
|
|
394
|
|
|
|
# Register the transposable element file, validating it first.
sub set_te_file {
    my ($self, $file) = @_;
    if(!(-e $file && -f $file)) {
        croak "$file is not a valid file\n";
    }
    $self->{'trans_elements'} = $file;
    return 1;
}
400
|
|
|
|
|
401
|
|
|
|
1; |
402
|
|
|
|
|
403
|
|
|
|
__END__ |
404
|
|
|
|
|
405
|
|
|
|
=head1 NAME |
406
|
|
|
|
|
407
|
|
|
|
Sanger::CGP::TraFiC::Filter |
408
|
|
|
|
|
409
|
|
|
|
=head1 SYNOPSIS |
410
|
|
|
|
|
411
|
|
|
|
use Sanger::CGP::TraFiC::Filter; |
412
|
|
|
|
my $filter = Sanger::CGP::TraFiC::Filter->new; |
413
|
|
|
|
$filter->set_output($output_dir); |
414
|
|
|
|
$filter->set_clusters($sample_clusters); |
415
|
|
|
|
|
416
|
|
|
|
$filter->add_filter_file($filter_on_this); |
417
|
|
|
|
$filter->add_filter_file($filter_on_this_too); |
418
|
|
|
|
... |
419
|
|
|
|
# OR |
420
|
|
|
|
$filter->add_filter_file(\@list_of_filter_files); |
421
|
|
|
|
|
422
|
|
|
|
$filter->filter; |
423
|
|
|
|
|
424
|
|
|
|
=head1 GENERAL |
425
|
|
|
|
|
426
|
|
|
|
Provides bulk filtering of a set of clusters. Will allow more arbitrary processing of data after |
427
|
|
|
|
main processing is completed. |
428
|
|
|
|
|
429
|
|
|
|
=head1 METHODS |
430
|
|
|
|
|
431
|
|
|
|
=head2 User functions |
432
|
|
|
|
|
433
|
|
|
|
=head3 set_output |
434
|
|
|
|
|
435
|
|
|
|
Set the output location for output files. |
436
|
|
|
|
|
437
|
|
|
|
=head3 set_clusters |
438
|
|
|
|
|
439
|
|
|
|
Specify file which contains the clusters that are of interest. |
440
|
|
|
|
|
441
|
|
|
|
=head3 add_filter_file |
442
|
|
|
|
|
443
|
|
|
|
Add files that are used to filter the content of C<set_clusters>. Accepts both a simple scalar for |
444
|
|
|
|
one file or an array reference for several files. |
445
|
|
|
|
|
446
|
|
|
|
It is recommended that the first file added is the matched normal sample as this will remove the |
447
|
|
|
|
most noise from the data. |
448
|
|
|
|
|
449
|
|
|
|
=head3 filter |
450
|
|
|
|
|
451
|
|
|
|
Filter the content of C<set_clusters> against each of the items in C<add_filter_file>. Entries in |
452
|
|
|
|
C<add_filter_file> are only sorted/loaded if data remains in the data set loaded from C<set_clusters>. |
453
|
|
|
|
|
454
|
|
|
|
=head3 output |
455
|
|
|
|
|
456
|
|
|
|
Output filtered data. Includes sorting. |
457
|
|
|
|
|
458
|
|
|
|
=head3 set_min_filter_reads / set_min_germline_reads |
459
|
|
|
|
|
460
|
|
|
|
Set the minimum number of reads that must support a filtering record for it to be used. |
461
|
|
|
|
|
462
|
|
|
|
The filter-file threshold defaults to 2 reads; the germline threshold defaults to 5. |
463
|
|
|
|
|
464
|
|
|
|
=head2 Internal functions |
465
|
|
|
|
|
466
|
|
|
|
=head3 _load_clusters |
467
|
|
|
|
|
468
|
|
|
|
Sort and then load cluster information into a data structure. |
469
|
|
|
|
|
470
|
|
|
|
Sort is always invoked to ensure that data is consistent. Most of the input should be relatively |
471
|
|
|
|
small. |