File Coverage

lib/Sanger/CGP/TraFiC/Cluster.pm
Criterion Covered Total %
branch 23 32 71.8
subroutine 20 21 95.2
pod 5 12 41.6
total 48 65 73.8


line bran sub pod code
1       package Sanger::CGP::TraFiC::Cluster;
2        
3       ##########LICENCE##########
4       # Copyright (c) 2015 Genome Research Ltd.
5       #
6       # Author: Cancer Genome Project cgpit@sanger.ac.uk
7       #
8       # This file is part of TraFiC.
9       #
10       # TraFiC is free software: you can redistribute it and/or modify it under
11       # the terms of the GNU Affero General Public License as published by the Free
12       # Software Foundation; either version 3 of the License, or (at your option) any
13       # later version.
14       #
15       # This program is distributed in the hope that it will be useful, but WITHOUT
16       # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17       # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
18       # details.
19       #
20       # You should have received a copy of the GNU Affero General Public License
21       # along with this program. If not, see <http://www.gnu.org/licenses/>.
22       #
23       # 1. The usage of a range of years within a copyright statement contained within
24       # this distribution should be interpreted as being equivalent to a list of years
25       # including the first and last year specified and all consecutive years between
26       # them. For example, a copyright statement that reads 'Copyright (c) 2005, 2007-
27       # 2009, 2011-2012' should be interpreted as being identical to a statement that
28       # reads 'Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012' and a copyright
29       # statement that reads "Copyright (c) 2005-2012' should be interpreted as being
30       # identical to a statement that reads 'Copyright (c) 2005, 2006, 2007, 2008,
31       # 2009, 2010, 2011, 2012'."
32       ##########LICENCE##########
33        
34        
35   2   use strict;
36   2   use autodie qw(:all);
37   2   use English qw( -no_match_vars );
38   2   use warnings FATAL => 'all';
39   2   use Carp qw(croak);
40   2   use Const::Fast qw(const);
41   2   use File::Which qw(which);
42        
43   2   use Sanger::CGP::TraFiC;
44        
45       const my $TAB => "\t";
46        
47       const my @CLASSES => qw(SINGLE_END INTER_CHROM ABERRANT);
48       const my @HEADER_ROOT => qw(CHR FAMILY);
49       const my @HEADER_MAIN => qw(L_POS R_POS TOTAL_READS SINGLE_END_COUNT INTER_CHROM_COUNT ABERRANT_COUNT SINGLE_END_READS INTER_CHROM_READS ABERRANT_READS);
50        
51       sub new {
52   2 1 my ($class) = @_;
53 50     croak "Unable to find standard unix 'sort' in path." unless(defined which('sort'));
54       my $self = { 'min_reads' => 5,
55       'paired_alu' => 0,};
56       bless $self, $class;
57       return $self;
58       }
59        
60       sub reciprocal_clusters {
61   0 1 my $self = shift;
62       # results are small so load into hash structures to simplify searching by family/chr
63       my $pos = $self->clustered_struct('+');
64       my $neg = $self->clustered_struct('-');
65       $self->reciprocal_hits($pos, $neg);
66       return 1;
67       }
68        
69       sub reciprocal_hits {
70   1 0 my ($self, $pos, $neg) = @_;
71       # keyed as family->chr
72       # if family in pos not found in neg then can skip the whole block
73       # if no chr for a family then you can skip that too...
74       my @reciprocal_clusters;
75       my @families = sort keys %{$pos};
76       for my $family(@families) {
77 50     unless(exists $neg->{$family}) {
78       delete $pos->{$family};
79       next;
80       }
81       my @chrs = sort keys %{$pos->{$family}};
82       for my $chr(@chrs) {
83 100     unless(exists $neg->{$family}->{$chr}) {
84       delete $pos->{$family}->{$chr};
85       next;
86       }
87       push @reciprocal_clusters, @{$self->compare_hits($chr, $family, $pos->{$family}->{$chr}, $neg->{$family}->{$chr})};
88       }
89       }
90       $self->output_reciprocal(\@reciprocal_clusters);
91       return 1;
92       }
93        
94       sub output_reciprocal {
95   1 0 my ($self, $clusters) = @_;
96        
97       # output clusters
98       open my $RECIP, '>', "$self->{reciprocal_clusters}.raw";
99       for my $cluster(@{$clusters}) {
100       my ($chr, $family, $pos, $neg) = @{$cluster};
101       print $RECIP (join $TAB,
102       $chr, $family, # shared
103       $pos->{'l_pos'}, $pos->{'r_pos'}, @{$pos->{'detail'}}, # pos cluster info
104       $neg->{'l_pos'}, $neg->{'r_pos'}, @{$neg->{'detail'}}) # neg cluster info
105       ,$RS;
106       }
107       close $RECIP;
108        
109       # sort clusters
110       # safest to use unix command as will be fast and lean
111       my $sort_command = sprintf "sort -k 1,1 -k 3,4n -k 2,2 -o %s %s",
112       "$self->{reciprocal_clusters}.sort", # output file
113       "$self->{reciprocal_clusters}.raw"; # input file
114 50     croak "Failed to sort file using command: $sort_command" unless(system($sort_command) == 0);
115        
116       # write header
117       open $RECIP, '>', $self->{'reciprocal_clusters'};
118       print $RECIP '#',(join $TAB, @HEADER_ROOT);
119       for(@HEADER_MAIN) {
120       print $RECIP $TAB,'P_',$_;
121       }
122       for(@HEADER_MAIN) {
123       print $RECIP $TAB,'N_',$_;
124       }
125       print $RECIP $RS;
126        
127       # append the sorted results
128       open my $SORT, '<', "$self->{reciprocal_clusters}.sort";
129       while(<$SORT>) { print $RECIP $_; }
130       close $SORT;
131       close $RECIP;
132        
133       # cleanup intermediate files
134       unlink "$self->{reciprocal_clusters}.raw";
135       unlink "$self->{reciprocal_clusters}.sort";
136       return 1;
137       }
138        
139       sub compare_hits {
140   2 0 my ($self, $chr, $family, $pos_hits, $neg_hits) = @_;
141       my $neg_tot = scalar @{$neg_hits};
142       my $neg_i = 0;
143       my @reciprocal;
144       PRIMARY: for my $pos(@{$pos_hits}) {
145       # if pos l_pos is > neg r_pos+200
146       while($neg_i < $neg_tot) {
147 100     if($neg_hits->[$neg_i]->{'l_pos'} < $pos->{'r_pos'}) {
148       $neg_i++;
149       next;
150       }
151 50     if($neg_hits->[$neg_i]->{'r_pos'}+200 < $pos->{'l_pos'}) {
152       # neg data is prior to pos data so catch up
153       $neg_i++;
154       next;
155       }
156 50     if($neg_hits->[$neg_i]->{'l_pos'}-200 > $pos->{'r_pos'}) {
157       # neg is ahead of pos so catchup pos and re-check this entry
158       next PRIMARY;
159       }
160       push @reciprocal, [$chr, $family, $pos, $neg_hits->[$neg_i]];
161       $neg_i++;
162       }
163       }
164       return \@reciprocal;
165       }
166        
167       sub clustered_struct {
168   2 0 my ($self, $strand) = @_;
169       my %clusters;
170       open my $CLUSTERS, '<', $self->{$strand.'_clusters'};
171       while (my $cluster = <$CLUSTERS>) {
172 100     next if(substr($cluster, 0,1) eq '#');
173       chomp $cluster;
174       my ($chr, $family, $l_pos, $r_pos, @detail) = split /\t/, $cluster;
175       push @{$clusters{$family}{$chr}}, { 'l_pos' => int $l_pos,
176       'r_pos' => int $r_pos,
177       'detail' => \@detail,
178       };
179       }
180       close $CLUSTERS;
181       return \%clusters;
182       }
183        
184       sub cluster_hits {
185   2 1 my ($self) = @_;
186       $self->cluster_file('+');
187       $self->cluster_file('-');
188       return 1;
189       }
190        
191       sub cluster_file {
192   8 0 my ($self, $strand) = @_;
193       open my $HITS, '<', $self->{$strand.'_sorted_hits'};
194       my ($p_chr, $p_pos, $p_fam) = (q{}, 0, q{});
195       my $cluster = [];
196       open my $CLUSTERED, '>', $self->{$strand.'_clusters'};
197       print $CLUSTERED '#',(join $TAB, @HEADER_ROOT, @HEADER_MAIN),$RS;
198       while (my $line = <$HITS>) {
199       chomp $line;
200       my ($rn, $chr, $pos, $fam, $class) = split $TAB, $line;
201 100     next if($fam eq 'Alu' && $class ne 'INTER_CHROM' && $self->{'paired_alu'} == 1);
202        
203 100     if($p_chr eq $chr && $p_fam eq $fam && ($pos - $p_pos) <= 200) {
204       push @{$cluster}, [$rn, $chr, $pos, $class];
205       }
206       else {
207       $cluster = $self->finalise_cluster($cluster, $p_fam, $strand, $CLUSTERED);
208       push @{$cluster}, [$rn, $chr, $pos, $class];
209       }
210       ($p_chr, $p_pos, $p_fam) = ($chr, $pos, $fam);
211       }
212       $self->finalise_cluster($cluster, $p_fam, $strand, $CLUSTERED);
213       close $CLUSTERED;
214       close $HITS;
215       return 1;
216       }
217        
218       sub finalise_cluster {
219   102408 0 my ($self, $cluster, $fam, $strand, $CLUSTERED) = @_;
220 100     if(scalar @{$cluster} >= $self->{'min_reads'}) {
221       my %classes = map { $_ => [] } @CLASSES;
222       for my $item(@{$cluster}) {
223       push @{$classes{$item->[3]}}, $item->[0];
224       }
225       # chr, family, l_pos, r_pos, read_count_*, read_names_*
226       print $CLUSTERED (join $TAB, $cluster->[0]->[1],
227       $fam,
228       $cluster->[0]->[2],
229       $cluster->[-1]->[2],
230       scalar @{$classes{'SINGLE_END'}} + scalar @{$classes{'INTER_CHROM'}} + scalar @{$classes{'ABERRANT'}},
231       scalar @{$classes{'SINGLE_END'}},
232       scalar @{$classes{'INTER_CHROM'}},
233       scalar @{$classes{'ABERRANT'}},
234       ${_format_readnames($classes{'SINGLE_END'})},
235       ${_format_readnames($classes{'INTER_CHROM'})},
236       ${_format_readnames($classes{'ABERRANT'})},
237       ),$RS;
238       }
239       $cluster = [];
240       return $cluster;
241       }
242        
243       sub _format_readnames {
244   170   my $reads = shift;
245       my $ret_val = q{.};
246 100     if(defined $reads && @{$reads} > 0) {
247       $ret_val = join ',', @{$reads};
248       }
249       return \$ret_val
250       }
251        
252       sub set_min_reads {
253   6 1 my ($self, $min_reads) = @_;
254       $self->{'min_reads'} = $min_reads || 5;
255       return 1;
256       }
257        
258       sub set_paired_alu {
259   2 0 my ($self, $paired_alu) = @_;
260 50     $self->{'paired_alu'} = 1 if(defined $paired_alu);
261       return 1;
262       }
263        
264       sub set_input_output {
265   1 1 my ($self, $root_path) = @_;
266 50     croak "$root_path does not exist." unless(-e $root_path);
267 50     croak "$root_path is not a directory." unless(-d $root_path);
268 50     croak "$root_path cannot be written to." unless(-w $root_path);
269        
270       $self->{'+_sorted_hits'} = "$root_path/pos_hits.txt";
271       $self->{'-_sorted_hits'} = "$root_path/neg_hits.txt";
272        
273       $self->{'+_clusters'} = "$root_path/pos_clusters.txt";
274       $self->{'-_clusters'} = "$root_path/neg_clusters.txt";
275       $self->{'reciprocal_clusters'} = "$root_path/reciprocal_clusters.txt";
276        
277       return 1;
278       }
279        
280        
281       1;
282        
283       __END__
284        
285       =head1 NAME
286        
287       Sanger::CGP::TraFiC::Cluster
288        
289       =head1 SYNOPSIS
290        
291       my $cluster = Sanger::CGP::TraFiC::Cluster->new;
292        
293       $cluster->set_input_output($output_folder);
294       # which should already contain data from Formatter.pm
295        
296       $cluster->set_min_reads($minimum_reads_required);
297       $cluster->cluster_hits;
298       $cluster->reciprocal_clusters;
299        
300       =head1 GENERAL
301        
302       Take the merged repeat-masker *.out and fasta data generated by Formatter.pm and generate clusters
303       of nearby/overlapping events.
304        
305       Generates three files:
306        
307       =over
308        
309       =item pos_clusters.txt
310        
311       Data for reads anchored on positive strand that are able to be clustered with greater or equal to
312       the value of I<min_reads>.
313        
314       =item neg_clusters.txt
315        
316       Data for reads anchored on negative strand that are able to be clustered with greater or equal to
317       the value of I<min_reads>.
318        
319       =item reciprocal_clusters.txt
320        
321       Where entries in pos/neg_clusters.txt support each other they are additionally output here. The
322       pos_cluster is always the primary entry.
323        
324       =back
325        
326       =head2 Format of output
327        
328       The output files all follow the same format
329        
330       =head3 Core fields
331        
332       These are present at the start of each line in every file.
333        
334       =over
335        
336       =item CHR
337        
338       Chromosome or sequence identifier.
339        
340       =item FAMILY
341        
342       Masking identified reads as hitting repeat of this type.
343        
344       =back
345        
346       =head3 Data fields
347        
348       These are present in the pos/neg_clusters.txt files as described here.
349        
350       In the reciprocal_clusters.txt file these are presented twice, the positive cluster first (prefixed B<P_>)
351       and then the negative cluster data (prefixed B<N_>).
352        
353       =over
354        
355       =item L_POS
356        
357       Left most position of cluster.
358        
359       =item R_POS
360        
361       Left most position of cluster.
362        
363       =item TOTAL_READS
364        
365       Total number of reads that support this cluster
366        
367       =item SINGLE_END_COUNT
368        
369       Number of single end mapped reads that contribute to this cluster.
370        
371       =item INTER_CHROM_COUNT
372        
373       Number of inter-chromosomal mapped reads that contribute to this cluster.
374        
375       =item ABERRANT_COUNT
376        
377       Number of aberrantly paired reads that contribute to this cluster.
378        
379       =item SINGLE_END_READS
380        
381       Names of reads contributing to I<SINGLE_END_COUNT>.
382        
383       =item INTER_CHROM_READS
384        
385       Names of reads contributing to I<INTER_CHROM_COUNT>.
386        
387       =item ABERRANT_READS
388        
389       Names of reads contributing to I<ABERRANT_COUNT>.
390        
391       =back
392        
393       =head1 METHODS
394        
395       =head2 Constructor/configuration
396        
397       =head3 new
398        
399       No options, sets up object with default values.
400        
401       =head3 set_input_output
402        
403       =over
404        
405       =item Description
406        
407       Define the path to the input files (generated by L<Sanger::CGP::TraFiC::Formatter>). The output will
408       be written to this area as it is expected for these file to be generated in a single step.
409        
410       =item Args
411        
412       Path to output of L<Sanger::CGP::TraFiC::Formatter>
413        
414       =back
415        
416       =head3 set_min_reads
417        
418       =over
419        
420       =item Description
421        
422       Define the minimum number of reads required to generate a cluster. Defaults to 5.
423        
424       =item Args
425        
426       Integer, number of reads that must exist within a cluster.
427        
428       =back
429        
430       =head2 Processing
431        
432       =head3 cluster_hits
433        
434       =over
435        
436       =item Description
437        
438       Load the hit files, cluster and output in L<cluster format|Format of output>.
439        
440       =item Note
441        
442       No arguments as paths are determined at higher level.
443        
444       =back
445        
446       =head3 reciprocal_clusters
447        
448       =over
449        
450       =item Description
451        
452       Compare the clustered data for pos and neg data identifying reciprocal entities. A reciprocal output
453       file is generated. See L<formatting section|Format of output>.
454        
455       =item Note
456        
457       No arguments as paths are determined at higher level.
458        
459       =back