File Coverage

lib/Sanger/CGP/TraFiC/Formatter.pm
Criterion Covered Total %
branch 25 38 65.7
subroutine 18 20 90.0
pod 6 9 66.6
total 49 67 73.1


line bran sub pod code
1       package Sanger::CGP::TraFiC::Formatter;
2        
3       ##########LICENCE##########
4       # Copyright (c) 2015 Genome Research Ltd.
5       #
6       # Author: Cancer Genome Project cgpit@sanger.ac.uk
7       #
8       # This file is part of TraFiC.
9       #
10       # TraFiC is free software: you can redistribute it and/or modify it under
11       # the terms of the GNU Affero General Public License as published by the Free
12       # Software Foundation; either version 3 of the License, or (at your option) any
13       # later version.
14       #
15       # This program is distributed in the hope that it will be useful, but WITHOUT
16       # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17       # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
18       # details.
19       #
20       # You should have received a copy of the GNU Affero General Public License
21       # along with this program. If not, see <http://www.gnu.org/licenses/>.
22       #
23       # 1. The usage of a range of years within a copyright statement contained within
24       # this distribution should be interpreted as being equivalent to a list of years
25       # including the first and last year specified and all consecutive years between
26       # them. For example, a copyright statement that reads 'Copyright (c) 2005, 2007-
27       # 2009, 2011-2012' should be interpreted as being identical to a statement that
28       # reads 'Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012' and a copyright
29       # statement that reads "Copyright (c) 2005-2012' should be interpreted as being
30       # identical to a statement that reads 'Copyright (c) 2005, 2006, 2007, 2008,
31       # 2009, 2010, 2011, 2012'."
32       ##########LICENCE##########
33        
34        
35   2   use strict;
36   2   use autodie qw(:all);
37   2   use English qw( -no_match_vars );
38   2   use warnings FATAL => 'all';
39   2   use Carp qw(croak);
40   2   use File::Path qw(make_path remove_tree);
41   2   use File::Temp qw(tempdir);
42   2   use File::Which qw(which);
43   2   use Const::Fast qw(const);
44        
45   2   use Sanger::CGP::TraFiC;
46        
47       const my $TAB => "\t";
48        
49       sub new {
50   3 1 my ($class) = @_;
51       my $self = { };
52       bless $self, $class;
53 100     croak "Unable to find standard unix 'sort' in path." unless(defined which('sort'));
54       $self->{'silent'} = 0;
55       return $self;
56       }
57        
58       sub set_output {
59   1 1 my ($self, $root_path) = @_;
60 50     croak "$root_path does not exist." unless(-e $root_path);
61 50     croak "$root_path is not a directory." unless(-d $root_path);
62 50     croak "$root_path cannot be written to." unless(-w $root_path);
63       $self->{'temp'} = tempdir ( 'TraFiC_XXXX', DIR => $root_path );
64       make_path($self->{'temp'});
65        
66       $self->{'+_tmp'} = "$self->{temp}/pos_hits.txt";
67       $self->{'-_tmp'} = "$self->{temp}/neg_hits.txt";
68        
69       open my $pos, '>', $self->{'+_tmp'};
70       open my $neg, '>', $self->{'-_tmp'};
71        
72       $self->{'+'} = $pos;
73       $self->{'-'} = $neg;
74        
75       $self->{'+_final'} = "$root_path/pos_hits.txt";
76       $self->{'-_final'} = "$root_path/neg_hits.txt";
77       return 1;
78       }
79        
80       sub format_rm_results {
81   0 1 my ($self, $out_list, $fa_list) = @_;
82       my $rm_hits = $self->load_rm_output($out_list);
83       $self->output_matched_hits($rm_hits, $fa_list);
84       $self->sort_hits;
85       return 1;
86       }
87        
88       sub sort_hits {
89   1 0 my ($self) = @_;
90       close $self->{'+'};
91       close $self->{'-'};
92       # remove the fh entries so destroy doesn't try again.
93       delete $self->{'+'};
94       delete $self->{'-'};
95       $self->sort_file('+');
96       $self->sort_file('-');
97       return 1;
98       }
99        
100       sub sort_file {
101   2 0 my ($self, $strand) = @_;
102       # safest to use unix command as will be fast and lean
103       # sort -k 2,2 -k 3,3n -o file -T $self->{temp} -S BUFFER_SIZE
104       my $sort_command = sprintf "sort -k 4,4 -k 2,2 -k 3,3n -T %s -S 200M -o %s %s",
105       $self->{temp},
106       $self->{$strand.'_final'}, # output file
107       $self->{$strand.'_tmp'}; # input file
108 50     croak "Failed to sort file using command: $sort_command" unless(system($sort_command) == 0);
109       return 1;
110       }
111        
112       sub output_matched_hits {
113   1 0 my ($self, $rm_hits, $fa_list) = @_;
114        
115       my ($readname, $chr, $pos, $class, $strand);
116       my $remaining_hits = scalar(keys %{$rm_hits});
117 50     print "Correlating $remaining_hits records\n" unless($self->{'silent'});
118       SHORT: for my $in_fa(@{$fa_list}) {
119       open my $FA_FILE, '<', $in_fa;
120       while (my $line = <$FA_FILE>) {
121       <$FA_FILE>; # discard the sequence line
122       chomp $line;
123       ($readname, $chr, $pos, $class, $strand) = split /\t/, substr($line, 1); # ditch leading '>'
124 50     if(exists $rm_hits->{$readname}) {
125       print {$self->{$strand}} join $TAB, ($readname,
126       $chr,
127       $pos,
128       $rm_hits->{$readname},
129       $class,
130       $strand);
131       print {$self->{$strand}} $RS;
132       delete $rm_hits->{$readname}; # removing things from the hash will speed up subsequent lookups
133 100     if(--$remaining_hits == 0) {
134       close $FA_FILE;
135 50     print "All records correlated\n" unless($self->{'silent'});
136       last SHORT;
137       }
138 50     print "\t$remaining_hits remaining\n" if($self->{'silent'} != 1 && $remaining_hits % 5000 == 0);
139       }
140       }
141       close $FA_FILE;
142       }
143       return 1;
144       }
145        
146       sub load_rm_output {
147   1 1 my ($self, $out_list) = @_;
148       my %rm_hits;
149       my $last_score = 0;
150 50     print "Loading repeat masker output\n" unless($self->{'silent'});
151       for my $rm_out(@{$out_list}) {
152 50     print "\t$rm_out\n" unless($self->{'silent'});
153       open my $RM_FILE, '<', $rm_out;
154       while (my $line = <$RM_FILE>) {
155       chomp $line;
156       $line =~ s/^\s+//;
157       $line =~ s/\s+$//;
158        
159       # best hits always end with an id
160       # alternate hits (shorter matches) are terminated with
161       # an additional (unheaded) column containing '*'
162       # we only want the best hits
163 100     next unless($line =~ m/\d$/);
164        
165       my @bits = split /[\s\/]+/, $line;
166        
167       # all reads are output adjacent, however not ordered by score.
168 100     if(exists $rm_hits{$bits[4]}) {
169       # if this read is already stored but this record has a better score
170       # then replace it, otherwise skip
171 100     $rm_hits{$bits[4]} = $bits[11] if($bits[0] > $last_score);
172       }
173       else {
174       $rm_hits{$bits[4]} = $bits[11]; # readname and family are only bits needed
175       }
176       $last_score = $bits[0];
177       }
178       close $RM_FILE;
179       }
180 50     print "Loading complete\n" unless($self->{'silent'});
181       return \%rm_hits;
182       }
183        
184       sub silence {
185   1 1 shift->{'silent'} = 1;
186       }
187        
188       sub unsilence {
189   0 1 shift->{'silent'} = 0;
190       }
191        
192       sub DESTROY {
193   3   my $self = shift;
194 50     close $self->{'+'} if(defined $self->{'+'});
195 50     close $self->{'-'} if(defined $self->{'-'});
196        
197 100     remove_tree($self->{'temp'}) if(defined $self->{'temp'});;
198       return 1;
199       }
200        
201        
202       1;
203        
204       __END__
205        
206       =head1 NAME
207        
208       Sanger::CGP::TraFiC::Formatter
209        
210       =head1 SYNOPSIS
211        
212       use Sanger::CGP::TraFiC::Formatter;
213       my $formatter = Sanger::CGP::TraFiC::Formatter->new;
214       $formatter->set_output($output_dir);
215       $formatter->format_rm_results($repeatmasker_files, $fasta_files);
216        
217       =head1 GENERAL
218        
219       Designed to take one or more repeatmasker *.out files along with the input files to generate two
220       output files (positive anchors, negative anchors) containing the full information about the mapped
221       end of those reads that were successfully processed by repeat masker.
222        
223       All the data required is encoded in the fasta header for each read:
224        
225       =over
226        
227       =item 1. readname
228        
229       =item 2. chr
230        
231       =item 3. pos
232        
233       =item 4. repeat family
234        
235       =item 5. strand (+/-)
236        
237       =back
238        
239       2,3 and 5 are based on the data for the mapped end of the pair.
240        
241       This means we only need to store the readname and family of repeat in memory from the rm output as
242       the rest can be read back from the rm input files with no memory overhead.
243        
244       =head1 METHODS
245        
246       =head2 Constructor/configuration
247        
248       =head3 new
249        
250       =over
251        
252       =item Description
253        
254       Initialises the object and ensures that unix sort is available in path.
255        
256       my $formatter = Sanger::CGP::TraFiC::Formatter->new;
257        
258       =item Errors
259        
260       Will C<croak> if unable to find C<sort> in path with this message:
261        
262       Unable to find standard unix 'sort' in path
263        
264       =back
265        
266       =head3 set_output
267        
268       =over
269        
270       =item Description
271        
272       Specify the output folder for collated results.
273        
274       $formatter->set_output($output_path);
275        
276       =back
277        
278       =head3 silence
279        
280       =over
281        
282       =item Description
283        
284       Allows user to silence internal messages. Mainly added for testing system
285        
286       $formatter->silence;
287        
288       =back
289        
290       =head3 unsilence
291        
292       =over
293        
294       =item Decription
295        
296       Allows user resume internal messages.
297        
298       $formatter->unsilence;
299        
300       =back
301        
302       =head2 Processing
303        
304       =head3 format_rm_results
305        
306       =over
307        
308       =item Description
309        
310       Main processing function. C<set_output> must be called prior to this.
311        
312       $formatter->format_rm_results(\@rm_out_files, \@rm_in_fasta);
313        
314       =item Args
315        
316       rm_out_files - \@ of repeat masker output files (*.out).
317       rm_in_fasta - \@ reference of the fasta files presented to repeat masker.
318        
319       =item Returns
320        
321       Nothing is returned, results are written to the specified output location as pos/neg_hits.txt
322        
323       =back
324        
325       =head3 load_rm_output
326        
327       =over
328        
329       =item Description
330        
331       Load the repeat-masker output file data. Only best/longest hits are retained. Not really intended
332       for external use, but no reason you can't use it if you want to parse a RM file for the basic data
333       retrieved.
334        
335       =item Args
336        
337       rm_out_files - \@ of repeat masker output files (*.out).
338        
339       =item Returns
340        
341       \% where key is readname and value is the family that masked this read.
342        
343       =back
344        
345