line |
bran |
sub |
pod |
code |
1
|
|
|
|
package Sanger::CGP::TraFiC::Formatter; |
2
|
|
|
|
|
3
|
|
|
|
##########LICENCE########## |
4
|
|
|
|
# Copyright (c) 2015 Genome Research Ltd. |
5
|
|
|
|
# |
6
|
|
|
|
# Author: Cancer Genome Project cgpit@sanger.ac.uk |
7
|
|
|
|
# |
8
|
|
|
|
# This file is part of TraFiC. |
9
|
|
|
|
# |
10
|
|
|
|
# TraFiC is free software: you can redistribute it and/or modify it under |
11
|
|
|
|
# the terms of the GNU Affero General Public License as published by the Free |
12
|
|
|
|
# Software Foundation; either version 3 of the License, or (at your option) any |
13
|
|
|
|
# later version. |
14
|
|
|
|
# |
15
|
|
|
|
# This program is distributed in the hope that it will be useful, but WITHOUT |
16
|
|
|
|
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
17
|
|
|
|
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
18
|
|
|
|
# details. |
19
|
|
|
|
# |
20
|
|
|
|
# You should have received a copy of the GNU Affero General Public License |
21
|
|
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>. |
22
|
|
|
|
# |
23
|
|
|
|
# 1. The usage of a range of years within a copyright statement contained within |
24
|
|
|
|
# this distribution should be interpreted as being equivalent to a list of years |
25
|
|
|
|
# including the first and last year specified and all consecutive years between |
26
|
|
|
|
# them. For example, a copyright statement that reads 'Copyright (c) 2005, 2007- |
27
|
|
|
|
# 2009, 2011-2012' should be interpreted as being identical to a statement that |
28
|
|
|
|
# reads 'Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012' and a copyright |
29
|
|
|
|
# statement that reads "Copyright (c) 2005-2012' should be interpreted as being |
30
|
|
|
|
# identical to a statement that reads 'Copyright (c) 2005, 2006, 2007, 2008, |
31
|
|
|
|
# 2009, 2010, 2011, 2012'." |
32
|
|
|
|
##########LICENCE########## |
33
|
|
|
|
|
34
|
|
|
|
|
35
|
|
2
|
|
use strict; |
36
|
|
2
|
|
use autodie qw(:all); |
37
|
|
2
|
|
use English qw( -no_match_vars ); |
38
|
|
2
|
|
use warnings FATAL => 'all'; |
39
|
|
2
|
|
use Carp qw(croak); |
40
|
|
2
|
|
use File::Path qw(make_path remove_tree); |
41
|
|
2
|
|
use File::Temp qw(tempdir); |
42
|
|
2
|
|
use File::Which qw(which); |
43
|
|
2
|
|
use Const::Fast qw(const); |
44
|
|
|
|
|
45
|
|
2
|
|
use Sanger::CGP::TraFiC; |
46
|
|
|
|
|
47
|
|
|
|
const my $TAB => "\t"; |
48
|
|
|
|
|
49
|
|
|
|
sub new { |
50
|
|
3
|
1
|
my ($class) = @_; |
51
|
|
|
|
my $self = { }; |
52
|
|
|
|
bless $self, $class; |
53
|
100
|
|
|
croak "Unable to find standard unix 'sort' in path." unless(defined which('sort')); |
54
|
|
|
|
$self->{'silent'} = 0; |
55
|
|
|
|
return $self; |
56
|
|
|
|
} |
57
|
|
|
|
|
58
|
|
|
|
sub set_output { |
59
|
|
1
|
1
|
my ($self, $root_path) = @_; |
60
|
50
|
|
|
croak "$root_path does not exist." unless(-e $root_path); |
61
|
50
|
|
|
croak "$root_path is not a directory." unless(-d $root_path); |
62
|
50
|
|
|
croak "$root_path cannot be written to." unless(-w $root_path); |
63
|
|
|
|
$self->{'temp'} = tempdir ( 'TraFiC_XXXX', DIR => $root_path ); |
64
|
|
|
|
make_path($self->{'temp'}); |
65
|
|
|
|
|
66
|
|
|
|
$self->{'+_tmp'} = "$self->{temp}/pos_hits.txt"; |
67
|
|
|
|
$self->{'-_tmp'} = "$self->{temp}/neg_hits.txt"; |
68
|
|
|
|
|
69
|
|
|
|
open my $pos, '>', $self->{'+_tmp'}; |
70
|
|
|
|
open my $neg, '>', $self->{'-_tmp'}; |
71
|
|
|
|
|
72
|
|
|
|
$self->{'+'} = $pos; |
73
|
|
|
|
$self->{'-'} = $neg; |
74
|
|
|
|
|
75
|
|
|
|
$self->{'+_final'} = "$root_path/pos_hits.txt"; |
76
|
|
|
|
$self->{'-_final'} = "$root_path/neg_hits.txt"; |
77
|
|
|
|
return 1; |
78
|
|
|
|
} |
79
|
|
|
|
|
80
|
|
|
|
sub format_rm_results { |
81
|
|
0
|
1
|
my ($self, $out_list, $fa_list) = @_; |
82
|
|
|
|
my $rm_hits = $self->load_rm_output($out_list); |
83
|
|
|
|
$self->output_matched_hits($rm_hits, $fa_list); |
84
|
|
|
|
$self->sort_hits; |
85
|
|
|
|
return 1; |
86
|
|
|
|
} |
87
|
|
|
|
|
88
|
|
|
|
sub sort_hits { |
89
|
|
1
|
0
|
my ($self) = @_; |
90
|
|
|
|
close $self->{'+'}; |
91
|
|
|
|
close $self->{'-'}; |
92
|
|
|
|
# remove the fh entries so destroy doesn't try again. |
93
|
|
|
|
delete $self->{'+'}; |
94
|
|
|
|
delete $self->{'-'}; |
95
|
|
|
|
$self->sort_file('+'); |
96
|
|
|
|
$self->sort_file('-'); |
97
|
|
|
|
return 1; |
98
|
|
|
|
} |
99
|
|
|
|
|
100
|
|
|
|
sub sort_file { |
101
|
|
2
|
0
|
my ($self, $strand) = @_; |
102
|
|
|
|
# safest to use unix command as will be fast and lean |
103
|
|
|
|
# sort -k 2,2 -k 3,3n -o file -T $self->{temp} -S BUFFER_SIZE |
104
|
|
|
|
my $sort_command = sprintf "sort -k 4,4 -k 2,2 -k 3,3n -T %s -S 200M -o %s %s", |
105
|
|
|
|
$self->{temp}, |
106
|
|
|
|
$self->{$strand.'_final'}, # output file |
107
|
|
|
|
$self->{$strand.'_tmp'}; # input file |
108
|
50
|
|
|
croak "Failed to sort file using command: $sort_command" unless(system($sort_command) == 0); |
109
|
|
|
|
return 1; |
110
|
|
|
|
} |
111
|
|
|
|
|
112
|
|
|
|
sub output_matched_hits { |
113
|
|
1
|
0
|
my ($self, $rm_hits, $fa_list) = @_; |
114
|
|
|
|
|
115
|
|
|
|
my ($readname, $chr, $pos, $class, $strand); |
116
|
|
|
|
my $remaining_hits = scalar(keys %{$rm_hits}); |
117
|
50
|
|
|
print "Correlating $remaining_hits records\n" unless($self->{'silent'}); |
118
|
|
|
|
SHORT: for my $in_fa(@{$fa_list}) { |
119
|
|
|
|
open my $FA_FILE, '<', $in_fa; |
120
|
|
|
|
while (my $line = <$FA_FILE>) { |
121
|
|
|
|
<$FA_FILE>; # discard the sequence line |
122
|
|
|
|
chomp $line; |
123
|
|
|
|
($readname, $chr, $pos, $class, $strand) = split /\t/, substr($line, 1); # ditch leading '>' |
124
|
50
|
|
|
if(exists $rm_hits->{$readname}) { |
125
|
|
|
|
print {$self->{$strand}} join $TAB, ($readname, |
126
|
|
|
|
$chr, |
127
|
|
|
|
$pos, |
128
|
|
|
|
$rm_hits->{$readname}, |
129
|
|
|
|
$class, |
130
|
|
|
|
$strand); |
131
|
|
|
|
print {$self->{$strand}} $RS; |
132
|
|
|
|
delete $rm_hits->{$readname}; # removing things from the hash will speed up subsequent lookups |
133
|
100
|
|
|
if(--$remaining_hits == 0) { |
134
|
|
|
|
close $FA_FILE; |
135
|
50
|
|
|
print "All records correlated\n" unless($self->{'silent'}); |
136
|
|
|
|
last SHORT; |
137
|
|
|
|
} |
138
|
50
|
|
|
print "\t$remaining_hits remaining\n" if($self->{'silent'} != 1 && $remaining_hits % 5000 == 0); |
139
|
|
|
|
} |
140
|
|
|
|
} |
141
|
|
|
|
close $FA_FILE; |
142
|
|
|
|
} |
143
|
|
|
|
return 1; |
144
|
|
|
|
} |
145
|
|
|
|
|
146
|
|
|
|
sub load_rm_output { |
147
|
|
1
|
1
|
my ($self, $out_list) = @_; |
148
|
|
|
|
my %rm_hits; |
149
|
|
|
|
my $last_score = 0; |
150
|
50
|
|
|
print "Loading repeat masker output\n" unless($self->{'silent'}); |
151
|
|
|
|
for my $rm_out(@{$out_list}) { |
152
|
50
|
|
|
print "\t$rm_out\n" unless($self->{'silent'}); |
153
|
|
|
|
open my $RM_FILE, '<', $rm_out; |
154
|
|
|
|
while (my $line = <$RM_FILE>) { |
155
|
|
|
|
chomp $line; |
156
|
|
|
|
$line =~ s/^\s+//; |
157
|
|
|
|
$line =~ s/\s+$//; |
158
|
|
|
|
|
159
|
|
|
|
# best hits always end with an id |
160
|
|
|
|
# alternate hits (shorter matches) are terminated with |
161
|
|
|
|
# an additional (unheaded) column containing '*' |
162
|
|
|
|
# we only want the best hits |
163
|
100
|
|
|
next unless($line =~ m/\d$/); |
164
|
|
|
|
|
165
|
|
|
|
my @bits = split /[\s\/]+/, $line; |
166
|
|
|
|
|
167
|
|
|
|
# all reads are output adjacent, however not ordered by score. |
168
|
100
|
|
|
if(exists $rm_hits{$bits[4]}) { |
169
|
|
|
|
# if this read is already stored but this record has a better score |
170
|
|
|
|
# then replace it, otherwise skip |
171
|
100
|
|
|
$rm_hits{$bits[4]} = $bits[11] if($bits[0] > $last_score); |
172
|
|
|
|
} |
173
|
|
|
|
else { |
174
|
|
|
|
$rm_hits{$bits[4]} = $bits[11]; # readname and family are only bits needed |
175
|
|
|
|
} |
176
|
|
|
|
$last_score = $bits[0]; |
177
|
|
|
|
} |
178
|
|
|
|
close $RM_FILE; |
179
|
|
|
|
} |
180
|
50
|
|
|
print "Loading complete\n" unless($self->{'silent'}); |
181
|
|
|
|
return \%rm_hits; |
182
|
|
|
|
} |
183
|
|
|
|
|
184
|
|
|
|
sub silence { |
185
|
|
1
|
1
|
shift->{'silent'} = 1; |
186
|
|
|
|
} |
187
|
|
|
|
|
188
|
|
|
|
sub unsilence { |
189
|
|
0
|
1
|
shift->{'silent'} = 0; |
190
|
|
|
|
} |
191
|
|
|
|
|
192
|
|
|
|
sub DESTROY { |
193
|
|
3
|
|
my $self = shift; |
194
|
50
|
|
|
close $self->{'+'} if(defined $self->{'+'}); |
195
|
50
|
|
|
close $self->{'-'} if(defined $self->{'-'}); |
196
|
|
|
|
|
197
|
100
|
|
|
remove_tree($self->{'temp'}) if(defined $self->{'temp'});; |
198
|
|
|
|
return 1; |
199
|
|
|
|
} |
200
|
|
|
|
|
201
|
|
|
|
|
202
|
|
|
|
1; |
203
|
|
|
|
|
204
|
|
|
|
__END__ |
205
|
|
|
|
|
206
|
|
|
|
=head1 NAME |
207
|
|
|
|
|
208
|
|
|
|
Sanger::CGP::TraFiC::Formatter |
209
|
|
|
|
|
210
|
|
|
|
=head1 SYNOPSIS |
211
|
|
|
|
|
212
|
|
|
|
use Sanger::CGP::TraFiC::Formatter; |
213
|
|
|
|
my $formatter = Sanger::CGP::TraFiC::Formatter->new; |
214
|
|
|
|
$formatter->set_output($output_dir); |
215
|
|
|
|
$formatter->format_rm_results($repeatmasker_files, $fasta_files); |
216
|
|
|
|
|
217
|
|
|
|
=head1 GENERAL |
218
|
|
|
|
|
219
|
|
|
|
Designed to take one or more repeatmasker *.out files along with the input files to generate two |
220
|
|
|
|
output files (positive anchors, negative anchors) containing the full information about the mapped |
221
|
|
|
|
end of those reads that were successfully processed by repeat masker. |
222
|
|
|
|
|
223
|
|
|
|
All the data required is encoded in the fasta header for each read: |
224
|
|
|
|
|
225
|
|
|
|
=over |
226
|
|
|
|
|
227
|
|
|
|
=item 1. readname |
228
|
|
|
|
|
229
|
|
|
|
=item 2. chr |
230
|
|
|
|
|
231
|
|
|
|
=item 3. pos |
232
|
|
|
|
|
233
|
|
|
|
=item 4. repeat family |
234
|
|
|
|
|
235
|
|
|
|
=item 5. strand (+/-) |
236
|
|
|
|
|
237
|
|
|
|
=back |
238
|
|
|
|
|
239
|
|
|
|
2,3 and 5 are based on the data for the mapped end of the pair. |
240
|
|
|
|
|
241
|
|
|
|
This means we only need to store the readname and family of repeat in memory from the rm output as |
242
|
|
|
|
the rest can be read back from the rm input files with no memory overhead. |
243
|
|
|
|
|
244
|
|
|
|
=head1 METHODS |
245
|
|
|
|
|
246
|
|
|
|
=head2 Constructor/configuration |
247
|
|
|
|
|
248
|
|
|
|
=head3 new |
249
|
|
|
|
|
250
|
|
|
|
=over |
251
|
|
|
|
|
252
|
|
|
|
=item Description |
253
|
|
|
|
|
254
|
|
|
|
Initialises the object and ensures that unix sort is available in path. |
255
|
|
|
|
|
256
|
|
|
|
my $formatter = Sanger::CGP::TraFiC::Formatter->new; |
257
|
|
|
|
|
258
|
|
|
|
=item Errors |
259
|
|
|
|
|
260
|
|
|
|
Will C<croak> if unable to find C<sort> in path with this message: |
261
|
|
|
|
|
262
|
|
|
|
Unable to find standard unix 'sort' in path |
263
|
|
|
|
|
264
|
|
|
|
=back |
265
|
|
|
|
|
266
|
|
|
|
=head3 set_output |
267
|
|
|
|
|
268
|
|
|
|
=over |
269
|
|
|
|
|
270
|
|
|
|
=item Description |
271
|
|
|
|
|
272
|
|
|
|
Specify the output folder for collated results. |
273
|
|
|
|
|
274
|
|
|
|
$formatter->set_output($output_path); |
275
|
|
|
|
|
276
|
|
|
|
=back |
277
|
|
|
|
|
278
|
|
|
|
=head3 silence |
279
|
|
|
|
|
280
|
|
|
|
=over |
281
|
|
|
|
|
282
|
|
|
|
=item Description |
283
|
|
|
|
|
284
|
|
|
|
Allows user to silence internal messages. Mainly added for testing system |
285
|
|
|
|
|
286
|
|
|
|
$formatter->silence; |
287
|
|
|
|
|
288
|
|
|
|
=back |
289
|
|
|
|
|
290
|
|
|
|
=head3 unsilence |
291
|
|
|
|
|
292
|
|
|
|
=over |
293
|
|
|
|
|
294
|
|
|
|
=item Decription |
295
|
|
|
|
|
296
|
|
|
|
Allows user resume internal messages. |
297
|
|
|
|
|
298
|
|
|
|
$formatter->unsilence; |
299
|
|
|
|
|
300
|
|
|
|
=back |
301
|
|
|
|
|
302
|
|
|
|
=head2 Processing |
303
|
|
|
|
|
304
|
|
|
|
=head3 format_rm_results |
305
|
|
|
|
|
306
|
|
|
|
=over |
307
|
|
|
|
|
308
|
|
|
|
=item Description |
309
|
|
|
|
|
310
|
|
|
|
Main processing function. C<set_output> must be called prior to this. |
311
|
|
|
|
|
312
|
|
|
|
$formatter->format_rm_results(\@rm_out_files, \@rm_in_fasta); |
313
|
|
|
|
|
314
|
|
|
|
=item Args |
315
|
|
|
|
|
316
|
|
|
|
rm_out_files - \@ of repeat masker output files (*.out). |
317
|
|
|
|
rm_in_fasta - \@ reference of the fasta files presented to repeat masker. |
318
|
|
|
|
|
319
|
|
|
|
=item Returns |
320
|
|
|
|
|
321
|
|
|
|
Nothing is returned, results are written to the specified output location as pos/neg_hits.txt |
322
|
|
|
|
|
323
|
|
|
|
=back |
324
|
|
|
|
|
325
|
|
|
|
=head3 load_rm_output |
326
|
|
|
|
|
327
|
|
|
|
=over |
328
|
|
|
|
|
329
|
|
|
|
=item Description |
330
|
|
|
|
|
331
|
|
|
|
Load the repeat-masker output file data. Only best/longest hits are retained. Not really intended |
332
|
|
|
|
for external use, but no reason you can't use it if you want to parse a RM file for the basic data |
333
|
|
|
|
retrieved. |
334
|
|
|
|
|
335
|
|
|
|
=item Args |
336
|
|
|
|
|
337
|
|
|
|
rm_out_files - \@ of repeat masker output files (*.out). |
338
|
|
|
|
|
339
|
|
|
|
=item Returns |
340
|
|
|
|
|
341
|
|
|
|
\% where key is readname and value is the family that masked this read. |
342
|
|
|
|
|
343
|
|
|
|
=back |
344
|
|
|
|
|
345
|
|
|
|
|