Skip to content

Commit

Permalink
Fixed an untested regression in Bio::DB::Fasta
Browse files Browse the repository at this point in the history
  • Loading branch information
fangly committed Sep 24, 2012
1 parent a5e6170 commit 44b28c0
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 30 deletions.
11 changes: 4 additions & 7 deletions Bio/DB/Fasta.pm
Expand Up @@ -132,12 +132,10 @@ package Bio::DB::Fasta;
use strict;
use IO::File;
use File::Spec;
use File::Basename qw(basename dirname);
use Bio::PrimarySeqI;

use base qw(Bio::DB::IndexedBase);

my $termination_length;
our $obj_class = 'Bio::PrimarySeq::Fasta';
our $file_glob = '*.{fa,FA,fasta,FASTA,fast,FAST,dna,DNA,fna,FNA,faa,FAA,fsa,FSA}';

Expand All @@ -157,8 +155,7 @@ our $file_glob = '*.{fa,FA,fasta,FASTA,fast,FAST,dna,DNA,fna,FNA,faa,FAA,fsa,FSA

sub _calculate_offsets {
# Bio::DB::IndexedBase calls this to calculate offsets
my ($self, $file, $offsets) = @_;
my $fileno = $self->_path2fileno(basename($file));
my ($self, $fileno, $file, $offsets) = @_;

my $fh = IO::File->new($file) or $self->throw( "Could not open $file: $!");
binmode $fh;
Expand All @@ -167,9 +164,9 @@ sub _calculate_offsets {
$last_line, %offsets);
my ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);

my $termination_length = $self->{termination_length};
while (my $line = <$fh>) {
# Account for crlf-terminated Windows files
$termination_length ||= ($line =~ /\r\n$/) ? 2 : 1;
if (index($line, '>') == 0) {
if ($line =~ /^>(\S+)/) {
print STDERR "Indexed $count sequences...\n"
Expand Down Expand Up @@ -284,8 +281,8 @@ sub subseq {
my $data;

my $fh = $self->_fh($id) or return;
my $filestart = $self->_caloffset($id, $start, $termination_length);
my $filestop = $self->_caloffset($id, $stop , $termination_length);
my $filestart = $self->_calc_offset($id, $start);
my $filestop = $self->_calc_offset($id, $stop );

seek($fh, $filestart,0);
read($fh, $data, $filestop-$filestart+1);
Expand Down
51 changes: 36 additions & 15 deletions Bio/DB/IndexedBase.pm
Expand Up @@ -203,10 +203,12 @@ When a sequence is deleted from one of the files, this deletion is not detected
by the module and removed from the index. As a result, a "ghost" entry will
remain in the index and will return garbage results if accessed.
Currently, the only way to accommodate sequence removal is to rebuild the entire
index, either by deleting it manually, or by passing -reindex=E<gt>1 to new()
when initializing the module.
Also, if you are indexing a directory, it is wise to not add or remove files
from it.
In case you have changed the files in a directory, or the sequences in a file,
you can to rebuild the entire index, either by deleting it manually, or by
passing -reindex=E<gt>1 to new() when initializing the module.
=head1 SEE ALSO
Expand Down Expand Up @@ -247,7 +249,7 @@ use IO::File;
use AnyDBM_File;
use Fcntl;
use File::Spec;
use File::Basename qw(dirname);
use File::Basename qw(basename dirname);
use Bio::PrimarySeq;

use base qw(Bio::DB::SeqI);
Expand Down Expand Up @@ -603,19 +605,22 @@ sub _index_files {

$self->_set_pack_method( @$files );

# get name of index file
# Get name of index file
my $index = $self->index_name;

# if caller has requested reindexing, unlink the index file.
# If caller has requested reindexing, unlink the index file.
unlink $index if $force_reindex;

# get the modification time of the index
# Get the modification time of the index
my $indextime = (stat $index)[9] || 0;

# list recently updated files
# Register files and find if there has been any update
my $modtime = 0;
my @updated;
for my $file (@$files) {
# Register file
$self->_path2fileno(basename($file));
# Any update?
my $m = (stat $file)[9] || 0;
if ($m > $modtime) {
$modtime = $m;
Expand All @@ -625,19 +630,22 @@ sub _index_files {
}
}

# Get termination length from first file
$self->{termination_length} = $self->_calc_termination_length( $files->[0] );

# Reindex contents of changed files if needed
my $reindex = $force_reindex || (scalar @updated > 0);
$self->{offsets} = $self->_open_index($index, $reindex) or return;

if ($reindex) {
# reindex contents of changed files
$self->{indexing} = $index;
for my $file (@updated) {
&{$self->{offset_meth}}($self, $file, $self->{offsets});
my $fileno = $self->_path2fileno(basename($file));
&{$self->{offset_meth}}($self, $fileno, $file, $self->{offsets});
}
delete $self->{indexing};
}

# closing and reopening might help corrupted index file problem on Windows
# Closing and reopening might help corrupted index file problem on Windows
$self->_close_index($self->{offsets});

return $self->{offsets} = $self->_open_index($index);
Expand Down Expand Up @@ -735,10 +743,23 @@ sub _check_linelength {
}


sub _caloffset {
# Get the offset of the n-th residue of the sequence with the given id
sub _calc_termination_length {
# Try the beginning of the file to determine termination length
# Account for crlf-terminated Windows and Mac files
my ($self, $file) = @_;
my $fh = IO::File->new($file) or $self->throw( "Could not open $file: $!");
my $line = <$fh>;
close $fh;
$self->{termination_length} = ($line =~ /\r\n$/) ? 2 : 1;
return $self->{termination_length};
}


sub _calc_offset {
# Get the offset of the n-th residue of the sequence with the given ID
# and termination length (tl)
my ($self, $id, $n, $tl) = @_;
my ($self, $id, $n) = @_;
my $tl = $self->{termination_length};
$n--;
my ($offset, $seqlen, $linelen) = (&{$self->{unpackmeth}}($self->{offsets}{$id}))[0,1,3];
$n = 0 if $n < 0;
Expand Down
11 changes: 4 additions & 7 deletions Bio/DB/Qual.pm
Expand Up @@ -147,11 +147,9 @@ package Bio::DB::Qual;
use strict;
use IO::File;
use File::Spec;
use File::Basename qw(basename dirname);

use base qw(Bio::DB::IndexedBase);

my $termination_length;
our $obj_class = 'Bio::Seq::PrimaryQual::Qual';
our $file_glob = '*.{qual,QUAL,qa,QA}';

Expand All @@ -171,8 +169,7 @@ our $file_glob = '*.{qual,QUAL,qa,QA}';

sub _calculate_offsets {
# Bio::DB::IndexedBase calls this to calculate offsets
my ($self, $file, $offsets) = @_;
my $fileno = $self->_path2fileno(basename($file));
my ($self, $fileno, $file, $offsets) = @_;

my $fh = IO::File->new($file) or $self->throw("Could not open $file: $!");
binmode $fh;
Expand All @@ -181,9 +178,9 @@ sub _calculate_offsets {
$numres, %offsets);
my ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);

my $termination_length = $self->{termination_length};
while (my $line = <$fh>) {
# Account for crlf-terminated Windows files
$termination_length ||= ($line =~ /\r\n$/) ? 2 : 1;
if (index($line, '>') == 0) {
if ($line =~ /^>(\S+)/) {
print STDERR "Indexed $count quality scores...\n"
Expand Down Expand Up @@ -324,8 +321,8 @@ sub subqual {

# Fetch full quality string
my $fh = $self->_fh($id) or return;
my $filestart = $self->_caloffset($id, $string_start, $termination_length);
my $filestop = $self->_caloffset($id, $string_stop , $termination_length);
my $filestart = $self->_calc_offset($id, $string_start);
my $filestop = $self->_calc_offset($id, $string_stop );
seek($fh, $filestart,0);
my $data;
read($fh, $data, $filestop-$filestart+1);
Expand Down
21 changes: 20 additions & 1 deletion t/LocalDB/Fasta.t
Expand Up @@ -2,7 +2,7 @@ BEGIN {
use lib '.';
use Bio::Root::Test;

test_begin( -tests => 66,
test_begin( -tests => 69,
-requires_modules => [qw(Bio::DB::Fasta Bio::SeqIO)]);
}
use strict;
Expand Down Expand Up @@ -53,6 +53,24 @@ my $test_files = [
is $primary_seq->trunc(11, 20)->seq, 'ttctcggggt';
is $primary_seq->description, 'test description', 'bug 3126';
is $primary_seq->seq, 'tcatgttggcttctcggggtttttatggattaatacattttccaaacgattctttgcgccttctgtggtgccgccttctccgaaggaactgacgaaaaatgacgtggatttgctgacaaatccaggcgaggaatatttggacggattgatgaaatggcacggcgacgagcgacccgtgttcaaaagagaggacatttatcgttggtcggatagttttccagaatatcggctaagaatgatttgtctgaaagacacgacaagggtcattgcagtcggtcaatattgttactttgatgctctgaaagaaaggagagcagccattgttcttcttaggattgggatggacggatcctgaatatcgtaatcgggcagttatggagcttcaagcttcgatggcgctggaggagagggatcggtatccgactgccaacgcggcatcgcatccaaataagttcatgaaacgattttggcacatattcaacggcctcaaagagcacgaggacaaaggtcacaaggctgccgctgtttcatacaagagcttctacgacctcanagacatgatcattcctgaaaatctggatgtcagtggtattactgtaaatgatgcacgaaaggtgccacaaagagatataatcaactacgatcaaacatttcatccatatcatcgagaaatggttataatttctcacatgtatgacaatgatgggtttggaaaagtgcgtatgatgaggatggaaatgtacttggaattgtctagcgatgtctttanaccaacaagactgcacattagtcaattatgcagatagcc';

}


{
# Re-open an existing index.
# Doing this test properly involves unloading and reloading Bio::DB::Fasta.

SKIP: {
test_skip(-tests => 1, -requires_modules => [qw(Class::Unload)]);
use_ok('Class::Unload');
Class::Unload->unload( 'Bio::DB::Fasta' );
Class::Unload->unload( 'Bio::DB::IndexedBase' );
require Bio::DB::Fasta;
}

ok my $db = Bio::DB::Fasta->new($test_dir), 'Re-open an existing index';
is $db->seq('AW057119', 1, 10), 'tcatgttggc';
}


Expand Down Expand Up @@ -227,6 +245,7 @@ my $test_files = [
exit;



sub extract_gi {
# Extract GI from RefSeq
my $header = shift;
Expand Down

0 comments on commit 44b28c0

Please sign in to comment.