Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Revert "merged from phylip_fixes"
This reverts commit d876623.
  • Loading branch information
daisieh committed Sep 7, 2012
1 parent 76f9253 commit aff4e8e
Show file tree
Hide file tree
Showing 2 changed files with 95 additions and 60 deletions.
89 changes: 52 additions & 37 deletions Bio/AlignIO/phylip.pm
Expand Up @@ -179,17 +179,19 @@ sub next_aln {
# skip blank lines until we see header line
# if we see a non-blank line that isn't the seqcount and residuecount line
# then bail out of next_aln (return)
HEADER: while ($entry = $self->_readline) {
next if $entry =~ /^\s?$/;
if ($entry =~ /\s*(\d+)\s+(\d+)/) {
while ($entry = $self->_readline) {
if ($entry =~ /^\s?$/) {
next;
} elsif ($entry =~ /\s*(\d+)\s+(\d+)/) {
($seqcount, $residuecount) = ($1, $2);

last;
} else {
$self->warn ("Failed to parse PHYLIP: Did not see a sequence count and residue count.");
return;
}
last HEADER;
}
return unless $seqcount and $residuecount;

# first alignment section
# First alignment section. We expect to see a name and (part of) a sequence.
my $idlen = $self->idlength;
$count = 0;
my $iter = 1;
Expand Down Expand Up @@ -277,39 +279,52 @@ sub next_aln {
}
return if scalar @names < 1;

# sequence creation
$count = 0;
foreach $name ( @names ) {
$count++;
if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) {
$seqname = $1;
$start = $2;
$end = $3;
} else {
$seqname=$name;
$start = 1;
$str = $hash{$count};
# $str =~ s/[^A-Za-z]//g;
#$end = length($str);
}
# consistency test
$self->throw("Length of sequence [$seqname] is not [$residuecount] it is ".CORE::length($hash{$count})."! ")
unless CORE::length($hash{$count}) == $residuecount;

$seq = Bio::LocatableSeq->new('-seq' => $hash{$count},
'-display_id' => $seqname,
'-start' => $start,
(defined $end) ? ('-end' => $end) : (),
'-alphabet' => $self->alphabet,
);
$aln->add_seq($seq);
$count++;
# if we've read as many seqs as we're supposed to, move on.
if ($count == $seqcount) {
last;
}
}

}
return $aln if $aln->num_sequences;
return;
# if we are interleaved, we're going to keep seeing chunks of sequence until we get all of it.
if ($self->interleaved) {
while (length($hash{$seqcount-1}) < $residuecount) {
$count = 0;
while ($entry = $self->_readline) {
if ($entry =~ /^\s*$/) { # eat newlines
if ($count != 0) { # there was a newline at an unexpected place!
$self->warn("Failed to parse PHYLIP: Interleaved file is missing a segment: saw $count, expected $seqcount.");
return;
}
next;
} else { # start taking in chunks
$entry =~ s/\s//g;
$hash{$count} .= $entry;
$count++;
}
if ($count >= $seqcount) { # we've read all of the sequences for this chunk, so move on.
last;
}
}
}
}
if ((scalar @names) != $seqcount) {
$self->warn("Failed to parse PHYLIP: Did not see the correct number of seqs: saw " . scalar(@names) . ", expected $seqcount.");
return;
}
for ($count=0; $count<$seqcount; $count++) {
$str = $hash{$count};
my $seqname = @names[$count];
if (length($str) != $residuecount) {
$self->warn("Failed to parse PHYLIP: Sequence $seqname was the wrong length: " . length($str) . " instead of $residuecount.");
}
$seq = Bio::LocatableSeq->new('-seq' => $hash{$count},
'-display_id' => $seqname);
$aln->add_seq($seq);
}
return $aln;
}


=head2 write_aln
Title : write_aln
Expand Down
66 changes: 43 additions & 23 deletions t/AlignIO/phylip.t
Expand Up @@ -7,7 +7,7 @@ BEGIN {
use lib '.';
use Bio::Root::Test;

test_begin(-tests => 16);
test_begin(-tests => 17);

use_ok('Bio::AlignIO::phylip');
}
Expand All @@ -16,12 +16,49 @@ my $DEBUG = test_debug();

my ($str,$aln,$strout,$status);

# PHYLIP sequential/non-interleaved
# Open noninterleaved.phy with the '-interleaved' option set to 0, read one
# alignment from it, and compare the sequence at position 2 against the
# expected residue string.
$strout = Bio::AlignIO->new('-file' => test_input_file('noninterleaved.phy'), '-interleaved' => 0,
'-format' => 'phylip');
# NOTE(review): $aln is passed as an argument to next_aln() here, while the
# other calls in this script pass nothing - confirm the argument is intended.
$aln = $strout->next_aln($aln);
isa_ok($aln,'Bio::Align::AlignI');
# Exact string comparison: the literal below is built by concatenating five
# quoted pieces.
is($aln->get_seq_by_pos(2)->seq(), 'CCTCAGATCACTCTTTGGCAACGACCCCTCGTCACAATAA'.
'AGGTAGGGGGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGACATGAATT'.
'TGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGGTTTATCAAAGTAAGACAGTATGATCAGA'.
'TACCCATAGAGATCTGTGGACATAAAGCTATAGGTACAGTATTAGTAGGACCCACACCTGTCAATATAATTG'.
'GAAGAAATCTGTTGACTCAGATTGGTTGCACTTTAAATTTT' );

# PHYLIP interleaved with long Ids
# Open protpars_longid.phy with the long-id option set to 1, read one
# alignment, and check the strings returned by get_nse for the sequences at
# positions 1 and 2.
# NOTE(review): the option key is spelled 'longid' with no leading dash,
# unlike '-file' and '-format' in the same call - confirm the constructor
# accepts this spelling.

$str = Bio::AlignIO->new(
'-file' => test_input_file("protpars_longid.phy"),
'-format' => 'phylip',
'longid' => 1);

$aln = $str->next_aln();
# The isa_ok check on the reader object itself is commented out; only the
# returned alignment is type-checked.
#isa_ok($str,'Bio::AlignIO');
isa_ok($aln,'Bio::Align::AlignI');
# The first expected id contains embedded spaces; the second does not.
is $aln->get_seq_by_pos(1)->get_nse, 'S I N F R U P 0 0 1 /1-84';
is $aln->get_seq_by_pos(2)->get_nse, 'SINFRUP002/1-84';

# PHYLIP interleaved, multiple segments
# Open protpars.phy with only '-file' and '-format' given, read one
# alignment, and check the string returned by get_nse for the sequence at
# position 1.
$str = Bio::AlignIO->new(
'-file' => test_input_file("protpars.phy"),
'-format' => 'phylip');

$aln = $str->next_aln();
# The isa_ok check on the reader object itself is commented out; only the
# returned alignment is type-checked.
#isa_ok($str,'Bio::AlignIO');
isa_ok($aln,'Bio::Align::AlignI');
# The expected value ends in /1-4940 for the first sequence.
is $aln->get_seq_by_pos(1)->get_nse, 'SINFRUP001/1-4940';
# The check on the second sequence is commented out and does not run.
# is $aln->get_seq_by_pos(2)->get_nse, 'SINFRUP002/1-84';


# PHYLIP interleaved

$str = Bio::AlignIO->new(
'-file' => test_input_file("testaln.phylip"),
'-format' => 'phylip');
isa_ok($str,'Bio::AlignIO');
$aln = $str->next_aln();
#isa_ok($str,'Bio::AlignIO');
isa_ok($aln,'Bio::Align::AlignI');
is $aln->get_seq_by_pos(1)->get_nse, 'Homo_sapie/1-45';

Expand All @@ -45,25 +82,8 @@ TODO: {
is($ls->length,47);
}

# PHYLIP sequential/non-interleaved
$strout = Bio::AlignIO->new('-file' => test_input_file('noninterleaved.phy'),
'-format' => 'phylip');
$aln = $strout->next_aln($aln);
isa_ok($aln,'Bio::Align::AlignI');
is($aln->get_seq_by_pos(2)->seq(), 'CCTCAGATCACTCTTTGGCAACGACCCCTCGTCACAATAA'.
'AGGTAGGGGGGCAACTAAAGGAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGACATGAATT'.
'TGCCAGGAAGATGGAAACCAAAAATGATAGGGGGAATTGGAGGGTTTATCAAAGTAAGACAGTATGATCAGA'.
'TACCCATAGAGATCTGTGGACATAAAGCTATAGGTACAGTATTAGTAGGACCCACACCTGTCAATATAATTG'.
'GAAGAAATCTGTTGACTCAGATTGGTTGCACTTTAAATTTT' );

# PHYLIP interleaved with long Ids
$str = Bio::AlignIO->new(
'-file' => test_input_file("protpars_longid.phy"),
'-format' => 'phylip',
'longid' => 1);

isa_ok($str,'Bio::AlignIO');
# check to see that newlines between header and sequences are parsed correctly
$str = Bio::AlignIO->new('-file' => test_input_file("codeml45b.mlc"), '-format' => 'phylip', '-longid' => 1);
$aln = $str->next_aln();
isa_ok($aln,'Bio::Align::AlignI');
is $aln->get_seq_by_pos(1)->get_nse, 'S I N F R U P 0 0 1 /1-84';
is $aln->get_seq_by_pos(2)->get_nse, 'SINFRUP002/1-84';
my $ls = $aln->get_seq_by_pos(9);
ok($ls->display_id eq "Pop_trich_ch", "newline between header and sequences is parsed correctly");

0 comments on commit aff4e8e

Please sign in to comment.