Skip to content

Commit

Permalink
Sanity check and using using Bioperl or Genbank style for coordinate …
Browse files Browse the repository at this point in the history
…style
  • Loading branch information
fangly committed Nov 25, 2011
1 parent 78a8716 commit 2997f00
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 56 deletions.
66 changes: 36 additions & 30 deletions Bio/Seq/SimulatedRead.pm
Expand Up @@ -52,13 +52,13 @@ read on the reference sequence, and the sequencing errors to generate.
The sequence of the read is automatically calculated based on this information.
By default, the description of the reads contain tracking information and will
look like this:
look like this (Bioperl-style):
reference=human_chr2 position=3-12 strand=-1 mid=ACGT errors=1%G,4-,8+AAA description="The human genome"
reference=human_chr2 start=3 end=12 strand=-1 mid=ACGT errors=1%G,4-,8+AAA description="The human genome"
or
or Genbank-style:
reference=human_chr2 position=12-3 mid=ACGT errors=1%G,4-,8+AAA description="The human genome"
reference=human_chr2 position=complement(3..12) mid=ACGT errors=1%G,4-,8+AAA description="The human genome"
Creating a simulated read follows these steps:
1/ Define the read start(), end(), strand() and qual_levels() if you want
Expand Down Expand Up @@ -110,7 +110,7 @@ use base qw( Bio::Seq::Quality Bio::LocatableSeq );
-mid => String of a MID to prepend to the read. See mid().
-track => Track where the read came from in the read description?
See track().
-trackstyle => Define what coordinate system to use. See trackstyle().
-coordstyle => Define what coordinate system to use. See coordstyle().
All other methods from Bio::LocatableSeq are available.
Returns : new Bio::Seq::SimulatedRead object
Expand All @@ -119,10 +119,10 @@ use base qw( Bio::Seq::Quality Bio::LocatableSeq );
sub new {
my ($class, @args) = @_;
my $self = $class->SUPER::new(@args);
my ($qual_levels, $reference, $mid, $errors, $track, $trackstyle) =
$self->_rearrange([qw(QUAL_LEVELS REFERENCE MID ERRORS TRACK TRACKSTYLE)], @args);
$trackstyle = defined $trackstyle ? $trackstyle : 'use_strand';
$self->trackstyle($trackstyle);
my ($qual_levels, $reference, $mid, $errors, $track, $coordstyle) =
$self->_rearrange([qw(QUAL_LEVELS REFERENCE MID ERRORS TRACK COORDSTYLE)], @args);
$coordstyle = defined $coordstyle ? $coordstyle : 'bioperl';
$self->coordstyle($coordstyle);
$track = defined $track ? $track : 1;
$self->track($track);
$qual_levels = defined $qual_levels ? $qual_levels : [];
Expand Down Expand Up @@ -247,8 +247,8 @@ sub _create_desc {
}
# Position of read on reference sequence: start, end and strand
my $strand = $self->strand;
if ($self->trackstyle eq 'use_strand') {
$desc_str .= 'position='.$self->start.'-'.$self->end.' ';
if ($self->coordstyle eq 'bioperl') {
$desc_str .= 'start='.$self->start.' end='.$self->end.' ';
if (defined $strand) {
# Strand of the reference sequence that the read is on
$strand = '+1' if $strand == 1;
Expand All @@ -257,10 +257,10 @@ sub _create_desc {
} else {
if ( (defined $strand) && ($strand == -1) ) {
# Reverse complemented
$desc_str .= 'position='.$self->end.'-'.$self->start.' ';
$desc_str .= 'position=complement('.$self->start.'..'.$self->end.') ';
} else {
# Regular (forward) orientation
$desc_str .= 'position='.$self->start.'-'.$self->end.' ';
$desc_str .= 'position='.$self->start.'..'.$self->end.' ';
}
}
# Description of the original sequence
Expand Down Expand Up @@ -619,28 +619,34 @@ sub track {
}


=head2 trackstyle
=head2 coordstyle
Title : trackstyle
Function : When tracking is on, define what coordinate system to use:
use_strand uses the 'strand' keyword (default)
no_strand does not use the 'strand' keyword
Definition by example for a read taken from the reverse-complement
of a sequence:
use_strand: position=1-10 strand=-1 Bioperl-style
no_strand: position=10-1 More compact
Usage : my $trackstyle = $read->track();
Arguments: 'use_strand' or 'no_strand'
Returns : 'use_strand' or 'no_strand'
Title : coordstyle
Function : When tracking is on, define which 1-based coordinate system to use
in the read description:
* 'bioperl' uses the start, end and strand keywords (default),
similarly to the GFF3 format. Example:
start=1 end=10 strand=+1
start=1 end=10 strand=-1
* 'genbank' does only provide the position keyword. Example:
position=1..10
position=complement(1..10)
Usage : my $coordstyle = $read->track();
Arguments: 'bioperl' or 'genbank'
Returns : 'bioperl' or 'genbank'
=cut

sub trackstyle {
my ($self, $trackstyle) = @_;
if (defined $trackstyle) {
$self->{trackstyle} = $trackstyle;
sub coordstyle {
my ($self, $coordstyle) = @_;
my %styles = ( 'bioperl' => undef, 'genbank' => undef );
if (defined $coordstyle) {
if (not exists $styles{$coordstyle}) {
die "Error: Invalid coordinate style '$coordstyle'\n";
}
$self->{coordstyle} = $coordstyle;
}
return $self->{trackstyle};
return $self->{coordstyle};
}


Expand Down
52 changes: 26 additions & 26 deletions t/Seq/SimulatedRead.t
Expand Up @@ -62,54 +62,54 @@ is $read->end, 12;
is $read->seq, 'TAAAAAAACCCC';
is join(' ',@{$read->qual}), '';
is $read->track, 1;
is $read->desc, 'reference=human_id position=1-12 strand=+1 description="The human genome"';
is $read->desc, 'reference=human_id start=1 end=12 strand=+1 description="The human genome"';

ok $read = Bio::Seq::SimulatedRead->new( -reference => $ref, -track => 1, -trackstyle => 'use_strand' );
is $read->desc, 'reference=human_id position=1-12 strand=+1 description="The human genome"';
ok $read = Bio::Seq::SimulatedRead->new( -reference => $ref, -track => 1, -coordstyle => 'bioperl' );
is $read->desc, 'reference=human_id start=1 end=12 strand=+1 description="The human genome"';

ok $read = Bio::Seq::SimulatedRead->new( -reference => $ref, -track => 1, -trackstyle => 'no_strand' );
is $read->desc, 'reference=human_id position=1-12 description="The human genome"';
ok $read = Bio::Seq::SimulatedRead->new( -reference => $ref, -track => 1, -coordstyle => 'genbank' );
is $read->desc, 'reference=human_id position=1..12 description="The human genome"';

ok $read = Bio::Seq::SimulatedRead->new( -reference => $ref, -qual_levels => [30, 10]);
is $read->start, 1;
is $read->end, 12;
is $read->seq, 'TAAAAAAACCCC';
is join(' ', @{$read->qual}), '30 30 30 30 30 30 30 30 30 30 30 30';
is $read->track, 1;
is $read->desc, 'reference=human_id position=1-12 strand=+1 description="The human genome"';
is $read->desc, 'reference=human_id start=1 end=12 strand=+1 description="The human genome"';
is $read->revcom->seq, 'GGGGTTTTTTTA';

ok $read = Bio::Seq::SimulatedRead->new( -reference => $ref2 );
is $read->start, 1;
is $read->end, 8;
is $read->seq, 'ACGTACGT';
is join(' ',@{$read->qual}), '';
is $read->desc, 'reference=other_genome position=1-8 strand=+1 description="\"Secret\" sequence"';
is $read->desc, 'reference=other_genome start=1 end=8 strand=+1 description="\"Secret\" sequence"';

ok $read = Bio::Seq::SimulatedRead->new( -reference => $ref3 );
is $read->start, 1;
is $read->end, 37;
is $read->seq, 'ACACTGATCTAGCGTCGTGCTAGCTGACGTAGCTGAT';
is join(' ',@{$read->qual}), '';
is $read->desc, 'position=1-37 strand=+1';
is $read->desc, 'start=1 end=37 strand=+1';

ok $read = Bio::Seq::SimulatedRead->new( -reference => $ref, -strand => -1, -qual_levels => [30, 10]);
is $read->seq, 'GGGGTTTTTTTA';
is join(' ', @{$read->qual}), '30 30 30 30 30 30 30 30 30 30 30 30';
is $read->desc, 'reference=human_id position=1-12 strand=-1 description="The human genome"';
is $read->desc, 'reference=human_id start=1 end=12 strand=-1 description="The human genome"';

ok $read = Bio::Seq::SimulatedRead->new( -reference => $ref, -strand => -1, -qual_levels => [30, 10], -trackstyle => 'no_strand' );
is $read->desc, 'reference=human_id position=12-1 description="The human genome"';
ok $read = Bio::Seq::SimulatedRead->new( -reference => $ref, -strand => -1, -qual_levels => [30, 10], -coordstyle => 'genbank' );
is $read->desc, 'reference=human_id position=complement(1..12) description="The human genome"';

ok $read = Bio::Seq::SimulatedRead->new( -reference => $ref, -start => 2, -end => 8, -qual_levels => [30, 10]);
is $read->seq, 'AAAAAAA';
is join(' ', @{$read->qual}), '30 30 30 30 30 30 30';
is $read->desc, 'reference=human_id position=2-8 strand=+1 description="The human genome"';
is $read->desc, 'reference=human_id start=2 end=8 strand=+1 description="The human genome"';

ok $read = Bio::Seq::SimulatedRead->new( -reference => $ref, -strand => -1, -start => 2, -end => 8, -qual_levels => [30, 10]);
is $read->seq, 'TTTTTTT';
is join(' ', @{$read->qual}), '30 30 30 30 30 30 30';
is $read->desc, 'reference=human_id position=2-8 strand=-1 description="The human genome"';
is $read->desc, 'reference=human_id start=2 end=8 strand=-1 description="The human genome"';

$errors = {};
$errors->{'6'}->{'+'} = 'GG';
Expand All @@ -118,7 +118,7 @@ is $read->start, 2;
is $read->end, 8;
is $read->seq, 'TTTTTTGGT';
is join(' ', @{$read->qual}), '30 30 30 30 30 30 10 10 30';
is $read->desc, 'reference=human_id position=2-8 strand=-1 errors=6+G,6+G description="The human genome"';
is $read->desc, 'reference=human_id start=2 end=8 strand=-1 errors=6+G,6+G description="The human genome"';

$errors = {};
$errors->{'6'}->{'+'} = 'GG';
Expand All @@ -129,7 +129,7 @@ is $read->start, 2;
is $read->end, 8;
is $read->seq, 'TAAAAGGA';
is join(' ', @{$read->qual}), '10 30 30 30 30 10 10 30';
is $read->desc, 'reference=human_id position=2-8 strand=+1 errors=1%T,3-,6+G,6+G description="The human genome"';
is $read->desc, 'reference=human_id start=2 end=8 strand=+1 errors=1%T,3-,6+G,6+G description="The human genome"';

$errors = {};
$errors->{'6'}->{'+'} = 'GG';
Expand All @@ -138,28 +138,28 @@ is $read->start, 1;
is $read->end, 12;
is $read->seq, 'TAAAAAGGAACCCC';
is join(' ', @{$read->qual}), '30 30 30 30 30 30 10 10 30 30 30 30 30 30';
is $read->desc, 'reference=human_id position=1-12 strand=+1 errors=6+G,6+G description="The human genome"';
is $read->desc, 'reference=human_id start=1 end=12 strand=+1 errors=6+G,6+G description="The human genome"';

ok $read = Bio::Seq::SimulatedRead->new( -reference => $ref, -errors => $errors, -mid => 'ACGT', -errors => $errors, -qual_levels => [30, 10]);
is $read->start, 1;
is $read->end, 12;
is $read->seq, 'ACGTTAGGAAAAAACCCC';
is join(' ', @{$read->qual}), '30 30 30 30 30 30 10 10 30 30 30 30 30 30 30 30 30 30';
is $read->desc, 'reference=human_id position=1-12 strand=+1 mid=ACGT errors=6+G,6+G description="The human genome"';
is $read->desc, 'reference=human_id start=1 end=12 strand=+1 mid=ACGT errors=6+G,6+G description="The human genome"';

ok $read = Bio::Seq::SimulatedRead->new( -reference => $ref, -mid => 'TTTAAA', -qual_levels => [30, 10]);
is $read->start, 1;
is $read->end, 12;
is $read->seq, 'TTTAAATAAAAAAACCCC';
is join(' ', @{$read->qual}), '30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30';
is $read->desc, 'reference=human_id position=1-12 strand=+1 mid=TTTAAA description="The human genome"';
is $read->desc, 'reference=human_id start=1 end=12 strand=+1 mid=TTTAAA description="The human genome"';

ok $read = Bio::Seq::SimulatedRead->new( -reference => $ref, -mid => '', -qual_levels => []);
is $read->start, 1;
is $read->end, 12;
is $read->seq, 'TAAAAAAACCCC';
is join(' ', @{$read->qual}), '';
is $read->desc, 'reference=human_id position=1-12 strand=+1 description="The human genome"';
is $read->desc, 'reference=human_id start=1 end=12 strand=+1 description="The human genome"';


# Redundant errors
Expand All @@ -173,7 +173,7 @@ is $read->start, 2;
is $read->end, 8;
is $read->seq, 'TAAAAGGA';
is join(' ', @{$read->qual}), '10 30 30 30 30 10 10 30';
is $read->desc, 'reference=human_id position=2-8 strand=+1 errors=1%A,1%G,1%T,3-,3-,6+G,6+G description="The human genome"';
is $read->desc, 'reference=human_id start=2 end=8 strand=+1 errors=1%A,1%G,1%T,3-,3-,6+G,6+G description="The human genome"';


# Specifying errors() after new()
Expand All @@ -183,22 +183,22 @@ is $read->start, 1;
is $read->end, 12;
is $read->seq, 'TAAAAAAACCCC';
is join(' ', @{$read->qual}), '';
is $read->desc, 'reference=human_id position=1-12 strand=+1 description="The human genome"';
is $read->desc, 'reference=human_id start=1 end=12 strand=+1 description="The human genome"';

$errors = {};
ok $read->errors($errors), 'errors()';
is $read->start, 1;
is $read->end, 12;
is $read->seq, 'TAAAAAAACCCC';
is $read->desc, 'reference=human_id position=1-12 strand=+1 description="The human genome"';
is $read->desc, 'reference=human_id start=1 end=12 strand=+1 description="The human genome"';

$errors = {};
$errors->{'6'}->{'+'} = 'GG';
ok $read->errors($errors);
is $read->seq, 'TAAAAAGGAACCCC';
is $read->start, 1;
is $read->end, 12;
is $read->desc, 'reference=human_id position=1-12 strand=+1 errors=6+G,6+G description="The human genome"';
is $read->desc, 'reference=human_id start=1 end=12 strand=+1 errors=6+G,6+G description="The human genome"';


# More tracking tests
Expand All @@ -208,7 +208,7 @@ is $read->track, 0;
is $read->desc, undef;
ok $read->track(1);
is $read->track, 1;
is $read->desc, 'reference=human_id position=1-12 strand=+1 errors=6+G,6+G description="The human genome"';
is $read->desc, 'reference=human_id start=1 end=12 strand=+1 errors=6+G,6+G description="The human genome"';


# qual_levels() method
Expand All @@ -232,13 +232,13 @@ ok $read->mid, 'ACGT';

is $read->seq, 'ACGTTAAAAAAACCCC';
is join(' ', @{$read->qual}), '30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30';
is $read->desc, 'reference=human_id position=1-12 strand=+1 mid=ACGT description="The human genome"';
is $read->desc, 'reference=human_id start=1 end=12 strand=+1 mid=ACGT description="The human genome"';

ok $read->mid('TTTAAA');
ok $read->mid, 'TTTAAA';
is $read->seq, 'TTTAAATAAAAAAACCCC';
is join(' ', @{$read->qual}), '30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30';
is $read->desc, 'reference=human_id position=1-12 strand=+1 mid=TTTAAA description="The human genome"';
is $read->desc, 'reference=human_id start=1 end=12 strand=+1 mid=TTTAAA description="The human genome"';


# Edge case... mutation of the last bases of a simulated read with MID
Expand Down

0 comments on commit 2997f00

Please sign in to comment.