[Bioperl-guts-l] bioperl commit
Chad Matsalla
matsallac at pub.open-bio.org
Mon Mar 22 16:54:35 EST 2004
matsallac
Mon Mar 22 16:54:34 EST 2004
Update of /home/repository/bioperl/bioperl-live/Bio/SeqIO
In directory pub.open-bio.org:/tmp/cvs-serv17828/Bio/SeqIO
Modified Files:
scf.pm
Log Message:
Because of a 'feature' discovered by Anthony Underwood (Hi Anthony!) I made a
change to scf.pm.
The original specification for scf as found here:
(http://staden.sourceforge.net/manual/formats_unix_8.html)
allows the program writing scf to place base information before sample
information in the file. I did not detect that behavior in any of my programs
so I read the samples and then the bases sequentially.
I changed scf.pm to 'seek' around the file looking for information based
on what is contained in the header rather then assuming a given program
behaves logically.
I wasn't able to find a 'seek' in and of the bioperl IO modules so I used
a perl call. I hope that doesn't break any platform-specific seek issues.
bioperl-live/Bio/SeqIO scf.pm,1.27,1.28
===================================================================
RCS file: /home/repository/bioperl/bioperl-live/Bio/SeqIO/scf.pm,v
retrieving revision 1.27
retrieving revision 1.28
diff -u -r1.27 -r1.28
--- /home/repository/bioperl/bioperl-live/Bio/SeqIO/scf.pm 2003/01/28 23:27:05 1.27
+++ /home/repository/bioperl/bioperl-live/Bio/SeqIO/scf.pm 2004/03/22 21:54:34 1.28
@@ -135,10 +135,11 @@
# the rest of the the information is different between the
$creator->{header} = $self->_get_header($buffer);
if ($creator->{header}->{'version'} lt "3.00") {
+ warn("scf.pm is working with a version 2 scf.\n");
# first gather the trace information
$length = $creator->{header}->{'samples'}*
$creator->{header}->{sample_size}*4;
- $buffer = $self->read_from_buffer($fh,$buffer,$length);
+ $buffer = $self->read_from_buffer($fh,$buffer,$length,$creator->{header}->{samples_offset});
# @read = unpack "n$length",$buffer;
# these traces need to be split
# returns a reference to a hash
@@ -148,7 +149,7 @@
$offset = $creator->{header}->{bases_offset};
$length = ($creator->{header}->{bases} * 12);
seek $fh,$offset,0;
- $buffer = $self->read_from_buffer($fh,$buffer,$length);
+ $buffer = $self->read_from_buffer($fh,$buffer,$length,$creator->{header}->{bases_offset});
# now distill the information into its fractions.
# the old way : $self->_set_v2_bases($buffer);
# ref to an array, ref to a hash, string
@@ -158,54 +159,65 @@
$creator->{accuracies}) = $self->_parse_v2_bases($buffer);
} else {
+ warn("scf.pm is working with a version 3+ scf.\n");
my $transformed_read;
- foreach (qw(a c g t)) {
- $length = $creator->{header}->{'samples'}*
+ my $current_read_position = $creator->{header}->{sample_offset};
+ $length = $creator->{header}->{'samples'}*
$creator->{header}->{sample_size};
- $buffer = $self->read_from_buffer($fh,$buffer,$length);
- my $byte = "n";
- if ($creator->{header}->{sample_size} == 1){
+ # $dumper->dumpValue($creator->{header});
+ foreach (qw(a c g t)) {
+ # print("Reading the samples for $_ from the buffer.\n");
+ $buffer = $self->read_from_buffer($fh,$buffer,$length,$current_read_position);
+ my $byte = "n";
+ if ($creator->{header}->{sample_size} == 1) {
$byte = "c";
- }
- @read = unpack "${byte}${length}",$buffer;
-
- # this little spurt of nonsense is because
- # the trace values are given in the binary
- # file as unsigned shorts but they really
- # are signed deltas. 30000 is an arbitrary number
- # (will there be any traces with a given
- # point greater then 30000? I hope not.
- # once the read is read, it must be changed
- # from relative
- foreach (@read) {
- if ($_ > 30000) {
- $_ -= 65536;
- }
- }
- $transformed_read = $self->_delta(\@read,"backward");
- # For 8-bit data we need to emulate a signed/unsigned
- # cast that is implicit in the C implementations.....
- if($creator->{header}->{sample_size} == 1){
+ }
+ @read = unpack "${byte}${length}",$buffer;
+ # print("This is the read: ".join(',', at read)."\n");
+ # this little spurt of nonsense is because
+ # the trace values are given in the binary
+ # file as unsigned shorts but they really
+ # are signed deltas. 30000 is an arbitrary number
+ # (will there be any traces with a given
+ # point greater then 30000? I hope not.
+ # once the read is read, it must be changed
+ # from relative
+ foreach (@read) {
+ if ($_ > 30000) {
+ $_ -= 65536;
+ }
+ }
+ $transformed_read = $self->_delta(\@read,"backward");
+ # For 8-bit data we need to emulate a signed/unsigned
+ # cast that is implicit in the C implementations.....
+ if ($creator->{header}->{sample_size} == 1) {
foreach (@{$transformed_read}) {
$_ += 256 if ($_ < 0);
}
- }
- $creator->{'traces'}->{$_} = join(' ',@{$transformed_read});
+ }
+ $current_read_position += $length;
+ # print("This is the transformed read: ",join(',',@{$transformed_read}),"\n");
+ $creator->{'traces'}->{$_} = join(' ',@{$transformed_read});
}
# now go and get the peak index information
$offset = $creator->{header}->{bases_offset};
$length = ($creator->{header}->{bases} * 4);
- seek $fh,$offset,0;
- $buffer = $self->read_from_buffer($fh,$buffer,$length);
+ # print("Reading peak index information from the buffer\n");
+ $buffer = $self->read_from_buffer($fh,$buffer,$length,$offset);
$creator->{peak_indices} = $self->_get_v3_peak_indices($buffer);
+ $offset += $length;
# now go and get the accuracy information
- $buffer = $self->read_from_buffer($fh,$buffer,$length);
- $creator->{accuracies} = $self->_get_v3_base_accuracies($buffer);
+ # where, oh where should we go to get this?
+ # print("Reading accuracies from the buffer starting at position $offset\n");
+ $buffer = $self->read_from_buffer($fh,$buffer,$length,$offset);
+ $creator->{accuracies} = $self->_get_v3_base_accuracies($buffer);
# OK, now go and get the base information.
- $length = $creator->{header}->{bases};
- $buffer = $self->read_from_buffer($fh,$buffer,$length);
- $creator->{'sequence'} = unpack("a$length",$buffer);
+ $offset += $length;
+ $length = $creator->{header}->{bases};
+ # print("Reading bases from the buffer.\n");
+ $buffer = $self->read_from_buffer($fh,$buffer,$length,$offset);
+ $creator->{'sequence'} = unpack("a$length",$buffer);
# now, finally, extract the calls from the accuracy information.
$creator->{qualities} = $self->_get_v3_quality(
$creator->{'sequence'},$creator->{accuracies});
@@ -308,7 +320,9 @@
my @read;
$last_base = $offset + $qlength;
for (;$offset < $last_base; $offset += $qlength) {
- @read = unpack "c$qlength", substr($buffer,$offset,$qlength);
+ # a bioperler (perhaps me?) changed the unpack string to include 'n' rather than 'C'
+ # on 040322 I think that 'C' is correct. please email chad if you would like to accuse me of being incorrect
+ @read = unpack "C$qlength", substr($buffer,$offset,$qlength);
$accuracies->{$currbase} = \@read;
}
}
@@ -1111,7 +1125,20 @@
=cut
sub read_from_buffer {
- my ($self,$fh,$buffer,$length) = @_;
+ my ($self,$fh,$buffer,$length,$start_position) = @_;
+ # print("Reading from a buffer!!! length($length) ");
+ if ($start_position) {
+ # print(" startposition($start_position)(".sprintf("%X", $start_position).")\n");
+ }
+ # print("\n");
+ if ($start_position) {
+ # print("seeking to this position in the file: (".$start_position.")\n");
+ seek ($fh,$start_position,0);
+ # print("done. here is where I am now: (".tell($fh).")\n");
+ }
+ else {
+ # print("You did not specify a start position. Going from this position (the current position) (".tell($fh).")\n");
+ }
read $fh, $buffer, $length;
unless (length($buffer) == $length) {
$self->warn("The read was incomplete! Trying harder.");
More information about the Bioperl-guts-l
mailing list