Reputation: 21
I have been struggling to solve this problem but to no avail. In the code below, I use a char pointer (all_seq), but when I free it, at the end of the code, sometimes it segfaults. I say sometimes, because it apparently depends on the input file. With a small input file, it's fine. With a big one, it's also fine. With an intermediate size file, it segfaults. I just can't figure out why...
Any idea will be greatly appreciated. Thanks.
/* SeqProb (Sequence Probability) - 2012/03/13 */
/* Given a series of aligned sequences -in one file, one per line- this program calculates the probability of having, in each location, the letter in the first sequence. */
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
FILE *input_file;
FILE *output_file;
///////////////////////////////
// DECLARATION OF FUNCTIONS: //
///////////////////////////////
int GetSize(FILE *file, int *length, int *lines);
double CalcProb(char *one_seq);
//////////////////////////////////////////////////////////////////
// ************************************************************ //
// ************** START MAIN BLOCK ************** //
// ************************************************************ //
//////////////////////////////////////////////////////////////////
int main( int argc, char *argv[] ) // We expect three command-line arguments: 1)program name, 2)file to read and 3)file to write.
{
int i=0,j=0;
int length, lines, size;
double prob=0;
char *all_seq, *one_seq;
////////////////////////////
// CHECK INPUT ARGUMENTS: //
////////////////////////////
// Check if arguments in command line are right and if file can be opened for reading:
if ( argc != 3 ) { // There should be 3 arguments for correct execution
printf("\n Usage: %s <input file> <output file>\n\n ", argv[0]); // We print argv[0] assuming it is the program name
exit(1);
}
if (!(input_file=fopen(argv[1],"r"))){
printf("\n Error: can't open input file \n\n");
exit(2);
}
if (!(output_file=fopen(argv[2],"w"))){
printf("\n Error: can't open output file \n\n");
exit(3);
}
/////////////////////////////////////////
// GET LENGTH AND NUMBER OF SEQUENCES: //
/////////////////////////////////////////
GetSize(input_file,&length,&lines);
size=length*lines;
printf("\n length of sequences = %d\n",length);
printf(" number of sequences = %d\n",lines);
//////////////////////////////////////////////////////
// ALLOCATE MEMORY FOR all_seq AND one_seq VECTORS: //
//////////////////////////////////////////////////////
all_seq = (char *) malloc((length*lines+10)*sizeof(char));
one_seq = (char *) malloc((lines+10)*sizeof(char));
if (all_seq == NULL){ // In case of memory allocation failure, exit.
printf ("Error: Can't allocate memory \n");
exit(4);
}
if (one_seq == NULL){ // In case of memory allocation failure, exit.
printf ("Error: Can't allocate memory \n");
exit(5);
}
/////////////////////////////////////////////
// INITIALIZE VECTORS all_seq and one_seq: //
/////////////////////////////////////////////
for (i=0;i<=size;i++) {
all_seq[i]=0;
}
for (i=0;i<=length;i++) {
one_seq[i]=0;
}
///////////////////////////////////////////////////////
// LOAD SEQUENCES FROM INPUT FILE IN VECTOR all_seq: //
///////////////////////////////////////////////////////
// Read input file, containing sequences, and put every letter into the all_seq vector, one sequence right after the other (we'll also include the escape character (\n) at the end of each line).
// Note that, in this way, the letter at location (row,col) in the input file will be given by all_seq[(length+1)*row+col]:
printf("\n Processing file: %s\n\n", argv[1]);
rewind(input_file);
for ( i=0; i<=size; ++i) {
fscanf(input_file,"%c",&all_seq[i]);
// printf("%d: %c\n",i, all_seq[i]);
}
// printf("%c",all_seq[2]);
///////////////////////////////////////////////////////////////
// CALCUALTE PROBABILITIES OF EACH LETTER IN FIRST SEQUENCE: //
///////////////////////////////////////////////////////////////
// Read one column at a time, write it to the one_seq vetor, and calculate the probability of occurrance of the first letter in that column; repeat for all columns:
for ( j=0; j<length; j++) {
for ( i=0; i<lines; i++) {
one_seq[i]=all_seq[(length+1)*i+j];
}
prob = CalcProb(one_seq);
// printf("col=%d: prob=%f \n\n",j,prob);
// Write output file: the first column is just an enumeration, from 1 to the length of the sequences, and the second column gives the probability for finding the first letter in the column, anywhere along the column (a measure of its conservation).
fprintf(output_file,"%d %f\n",j+1,prob);
}
////////////////////////////////////
/// CLOSE FILES AND FREE MEMORY: ///
////////////////////////////////////
fclose(input_file);
fclose(output_file);
free( all_seq );
free( one_seq );
return(0);
}
//////////////////////////////////////////////////////////////////
// ************************************************************ //
// *************** END MAIN BLOCK *************** //
// ************************************************************ //
//////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////
// FUNCTION GetSize: LENGTH AND NUMBER OF SEQUENCES //
//////////////////////////////////////////////////////
int GetSize(FILE *file, int *length, int *lines)
{
char c;
int i=0;
int *len;
// Count how many characters in first line:
while ((c=fgetc(file)) != '\n') {
++i;
}
*length=i;
*lines=0;
*len=0;
// Now count many lines, and check that all lines have same length:
i=0;
rewind(file);
while ( (c=fgetc(file)) != EOF ) {
++i;
if (c=='\n') {
*lines=*lines+1;
*len=i-1;
i=0;
// printf("length=%d\n",*len);
if ( *len != *length && *len != 0) {
printf("ERROR: not all lines in input file have the same length.\n");
exit(6);
}
if ( *len == 0) {
*lines=*lines-1;
}
}
}
// printf("lines=%d\n",*lines);
return(0);
}
///////////////////////////////////////////////////////////////////////////////
// FUNCTION CalcProb: PROBABILITY OF THE FIRST CHARACTER IN THE GIVEN VECTOR //
///////////////////////////////////////////////////////////////////////////////
double CalcProb(char *one_seq)
{
int i=0;
double count=0, prob=0;
while ( one_seq[i] != '\0' ) {
// printf("one_seq[%d]= %d\n",i,one_seq[i]);
if ( toupper(one_seq[i]) == toupper(one_seq[0]) ) {
count=count+1;
}
i=i+1;
}
prob=count/i;
// printf("count=%f, i=%d, prob=%f: \n",count,i,prob);
return(prob);
}
Upvotes: 2
Views: 6309
Reputation: 39837
Errors in freeing memory are generally caused by freeing the same pointer twice (not likely here), freeing an invalid pointer, or a previous buffer underrun (of the same block) or overrun (the block before it) causing administrative data to become corrupted. Since you say it happens with large files as input rather than small, you're most likely overrunning a buffer.
It looks like you're overrunning one_seq[] by iterating between 0 and length
, when you allocate only lines
elements.
Note also that you should be able to drop your +10
safety zone, and you should also fix your for loops to be <
the last index, not <=
.
Upvotes: 6