Alex
Alex

Reputation: 301

C - Trying to go back to previous line in the file

I have to read a text file which can begin with optional comments. In practice I have to skip any line at the beginning of the file that doesn't begin with '@' or '>'. In my test case the file looks like:

# Sun Jul 12 22:04:52 2009 /share/apps/corona/bin/filter_fasta.pl --output=/data/results/solid0065/primary.20090712170542775 
# Cwd: /state/partition1/home/pipeline
# Title: solid0065_20090629_FC1_Tomate_Heinz_4_5_Kb_Tomate_Heinz_4_5_Kb_01
>125_963_316_F3
T1230330231223011323010013


So I have to skip the first 3 lines (but in general I have to skip n lines). I have to repeat this for 2 or 4 files [which are stored in FILE** inputFiles]. I've tried this loop:

/* Scratch buffer that receives one line of input at a time (5000 bytes). */
buffer = (char*) malloc (sizeof(char) * 5000);
if (buffer == NULL)
    notEnoughMemory();

/* For each input file: skip leading lines until one starts with '@' or '>',
   then reposition the stream at the start of that line. */
for (i = 0; i < (cIn-1); i++){
    /* Remember where the line about to be read begins. */
    fgetpos(inputFiles[i], &position);
    fgets(buffer, 4999, inputFiles[i]);
    /* NOTE(review): the result of fgets is never checked; at end of file
       buffer is left unchanged, so a file without any '@'/'>' line keeps
       this loop spinning. */
    while ((buffer[0] != '@') && (buffer[0] != '>')){
        /* Save the start of the next candidate line before consuming it. */
        fgetpos(inputFiles[i], &position);
        fgets(buffer, 4999, inputFiles[i]);
    }
    /* Return to the position saved just before the matching line was read. */
    fsetpos(inputFiles[i], &position);
}


Here cIn is number_of_input_files + 1. When I debug it, the loop correctly stops after it reads the fourth line. But when I call fsetpos() the stream doesn't go back to the beginning of the fourth line as I'd expect, but to the middle of the third. In fact, if I print buffer after each of these calls, made right after the fsetpos():

fgets(buffer, 4999, inputFiles[i]);
fgets(buffer, 4999, inputFiles[i]);

I get:

FC1_Tomate_Heinz_4_5_Kb_Tomate_Heinz_4_5_Kb_01
>125_963_316_F3

Any idea? Thanks in advance

Upvotes: 0

Views: 6072

Answers (4)

Arpan Saini
Arpan Saini

Reputation: 5181

You can save the stream position at any given point. This is really helpful when you read a line in the while condition (to check for NULL) but, once inside the loop, want to set the cursor back to the start of that line.

fpos_t position;

fgetpos (file, &position);

Then you can set the stream back to the same position:

fsetpos (file, &position);

Please follow the docs; this is tried and tested and works fine: http://www.cplusplus.com/reference/cstdio/fgetpos/

Upvotes: 0

wildplasser
wildplasser

Reputation: 44240

(IMHO) The best approach is to read the entire file into one big buffer (mmap is also an option, if available), then find and fix the line endings and FASTA headers. This will also reduce memory fragmentation, and it simplifies the 'parser' a lot.

EDIT: added source (it is not perfect, but last time I checked it, it worked ;-) Might be incomplete, I snipped it from a larger program.

/* One FASTA record as produced by read_complete_fasta(). */
struct fastapart {
	char *name;              /* record name; points into fastafile.tot, NUL-terminated in place */
	char *data;              /* sequence text; points into fastafile.tot, NULL while no data was seen */
	unsigned size;           /* number of sequence bytes stored at data */
	struct roedel *friends;  /* type not defined in this excerpt; the reader only sets it to NULL */
};

/* A whole FASTA file held in memory, plus an index of its records. */
struct fastafile {
	size_t totsize;          /* number of bytes in tot, as reported by read_complete_file() */
	char *tot;               /* complete file contents; the parts' name/data point into it */
	unsigned count;          /* number of entries in parts (and in alloc) */
	struct fastapart *parts; /* array of records */
	int *alloc;              /* count ints, filled with 0..count-1 by the reader; may be NULL */
};

struct fastafile * read_complete_fasta(char *name)
{
int rc,state;
struct fastafile * result;
size_t pos,len,cnt,idx;
struct strbuff *fwd=NULL,*rev = NULL;

result = malloc (sizeof *result);
if (!result) return NULL;
result->tot = read_complete_file(name , &result->totsize);
if (!result->tot) goto failfree;

result->count = 0;
result->parts = NULL;

for (pos=cnt=state=0; pos < result->totsize; ) {
switch (state) {
case 0: /* find first '>' */
  if (result->tot[pos] == '>') { pos++; state=2; continue; }
  pos += strcspn( result->tot+pos, "\n" );
case 1: /* not found: sync to newline */
  if (result->tot[pos] == '\n') { pos++; state=0; continue; }
  else pos++;
  continue;;
case 2: /* Got '>'; grab name */
  len = strcspn( result->tot+pos, " \t\n" );
  if (cnt >= result->count) {
    size_t siz;
    siz = result->count ? 2* result->count: 16;
    result->parts = realloc( result->parts
      , siz * sizeof *result->parts);
    for (  ; result->count < siz;result->count ++) {
      result->parts[cnt].name = NULL;
      result->parts[cnt].data = NULL;
      result->parts[cnt].friends = NULL;
      result->parts[cnt].size = 0;
      }
    }

  result->parts[cnt].name = result->tot+pos;
  result->parts[cnt].name[len] = 0;
  pos += 1+len;
  len = strspn( result->tot+pos, " \t\n" );
  pos += len;
  state++;
  continue;
case 3: /* grab data; for the moment, throw away reversed data */
  if (result->tot[pos] == '>') {
    if (fwd) {
      memcpy(result->parts[cnt].data, fwd->data, fwd->used ); result->parts[cnt].size = fwd->used;
      result->parts[cnt].data [ fwd->used ] = 0;
      fwd->used = 0; }
    if (rev) {
      /* memcpy(result->parts[cnt].data+result->parts[cnt].size, rev->data, rev->used );  */
      rev->used = 0;
      }
    if (result->parts[cnt].data) cnt++;
    pos++; state=2;
    continue;
    }
  len = strcspn( result->tot+pos, "\t\n" );
  if (!len) { /* empty line; what to do? skip it! */
    fprintf(stderr, "Empty\n" );
    pos++; state=1;
    continue; }
  if (!result->parts[cnt].data) {result->parts[cnt].data = result->tot+pos;  }
  fwd = strbuff_add(fwd, result->tot+pos, len);
  pos += len;
  if (result->tot[pos] == '\t' ) {
    pos += strspn(result->tot+pos, " \t" );
    len = strcspn( result->tot+pos, "\n" );
    rev =  strbuff_add(rev, result->tot+pos, len);
    pos += len;
    }
  pos += strspn(result->tot+pos, " \t\r\n" );
  }}
if (state == 3) {
  if (fwd) {
    memcpy(result->parts[cnt].data, fwd->data, fwd->used ); result->parts[cnt].size = fwd->used;
    result->parts[cnt].data [ fwd->used ] = 0;
    fwd->used = 0;
    }
  if (rev) {
    /* memcpy(result->parts[cnt].data+result->parts[cnt].size, rev->data, rev->used );  */
    rev->used = 0;
    }
  if (result->parts[cnt].data) cnt++;
  }
  /* final realloc */
result->parts = realloc( result->parts, cnt * sizeof *result->parts);
result->count  = cnt;
free (fwd);
free (rev);

result->alloc = malloc( result->count * sizeof result->alloc[0] );
if (result->alloc) {
  for (cnt = 0; cnt <  result->count; cnt++ ) result->alloc[cnt] = cnt;
  }
return result;

failfree:
free (fwd);
free (rev);
free (result);
return NULL;
}

/*
 * Read the whole file `name` into a freshly malloc()ed buffer.
 *
 * name  - path of the file to read.
 * sizep - receives the number of bytes actually read; set to 0 on failure.
 *
 * The buffer is one byte larger than the file and is NUL-terminated right
 * after the last byte read, so it can be handed to the str*() functions.
 *
 * Returns the buffer (the caller must free() it), or NULL when the file
 * cannot be opened or stat'ed, memory is exhausted, or read() fails.
 */
char * read_complete_file(char *name, size_t *sizep)
{
int fd;
ssize_t rc;
size_t size, want;
char *result;

struct stat st;

fd = open(name, O_RDONLY);

if (fd == -1) goto fail;
if (fstat(fd, &st) == -1) goto closefail;
if (st.st_size < 0) goto closefail;
want = (size_t) st.st_size;
result = malloc (1+want );
if (!result ) goto closefail;

for (size = 0; size < want;) {
  /* read() may deliver fewer bytes than asked for: append after what is
     already in the buffer instead of overwriting it from the start */
  rc = read(fd, result + size, want - size);
  if (rc < 0) goto freeclosefail;
  /* end of file before st_size bytes (file shrank): stop, don't spin */
  if (rc == 0) break;
  size += (size_t) rc;
  }
/* terminate after the bytes actually read */
result[size] = 0;

fprintf(stderr, "Read %lu bytes FROM %s\n"
  , (unsigned long) size, name);
close(fd);
*sizep = size;
return result;

freeclosefail:
  free(result);
closefail:
  close(fd);
fail:
  *sizep=0; return NULL;
}

Upvotes: 1

nos
nos

Reputation: 229058

You could just skip processing the lines you are not interested in:

/* Walk every input file line by line and act only on the lines that
   start with '@' or '>'. */
for (i = 0; i < (cIn-1); i++){
    while (fgets(buffer, 4999, inputFiles[i]) != NULL){
        /* any other line: do nothing */
        if (buffer[0] != '@' && buffer[0] != '>')
            continue;
        puts(buffer);
    }
}

Then you just replace the puts(buffer); with the code you need to handle the valid lines. (Although, from your example, it sounds like you would rather ignore only the lines starting with a #?)

Upvotes: 0

0xfee1dead
0xfee1dead

Reputation: 116

Instead of fgetpos(); fsetpos(); you might use
/* Step back over the line just read. strlen() returns an unsigned size_t,
   so convert it to long before negating instead of negating the unsigned
   value and relying on the implicit conversion.
   NOTE(review): on text-mode streams ISO C only guarantees fseek() offsets
   of 0 or values returned by ftell(); verify how the files are opened. */
fseek(inputFiles[i], -(long)strlen(buffer), SEEK_CUR);

Upvotes: 2

Related Questions