Joe Armstrong
Joe Armstrong

Reputation: 1

Binary search between two files in C

I'm trying to apply some previously written code to some DNA sequencing data that I have collected. I would like to match my sequencing reads to a document containing specific barcodes and output the number of hits for each match. I am having some very basic trouble with calling the two files to compare. The code compiles, however, when I try to call the file "MOBYtest" in line 77, the output is 0, which indicates that the file is not being called appropriately. Does anyone have any insight into how I can incorporate the two files into the following code?

####################################################################################################
#### Supplementary File 2, barcodebin.c ############################################################
####
#### Barcode assigned to a gene using standard binary search
####
#### Can Alkan
#### 2010
####################################################################################################
*/
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

typedef struct spair{
  char *name;
  char *seq;
  int count;
}_spair;


int binSearch(char *seq, int s, int e);
void revcomp(char *s, char *r);

static int maxfunc(const void *p1, const void *p2);

struct spair **seqs;
int seqlen;

int main(int argc, char **argv){
  FILE *fp;
  int i;
  char fname[100];
  int nseq;
  char sequence[100]; char name[100];
  char sname[100];
  char revseq[100];
  int retCode=100000;
  int s, e; int olds;
  int found;
  int crop;
  int len;
  int longest_barcode=0;
  int fastq=0;
  char line[1000];
  int loop_done=0;
  int crop_lim;
  int min_barcode = 5;
  int verbose = 0;
  
  fname[0]=0; nseq = 0; sname[0]=0;
  for (i=1;i<argc;i++){
    if (!strcmp(argv[i], "-i"))
      strcpy(fname, argv[i+1]);
    else if (!strcmp(argv[i], "-s"))
      strcpy(sname, argv[i+1]);
    else if (!strcmp(argv[i], "-n"))
      nseq = atoi(argv[i+1]);
    else if (!strcmp(argv[i], "-m"))
      min_barcode = atoi(argv[i+1]);
    else if (!strcmp(argv[i], "-v"))
      verbose = 1;
  }

  if (fname[0]==0 || nseq==0 || sname[0]==0) return 0;

  fp = fopen(fname, "MOBYtest");
  if ( !fp ) {
    fprintf(stderr, "Could not open file %s\n", fname);
    return 255;
  }
    

  i=0;

  seqs = (struct spair ** ) malloc (sizeof (struct spair *) * nseq);
 
  if (verbose) {
    fprintf(stderr, "Loading  %d sequences ..", nseq);
  }

  while (fscanf(fp, ">%s\n%s\n", name, sequence) > 0 ){
    seqlen=strlen(sequence);
    if (seqlen > longest_barcode) longest_barcode = seqlen;
    if (seqlen>=3){
      seqs[i] = (struct spair * ) malloc (sizeof (struct spair));
      seqs[i]->name = (char *) malloc (sizeof (char) * (strlen (name)+1));
      seqs[i]->seq = (char *) malloc (sizeof (char) * (2*seqlen+1));
      seqs[i]->count = 0;

      strcpy(seqs[i]->seq, sequence);
      strcpy(seqs[i]->name, name);

      i++;
    }
  }

  nseq = i;

  qsort(seqs, nseq, sizeof(struct spair *), maxfunc);

  if (verbose) {
    fprintf(stderr, "  done. %d left. Longest barcode: %d bp\n", nseq, longest_barcode);
  }

  fclose(fp);

  fp = fopen(sname, "r");
  if ( !fp ) {
    fprintf(stderr, "Could not open file %s\n", sname);
    return 255;
  }


  while (fscanf(fp, "%s\n%s\n", name, sequence) > 0){



    if (name[0]=='@'){
      fgets(line, 1000, fp); // + line
      fgets(line, 1000, fp); // quality line
    }


    sequence[longest_barcode]=0;

    strcpy(revseq, sequence);
    s = 0, e = nseq; retCode = 10000; olds=-10;
    found=0;


    crop = 0;
    len = strlen(revseq);

    crop_lim = longest_barcode - min_barcode + 1; // ad hoc

    while (crop<crop_lim){
      revseq[len-crop]=0;

      while (s<e && retCode!=0){
    retCode=binSearch(revseq, s, e);
    if (retCode < 0){e = (s+e)/2;}
    else if (retCode > 0){olds=s; s = (s+e)/2;}
    
    else if (retCode == 0){

      seqs[(s+e)/2]->count++;
      crop = crop_lim+1;
      break;
    }
    
    if (s>=e || s==olds){

      crop++;

      s = 0, e = nseq; retCode = 10000; olds=-10;
      break;

    }
    
      }
    }

  }
 
  if (verbose) {
    fprintf(stderr, "  done.\n");
  }
  fprintf(stdout, "Barcode_Name\tBarcode_Length\tFrequency\n");
  for (i=0;i<nseq;i++)
    fprintf(stdout, "%s\t%d\t%d\n", seqs[i]->name, strlen(seqs[i]->seq), seqs[i]->count);
}

int binSearch(char *seq, int s, int e){
  int med = (s+e)/2;
  
  int ret;

  ret = strcmp(seq, seqs[med]->seq);

  return ret;
}

void revcomp(char *s, char *r){
  int len = strlen(s);
  int  i;

  for (i=0;i<len;i++){
    switch(s[i]){
    case 'A': r[len-i-1] = 'T'; break;
    case 'T': r[len-i-1] = 'A'; break;
    case 'C': r[len-i-1] = 'G'; break;
    case 'G': r[len-i-1] = 'C'; break;
    default:  r[len-i-1] = s[i]; break;
    }
  }
  r[len]=0;
}


static int maxfunc(const void *p1, const void *p2){
  struct spair *a, *b;

  int ret;
  a = *((struct spair **)p1);
  b = *((struct spair **)p2);

  ret = strcmp(a->seq, b->seq);

  return ret;


}

Upvotes: 0

Views: 77

Answers (1)

einpoklum
einpoklum

Reputation: 132108

Your problem is here:

fp = fopen(fname, "MOBYtest");

that's not how you call fopen(). If you look at the man page for fopen() you'll see the signature:

FILE *fopen(const char *pathname, const char *mode);

So, you don't "plug" your filename into the program, you pass it when invoking, i.e. if you compiled this file as my_prog, you would execute something like:

my_prog -i "MOBYtest" -s "my_list_of_barcodes_I_guess" -n 100

and the MOBYtest string would come into fopen() via the pathname parameter. Now, the mode for opening the file is one or "r", "w", a" etc., which correspond to "read", "write", "append" etc. Again, see the man page.


Some more advice:

  • Always turn the warning flags on during compilation. You definitely got some warnings about your code - heed them.
  • Use common/standard facilities for common tasks such as argument parsing - don't write your own parsing code. Specifically, you could use getopt, available on any POSIX system; or maybe try argparse (a library available on GitHub, could be a bit challenging for newbie to set up and include in the build).

Upvotes: 1

Related Questions