Reputation: 5365
A "kmer" is a DNA sequence of length K. A valid DNA sequence (for my purposes) can only contain the 4 following bases: A,C,T,G. I am looking for a C++ algorithm that simply outputs all possible combinations of these bases in alphabetical order into a string array. For example, if K = 2, The program should generate the following array:
kmers[0] = AA
kmers[1] = AC
kmers[2] = AG
kmers[3] = AT
kmers[4] = CA
kmers[5] = CC
kmers[6] = CG
kmers[7] = CT
kmers[8] = GA
kmers[9] = GC
kmers[10] = GG
kmers[11] = GT
kmers[12] = TA
kmers[13] = TC
kmers[14] = TG
kmers[15] = TT
If I'm thinking about this correctly, the problem really breaks down to converting a decimal integer to base 4 then substituting the appropriate bases. I thought I could use itoa for this, but itoa is not C standard, and my compiler did not support it. I welcome any clever ideas. Here is my sample code:
#include <iostream>
#include <string>
#include <math.h>
#define K 3
using namespace std;
int main() {
int num_kmers = pow(4,K);
string* kmers = NULL;
/* Allocate memory for kmers array */
kmers = new string[num_kmers];
/* Populate kmers array */
for (int i=0; i< pow(4,K); i++) {
// POPULATE THE kmers ARRAY HERE
}
/* Display all possible kmers */
for (int i=0; i< pow(4,K); i++)
cout << kmers[i] << "\n";
delete [] kmers;
}
Upvotes: 8
Views: 5220
Reputation: 121
#include <iostream>
#include <string>
#include <string_view>
#include <vector>
class kmer
{
public:
kmer();
kmer(std::string &str, int &k) : k_(k), str_(str)
{
for(int i = 0; i <= str_.size() - k_; ++i)
value_.push_back(str_.substr(i, k_));
}
auto value() { return value_; }
auto size() { return value_.size(); }
private:
std::vector<std::string_view> value_;
std::string_view str_;
int k_;
};
int main()
{
std::string seq{"AGCTAGCT"};
kmer k4_mer(seq, 4);
std::cout << "Total " << k4_mer.size() << " mers\n";
for(auto &i : k4_mer.value())
std::cout << i << "\n";
return 0;
}
Upvotes: 0
Reputation: 179907
Whoa, this shouldn't be so hard.
std::string kmers(int i, int K) {
static const char* DNA = "ACGT";
if (K==0) return std::string();
return DNA[i%4] + kmers(i / 4, K-1);
}
const int K = 2;
int main () {
for (int i = 0; i != 1<<(2*K); ++i)
std::cout << "kmers[" << i << "] = " << kmers(i,K) << std::endl;
}
Upvotes: 2
Reputation: 168636
This seems to me to be a perfect fit for a custom iterator. That way your main program can be simple:
std::vector<std::string> v;
v.reserve(kmerIterator<4>::end() - kmerIterator<4>::begin());
std::copy(kmerIterator<4>::begin(), kmerIterator<4>::end(),
std::back_inserter(v));
But, since we have implemented the kmer concept as an interator, we get to use all of the other generic algorithms, too. Since we implemented the kmer iterator as a Random Access Iterator, then finding the i-th kmer is trivial:
kmerIterator<4>::begin()[i]
Here is my complete program:
#include <iostream>
#include <iterator>
#include <algorithm>
#include <vector>
template<unsigned int n>
class kmerIterator :
public std::iterator<std::random_access_iterator_tag,
std::string,
unsigned int>
{
private:
typedef kmerIterator k; // 'cause I'm lazy
difference_type it;
kmerIterator(difference_type i) : it(i) {}
public:
kmerIterator() : it() {}
static k begin() {
return 0;
}
static k end() {
return difference_type(1) << n*2;
}
k& operator++() { ++it; return *this; }
k operator++(int) { return it++; }
k& operator--() { --it; return *this; }
k operator--(int) { return it--; }
k operator+(difference_type delta) { return it + delta; }
k operator-(difference_type delta) { return it - delta; }
difference_type operator-(const k& rhs) { return it - rhs.it; }
bool operator<(const k& rhs) const { return it < rhs.it; }
bool operator>(const k& rhs) const { return it > rhs.it; }
bool operator<=(const k& rhs) const { return it <= rhs.it; }
bool operator>=(const k& rhs) const { return it >= rhs.it; }
k operator+=(difference_type delta) { return it+=delta; }
k operator-=(difference_type delta) { return it-=delta; }
std::string operator[](difference_type delta) const { return *k(it+delta); }
std::string operator*() const {
std::string rz;
int i = 2*n;
do {
i -= 2;
rz += "ACGT"[(it>>i)&3];
} while(i);
return rz;
}
};
int main() {
std::copy(kmerIterator<2>::begin(),
kmerIterator<2>::end(),
std::ostream_iterator<std::string>(std::cout, "\n"));
std::vector<std::string> v;
v.reserve(kmerIterator<4>::end() - kmerIterator<4>::begin());
std::copy(kmerIterator<4>::begin(),
kmerIterator<4>::end(),
std::back_inserter(v));
std::cout << v[42] << "\n";
std::cout << kmerIterator<4>::begin()[56] << "\n";
}
Upvotes: 2
Reputation: 137820
You don't need to convert decimal to anything, once the user input text has been accepted.
It's also probably a mistake to create an array of strings, which grows exponentially with K
. Just print the output.
char bases[] = { 'A', 'C', 'T', 'G' };
std::vector< int > sequence( K ); // allow dynamic K
std::vector< char > output( K * ( 2 << K * 2 ) ); // flat sequence of sequences
std::vector< char >::iterator out_it = output.begin();
int i;
do {
// print current sequence
for ( i = 0; i < K; ++ i ) std::cout << bases[ sequence[ i ] ];
std::cout << '\n';
// store current sequence
for ( i = 0; i < K; ++ i ) * out_it ++ = bases[ sequence[ i ] ];
// advance to next sequence
for ( i = K; i > 0; -- i ) {
// increment the last base that we can
if ( sequence[ i - 1 ] != sizeof bases - 1 ) {
++ sequence[ i - 1 ];
break;
}
// reset bases that can't be incremented
sequence[ i - 1 ] = 0;
}
} while ( i > 0 ); // if i <= 0, failed to increment anything, we're done.
Upvotes: 2
Reputation: 48785
You would need to use recursion to be flexible (i.e. so that you could change K easily).
void populate(int depth, string base, string* kmers, int* kmers_offset)
{
if(depth == K)
{
kmers[*kmers_offset].assign(base);
(*kmers_offset)++;
}
else
{
static char bases[] = { 'A', 'C', 'G', 'T' };
for(int i = 0; i < 4; ++i)
populate(depth + 1, base + bases[i], kmers, kmers_offset);
}
}
And then call it like this:
int kmers_offset = 0;
populate(0, "", kmers, &kmers_offset);
Cheers.
Upvotes: 6