Sunday, January 12, 2025
Google search engine
HomeData Modelling & AICounting k-mers via Suffix Array

Counting k-mers via Suffix Array

Pre-requisite: Suffix Array. What are k-mers? The term k-mers typically refers to all the possible substrings of length k that are contained in a string. Counting all the k-mers in DNA/RNA sequencing reads is the preliminary step of many bioinformatics applications. What is a Suffix Array? A suffix array is a sorted array of all suffixes of a string. It is a data structure used, among others, in full text indices, data compression algorithms. More information can be found here. Problem: We are given a string str and an integer k. We have to find all pairs (substr, i) such that substr is a length – k substring of str that occurs exactly i times. 

Steps involved in the approach: 

Let’s take the word “banana$” as an example. 

Step 1: Compute the suffix array of the given text.

          6     $   
          5     a$
          3     ana$
          1     anana$
          0     banana$
          4     na$                    
          2     nana$

Step 2: Iterate through the suffix array keeping “curr_count”

  1. If the length of current suffix is less than k, then skip the iteration. That is, if k = 2, then iteration would be skipped when current suffix is $
  2. If the current suffix begins with the same length – k substring as the previous suffix, then increment curr_count. For example, during fourth iteration current suffix “anana$” starts with same substring of length k “an” as previous suffix “ana$” started with. So, we will increment curr_count in this case. 
  3. If condition 2 is not satisfied, then if length of previous suffix is equal to k, then that it is a valid pair and we will output it along with its current count, otherwise, we will skip that iteration.
                 curr_count  Valid Pair
 6     $           1                     
 5     a$          1
 3     ana$        1         (a$, 1)
 1     anana$      1
 0     banana$     2         (an, 2)
 4     na$         1         (ba, 1)               
 2     nana$       1         (na, 2)

Examples: 

Input : banana$ // Input text
Output : (a$, 1) // k- mers
         (an, 2)
         (ba, 1)
         (na, 2)
 
Input : neveropen
Output : (ee, 2) 
         (ek, 2)
         (fo, 1)
         (ge, 2)
         (ks, 2)
         (or, 1)
         (sf, 1)

The following is the C code for approach explained above: 

C++




// C++ program to solve K-mers counting problem
#include <bits/stdc++.h>
using namespace std;
 
// Structure to store data of a rotation
struct rotation {
    int index;
    char* suffix;
};
 
// Compares the rotations and
// sorts the rotations alphabetically
int cmpfunc(const void* x, const void* y)
{
    struct rotation* rx = (struct rotation*)x;
    struct rotation* ry = (struct rotation*)y;
    return strcmp(rx->suffix, ry->suffix);
}
 
// Takes input_text and its length as arguments
// and returns the corresponding suffix array
char** computeSuffixArray(char* input_text, int len_text)
{
    int i;
 
    // Array of structures to store rotations
    // and their indexes
    struct rotation suff[len_text];
 
    // Structure is needed to maintain old
    // indexes of rotations after sorting them
    for (i = 0; i < len_text; i++) {
        suff[i].index = i;
        suff[i].suffix = (input_text + i);
    }
 
    // Sorts rotations using comparison function
    // defined above
    qsort(suff, len_text, sizeof(struct rotation), cmpfunc);
 
    // Stores the suffixes of sorted rotations
    char** suffix_arr
        = (char**)malloc(len_text * sizeof(char*));
 
    for (i = 0; i < len_text; i++) {
        suffix_arr[i]
            = (char*)malloc((len_text + 1) * sizeof(char));
        strcpy(suffix_arr[i], suff[i].suffix);
    }
 
    // Returns the computed suffix array
    return suffix_arr;
}
 
// Takes suffix array, its size and valid length as
// arguments and outputs the valid pairs of k - mers
void findValidPairs(char** suffix_arr, int n, int k)
{
    int curr_count = 1, i;
    char* prev_suff = (char*)malloc(n * sizeof(char));
 
    // Iterates over the suffix array,
    // keeping a current count
    for (i = 0; i < n; i++) {
 
        // Skipping the current suffix
        // if it has length < valid length
        if (strlen(suffix_arr[i]) < k) {
 
            if (i != 0 && strlen(prev_suff) == k) {
                cout << "(" << prev_suff << ", "
                     << curr_count << ")" << endl;
                curr_count = 1;
            }
 
            strcpy(prev_suff, suffix_arr[i]);
            continue;
        }
 
        // Incrementing the curr_count if first
        // k chars of prev_suff and current suffix
        // are same
        if (!(memcmp(prev_suff, suffix_arr[i], k))) {
            curr_count++;
        }
        else {
 
            // Pair is valid when i!=0 (as there is
            // no prev_suff for i = 0) and when strlen
            // of prev_suff is k
            if (i != 0 && strlen(prev_suff) == k) {
                cout << "(" << prev_suff << ", "
                     << curr_count << ")" << endl;
                curr_count = 1;
                memcpy(prev_suff, suffix_arr[i], k);
                prev_suff[k] = '\0';
            }
            else {
                memcpy(prev_suff, suffix_arr[i], k);
                prev_suff[k] = '\0';
                continue;
            }
        }
 
        // Modifying prev_suff[i] to current suffix
        memcpy(prev_suff, suffix_arr[i], k);
        prev_suff[k] = '\0';
    }
 
    // Printing the last valid pair
    cout << "(" << prev_suff << ", " << curr_count << ")"
         << endl;
}
 
// Driver program to test functions above
int main()
{
    char input_text[] = "neveropen";
    int k = 2;
    int len_text = strlen(input_text);
 
    // Computes the suffix array of our text
    cout << "Input Text: " << input_text << endl;
    char** suffix_arr
        = computeSuffixArray(input_text, len_text);
 
    // Finds and outputs all valid pairs
    cout << "k-mers: " << endl;
    findValidPairs(suffix_arr, len_text, k);
 
    return 0;
}


C




// C program to solve K-mers counting problem
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
 
// Structure to store data of a rotation
struct rotation {
    int index;
    char* suffix;
};
 
// Compares the rotations and
// sorts the rotations alphabetically
int cmpfunc(const void* x, const void* y)
{
    struct rotation* rx = (struct rotation*)x;
    struct rotation* ry = (struct rotation*)y;
    return strcmp(rx->suffix, ry->suffix);
}
 
// Takes input_text and its length as arguments
// and returns the corresponding suffix array
char** computeSuffixArray(char* input_text,
                            int len_text)
{
    int i;
 
    // Array of structures to store rotations
    // and their indexes
    struct rotation suff[len_text];
 
    // Structure is needed to maintain old
    // indexes of rotations after sorting them
    for (i = 0; i < len_text; i++) {
        suff[i].index = i;
        suff[i].suffix = (input_text + i);
    }
 
    // Sorts rotations using comparison function
    // defined above
    qsort(suff, len_text, sizeof(struct rotation), cmpfunc);
 
    // Stores the suffixes of sorted rotations
    char** suffix_arr =
    (char**)malloc(len_text * sizeof(char*));
 
    for (i = 0; i < len_text; i++) {
        suffix_arr[i] =
        (char*)malloc((len_text + 1) * sizeof(char));
        strcpy(suffix_arr[i], suff[i].suffix);
    }
 
    // Returns the computed suffix array
    return suffix_arr;
}
 
// Takes suffix array, its size and valid length as
// arguments and outputs the valid pairs of k - mers
void findValidPairs(char** suffix_arr, int n, int k)
{
    int curr_count = 1, i;
    char* prev_suff = (char*)malloc(n * sizeof(char));
 
    // Iterates over the suffix array,
    // keeping a current count
    for (i = 0; i < n; i++) {
 
        // Skipping the current suffix
        // if it has length < valid length
        if (strlen(suffix_arr[i]) < k) {
 
            if (i != 0 && strlen(prev_suff) == k) {
                printf("(%s, %d)\n", prev_suff, curr_count);
                curr_count = 1;}
 
            strcpy(prev_suff, suffix_arr[i]);
            continue;
        }
 
        // Incrementing the curr_count if first
        // k chars of prev_suff and current suffix
        // are same
        if (!(memcmp(prev_suff, suffix_arr[i], k))) {
            curr_count++;
        }
        else {
 
            // Pair is valid when i!=0 (as there is
            // no prev_suff for i = 0) and when strlen
            // of prev_suff is k
            if (i != 0 && strlen(prev_suff) == k) {
                printf("(%s, %d)\n", prev_suff, curr_count);
                curr_count = 1;
                memcpy(prev_suff, suffix_arr[i], k);
                prev_suff[k] = '\0';
            }
            else {
                memcpy(prev_suff, suffix_arr[i], k);
                prev_suff[k] = '\0';
                continue;
            }
        }
 
        // Modifying prev_suff[i] to current suffix
        memcpy(prev_suff, suffix_arr[i], k);
        prev_suff[k] = '\0';
    }
 
    // Printing the last valid pair
    printf("(%s, %d)\n", prev_suff, curr_count);
}
 
// Driver program to test functions above
int main()
{
    char input_text[] = "neveropen";
    int k = 2;
    int len_text = strlen(input_text);
 
    // Computes the suffix array of our text
    printf("Input Text: %s\n", input_text);
    char** suffix_arr =
    computeSuffixArray(input_text, len_text);
 
    // Finds and outputs all valid pairs
    printf("k-mers: \n");
    findValidPairs(suffix_arr, len_text, k);
 
    return 0;
}


Output: 

Input Text: banana$ 
k-mers: 
(a$, 1)
(an, 2)
(ba, 1)
(na, 2)

Time Complexity: O(s*len_text*log(len_text)), assuming s is the length of the longest suffix.

Feeling lost in the world of random DSA topics, wasting time without progress? It’s time for a change! Join our DSA course, where we’ll guide you on an exciting journey to master DSA efficiently and on schedule.
Ready to dive in? Explore our Free Demo Content and join our DSA course, trusted by over 100,000 neveropen!

RELATED ARTICLES

Most Popular

Recent Comments