Problem

A matrix is a rectangular table of values divided into rows and columns. An m×n matrix has m rows and n columns. Given a matrix A, we write Ai,j to indicate the value found at the intersection of row i and column j.

Say that we have a collection of DNA strings, all having the same length n. Their profile matrix is a 4×n matrix P in which P1,j represents the number of times that ‘A’ occurs in the jth position of one of the strings, P2,j represents the number of times that C occurs in the jth position, and so on (see below).

A consensus string c is a string of length n formed from our collection by taking the most common symbol at each position; the jth symbol of c therefore corresponds to the symbol having the maximum value in the j-th column of the profile matrix. Of course, there may be more than one most common symbol, leading to multiple possible consensus strings.

            A T C C A G C T
            G G G C A A C T
            A T G G A T C T
DNA Strings A A G C A A C C
            T T G G A A C T
            A T G C C A T T
            A T G G C A C T

            A   5 1 0 0 5 5 0 0
Profile     C   0 0 1 4 2 0 6 1
            G   1 1 6 3 0 1 0 0
            T   1 5 0 0 0 1 1 6

Consensus    A T G C A A C T

Given: A collection of at most 10 DNA strings of equal length (at most 1 kbp) in FASTA format.
Return: A consensus string and profile matrix for the collection. (If several possible consensus strings exist, then you may return any one of them.)

Sample Dataset

>Rosalind_1
ATCCAGCT
>Rosalind_2
GGGCAACT
>Rosalind_3
ATGGATCT
>Rosalind_4
AAGCAACC
>Rosalind_5
TTGGAACT
>Rosalind_6
ATGCCATT
>Rosalind_7
ATGGCACT

Sample Output

ATGCAACT
A: 5 1 0 0 5 5 0 0
C: 0 0 1 4 2 0 6 1
G: 1 1 6 3 0 1 0 0
T: 1 5 0 0 0 1 1 6

Solution

这道题给出alignment,要求生成一个碱基的频数表,并给出consensus序列。

C version

在Computing GC content这一道题中就引入了FASTA格式,但那道题太简单,根本不需要存储序列,而这道题不同,我们需要考虑到序列的解析存储问题。这道题虽然说FASTA的描述行和序列行在不同序列中都是等长的,但现实中的FASTA文件并非如此,我们需要考虑到这一点,并实现一个通用的解析函数。这里再次使用了链表,最终读出来的序列,也是以链表形式返回,当然我们很容易写个wrapper function,返回SEQ结构体的指针构成的向量。

首先定义了readFASTA.h

typedef struct SEQ {  
  char * desc;    
  char * seq;    
  int width;    
  struct SEQ * next;
} SEQ;  
  
typedef struct CHLL {  // CHaracter Link-List
  char ch;  
  struct CHLL * next;
} CHLL;

SEQ * readFASTA(char *filename);

SEQ * readFASTA(char *filename) {
  FILE *INFILE;
  INFILE = fopen(filename, "r");

  SEQ *head = malloc(sizeof(SEQ));
  SEQ *curr;
  curr = head;  
  char ch;    
  int desc_line = 0, i;    
  while( (ch = fgetc(INFILE)) != EOF) {
    SEQ *SEQ_NODE = malloc(sizeof(SEQ));

    CHLL *CHLL_head = malloc(sizeof(CHLL));
    CHLL *CHLL_curr;
    CHLL_curr = CHLL_head;    // description line
    if ( ch == '>') {
      desc_line = 1;
    }  
    if (desc_line == 1) {
      i = 0;  
      while( (ch = fgetc(INFILE) ) != '\n') {
          CHLL *CHLL_NODE = malloc(sizeof(CHLL));
          i++;
          CHLL_NODE->ch = ch;
          CHLL_curr->next = CHLL_NODE;
          CHLL_curr = CHLL_NODE;
      }
    }      
    int desc_width = i;
    CHLL_curr = CHLL_head->next;  
    char* desc = malloc(sizeof(char) * desc_width);      
    for (i=0; i < desc_width; i++) {
      desc[i] = CHLL_curr->ch;
      CHLL_curr = CHLL_curr->next;
    }

    SEQ_NODE->desc = desc;    // sequence lines
    // re-initial the CHLL link list to store sequence characters
    CHLL_curr = CHLL_head;
    desc_line = 0;
    i = 0;      
    while( (ch=fgetc(INFILE)) != '>' && ch != EOF) {  
      if (ch == '\n') {  
          continue;
      }
      CHLL *CHLL_NODE = malloc(sizeof(CHLL));
      CHLL_NODE->ch = ch;
      CHLL_curr->next = CHLL_NODE;
      CHLL_curr = CHLL_NODE;
      i++;
    }  
    int seq_width = i;      
    char * seq = malloc(sizeof(char)*seq_width);
    CHLL_curr = CHLL_head->next;      
    for (i=0; i < seq_width; i++) {
      seq[i] = CHLL_curr->ch;
      CHLL_curr = CHLL_curr->next;
    }

    SEQ_NODE->seq = seq;
    SEQ_NODE->width = seq_width;

    curr->next = SEQ_NODE;
    curr = SEQ_NODE;

    desc_line = 1;
  }    
  return head;
}

有了FASTA的解析函数之后,问题就容易了,无非是读文件,核苷酸计数,最近打印结果。

#include<stdio.h>
#include<stdlib.h>
#include "include/readFASTA.h"

int main() {
  SEQ * head, *curr;
  head = readFASTA("DATA/rosalind_cons.txt");
  curr = head;
  int width=0;
  while(curr = curr->next) {
    if (width < curr->width) {
      width = curr->width;
    }
  }

  int NT_type = 4; // ACGT
  int cons[NT_type][width];
  int r, c;
  for (r=0; r < NT_type; r++) {
    for (c=0; c < width; c++) {
      cons[r][c] = 0;
    }
  }

  curr=head;
  int i;
  while(curr = curr->next) {
    for (i=0; i < curr->width; i++) {
      switch((curr->seq)[i]) {
      case 'A':
          cons[0][i]++;
          break;
      case 'C':
          cons[1][i]++;
          break;
      case 'G':
          cons[2][i]++;
          break;
      case 'T':
          cons[3][i]++;
          break;
      }
    }
  }

  char NT[4] = "ACGT";
  int max_r_idx, max_r=0;
  for (c=0; c < width; c++) {
    for (r=0; r < NT_type; r++) {
      if (max_r < cons[r][c]) {
          max_r = cons[r][c];
          max_r_idx = r;
      }
    }
    printf("%c", NT[max_r_idx]);
    max_r = 0;
  }
  printf("\n");
  for (r=0; r < NT_type; r++) {
    printf("%c: ", NT[r]);
    for (c=0; c < width; c++) {
      printf("%d ", cons[r][c]);
    }
    printf("\n");
  }
}

Python version

这里我读序列就是看到>就证明上一条序列读完了,可以更新计数了。
而这个更新,主要是zip函数稍微难懂一点,str是序列,而list是列表,str如果是char一样就是1不然是0,所以 [1 if x == char else 0 for x in str] 这一句根据str生成了一个0,1的列表,这个列表当然也和list等长,用zip对两个列表同时进行迭代,并加和,这样就更新了计数。

#!/usr/bin/env python3  
  
def update_cnt(list, char, str):
    res = [a + b for a,b in zip(list, [1 if x == char else 0 for x in str])]    return(res)def print_cnt(list, label):
    print(label, end = ": ")  
    for x in list:
        print(x, end=" ")
    print()

f = open("DATA/rosalind_cons.txt", 'r')
fas = f.readlines()

seq = ''  
j = 0  
for i in range(len(fas)):  
    if fas[i][0] == '>' and j == 0:
        j += 1
        next      
    elif fas[i][0] == '>':  
        if j == 1:
            A = [0 for x in seq]
            C = A
            G = A
            T = A
        A = update_cnt(A, 'A', seq)
        C = update_cnt(C, 'C', seq)
        G = update_cnt(G, 'G', seq)
        T = update_cnt(T, 'T', seq)
        j += 1
        seq = ''
    else:
        seq += fas[i].strip()  
  
## last seq  
A = update_cnt(A, 'A', seq)
C = update_cnt(C, 'C', seq)
G = update_cnt(G, 'G', seq)
T = update_cnt(T, 'T', seq)

consensus = ''  
for i in range(len(A)):
    m = max(A[i], C[i], G[i], T[i])  
    if A[i] == m:
        consensus += 'A'
    elif C[i] == m:
        consensus += 'C'
    elif G[i] == m:
        consensus += 'G'
    elif T[i] == m:
        consensus += 'T'  
  
print(consensus)

print_cnt(A, 'A')
print_cnt(C, 'C')
print_cnt(G, 'G')
print_cnt(T, 'T')