vdj_pipe
pipeline for processing DNA sequence data
nucleotide_index.hpp
Go to the documentation of this file.
1 
7 #ifndef NUCLEOTIDE_INDEX_HPP_
8 #define NUCLEOTIDE_INDEX_HPP_
9 #include "boost/algorithm/string/predicate_facade.hpp"
10 #include "boost/assert.hpp"
11 #if defined(NDEBUG)
12 //fix boost::multi_array warnings
13 //boost/multi_array/base.hpp:503:13: warning: unused variable 'bound_adjustment'
14 #define BOOST_DISABLE_ASSERTS
15 #endif
16 #include "boost/multi_array.hpp"
18 #include "vdj_pipe/exception.hpp"
19 
20 #define VDJ_PIPE_N_NUCLEOTIDES 16
21 
22 namespace vdj_pipe{
23 
24 enum Nucleotide {
25  Adenine = 0,
26  Cytosine = 1,
27  Guanine = 2,
28  Thymine = 3,
29  Any = 4,
30  Uracil = 5,
31  Purine = 6,
32  Pyrimidine = 7,
33  Ketone = 8,
34  Amine = 9,
35  Strong = 10,
36  Weak = 11,
37  not_A = 12,
38  not_C = 13,
39  not_G = 14,
40  not_T = 15
41 };
42 
45 inline Nucleotide nucleotide_index(const char c) {
46  switch (c) {
47  case 'A':
48  case 'a':
49  return Adenine;
50  case 'C':
51  case 'c':
52  return Cytosine;
53  case 'G':
54  case 'g':
55  return Guanine;
56  case 'T':
57  case 't':
58  return Thymine;
59  case 'U':
60  case 'u':
61  return Uracil;
62  case 'R':
63  case 'r':
64  return Purine;
65  case 'Y':
66  case 'y':
67  return Pyrimidine;
68  case 'K':
69  case 'k':
70  return Ketone;
71  case 'M':
72  case 'm':
73  return Amine;
74  case 'S':
75  case 's':
76  return Strong;
77  case 'W':
78  case 'w':
79  return Weak;
80  case 'B':
81  case 'b':
82  return not_A;
83  case 'D':
84  case 'd':
85  return not_C;
86  case 'H':
87  case 'h':
88  return not_G;
89  case 'V':
90  case 'v':
91  return not_T;
92  case 'N':
93  case 'n':
94  return Any;
95  }
96  BOOST_THROW_EXCEPTION(
98  << base_exception::msg_t("invalid character")
100  );
101  return Adenine; //never comes to this
102 }
103 
106 inline bool is_ambiguous(const char c) {
107  switch (c) {
108  case 'A':
109  case 'a':
110  case 'C':
111  case 'c':
112  case 'G':
113  case 'g':
114  case 'T':
115  case 't':
116  case 'U':
117  case 'u':
118  return false;
119  case 'R':
120  case 'r':
121  case 'Y':
122  case 'y':
123  case 'K':
124  case 'k':
125  case 'M':
126  case 'm':
127  case 'S':
128  case 's':
129  case 'W':
130  case 'w':
131  case 'B':
132  case 'b':
133  case 'D':
134  case 'd':
135  case 'H':
136  case 'h':
137  case 'V':
138  case 'v':
139  case 'N':
140  case 'n':
141  return true;
142  }
143  BOOST_THROW_EXCEPTION(
145  << base_exception::msg_t("invalid character")
147  );
148  return 0; //never comes to this
149 }
150 
154  BOOST_ASSERT((int)n < VDJ_PIPE_N_NUCLEOTIDES);
155  switch (n) {
156  case Adenine: return Thymine;
157  case Cytosine: return Guanine;
158  case Guanine: return Cytosine;
159  case Thymine: return Adenine;
160  case Uracil: return Adenine;
161  case Purine: return Pyrimidine;
162  case Pyrimidine: return Purine;
163  case Ketone: return Amine;
164  case Amine: return Ketone;
165  case Strong: return Weak;
166  case Weak: return Strong;
167  case not_A: return not_T;
168  case not_C: return not_G;
169  case not_G: return not_C;
170  case not_T: return not_A;
171  case Any: return Any;
172  }
173  return Any; //never comes to this
174 }
175 
178 inline char complement(const char c) {
179  switch(c) {
180  case 'A': return 'T';
181  case 'a': return 't';
182  case 'T': return 'A';
183  case 't': return 'a';
184  case 'C': return 'G';
185  case 'c': return 'g';
186  case 'G': return 'C';
187  case 'g': return 'c';
188  case 'U': return 'A';
189  case 'u': return 'a';
190  case 'R': return 'Y';
191  case 'r': return 'y';
192  case 'Y': return 'R';
193  case 'y': return 'r';
194  case 'K': return 'M';
195  case 'k': return 'm';
196  case 'M': return 'K';
197  case 'm': return 'k';
198  case 'S': return 'W';
199  case 's': return 'w';
200  case 'W': return 'S';
201  case 'w': return 's';
202  case 'N': return 'N';
203  case 'n': return 'n';
204  default:
205  BOOST_THROW_EXCEPTION(
207  << base_exception::msg_t("invalid nucleotide symbol")
209  );
210  }
211  return 0; //never comes to this
212 }
213 
216 inline char to_capital(const Nucleotide n) {
217  BOOST_ASSERT((int)n < VDJ_PIPE_N_NUCLEOTIDES);
218  switch (n) {
219  case Adenine: return 'A';
220  case Cytosine: return 'C';
221  case Guanine: return 'G';
222  case Thymine: return 'T';
223  case Uracil: return 'U';
224  case Purine: return 'R';
225  case Pyrimidine: return 'Y';
226  case Ketone: return 'K';
227  case Amine: return 'M';
228  case Strong: return 'S';
229  case Weak: return 'W';
230  case not_A: return 'B';
231  case not_C: return 'D';
232  case not_G: return 'H';
233  case not_T: return 'V';
234  case Any: return 'N';
235  }
236  return 'X'; //never comes to this
237 }
238 
241 inline char to_small(const Nucleotide n) {
242  BOOST_ASSERT((int)n < VDJ_PIPE_N_NUCLEOTIDES);
243  switch (n) {
244  case Adenine: return 'a';
245  case Cytosine: return 'c';
246  case Guanine: return 'g';
247  case Thymine: return 't';
248  case Uracil: return 'u';
249  case Purine: return 'r';
250  case Pyrimidine: return 'y';
251  case Ketone: return 'k';
252  case Amine: return 'm';
253  case Strong: return 's';
254  case Weak: return 'w';
255  case not_A: return 'b';
256  case not_C: return 'd';
257  case not_G: return 'h';
258  case not_T: return 'v';
259  case Any: return 'n';
260  }
261  return 'n'; //never comes to this
262 }
263 
265 
266 namespace detail{
267 
268 template<int A, int I, int P, int U>
269 inline scoring_matrix_t make_sm() {
270  scoring_matrix_t
272 
273  static const int p = VDJ_PIPE_N_NUCLEOTIDES * VDJ_PIPE_N_NUCLEOTIDES;
274 
275  const int n[p] =
276  {
277  // AG CT GT AC CG AT ^A ^C ^G ^T
278  // A C G T N U R Y K M S W B D H V
279  /* A */ A, I, I, I, U, I, P, I, I, P, I, P, I, P, P, P, // A
280  /* C */ I, A, I, I, U, I, I, P, I, P, P, I, P, I, P, P, // C
281  /* G */ I, I, A, I, U, I, P, I, P, I, P, I, P, P, I, P, // G
282  /* T */ I, I, I, A, U, A, I, P, P, I, I, P, P, P, P, I, // T
283  /* N */ U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, // N
284  /* U */ I, I, I, A, U, A, I, P, P, I, I, P, P, P, P, I, // U
285  /* R */ P, I, P, I, U, I, P, I, P, P, P, P, P, P, P, P, // AG
286  /* Y */ I, P, I, P, U, P, I, P, P, P, P, P, P, P, P, P, // CT
287  /* K */ I, I, P, P, U, P, P, P, P, I, P, P, P, P, P, P, // GT
288  /* M */ P, P, I, I, U, I, P, P, I, P, P, P, P, P, P, P, // AC
289  /* S */ I, P, P, I, U, I, P, P, P, P, P, I, P, P, P, P, // CG
290  /* W */ P, I, I, P, U, P, P, P, P, P, I, P, P, P, P, P, // AT
291  /* B */ I, P, P, P, U, P, P, P, P, P, P, P, P, P, P, P, // CGT
292  /* D */ P, I, P, P, U, P, P, P, P, P, P, P, P, P, P, P, // AGT
293  /* H */ P, P, I, P, U, P, P, P, P, P, P, P, P, P, P, P, // ACT
294  /* V */ P, P, P, I, U, I, P, P, P, P, P, P, P, P, P, P // ACG
295  // A C G T N U R Y K M S W B D H V
296  // AG CT GT AC CG AT ^A ^C ^G ^T
297  };
298  std::copy(n, n + p, m.data());
299  return m;
300 }
301 
302 }//namespace detail
303 
306 template<int Match, int Mismatch, int Approximate, int Uncertain>
307 inline scoring_matrix_t const& scoring_matrix() {
308  static const scoring_matrix_t m =
309  detail::make_sm<Match,Mismatch,Approximate,Uncertain>();
310  return m;
311 }
312 
315 inline int identity(
316  const Nucleotide n1,
317  const Nucleotide n2,
318  scoring_matrix_t const& sm = scoring_matrix<2,-2,1,0>()
319 ) {
320  BOOST_ASSERT(
321  sm.shape()[0] == VDJ_PIPE_N_NUCLEOTIDES &&
322  sm.shape()[1] == VDJ_PIPE_N_NUCLEOTIDES
323  );
324  BOOST_ASSERT((int)n1 < VDJ_PIPE_N_NUCLEOTIDES);
325  BOOST_ASSERT((int)n2 < VDJ_PIPE_N_NUCLEOTIDES);
326  BOOST_ASSERT(sm[n1][n2] == sm[n2][n1]);
327  return sm[n1][n2];
328 }
329 
330 struct Is_ambiguous : public boost::algorithm::predicate_facade<Is_ambiguous> {
332  typedef bool result_type;
333  bool operator() (const char c) const {return vdj_pipe::is_ambiguous(c);}
334 };
335 
336 }//namespace vdj_pipe
337 #endif /* NUCLEOTIDE_INDEX_HPP_ */
bool result_type
Definition: nucleotide_index.hpp:332
int identity(Seq const &s1, const boost::string_ref s2, scoring_matrix_t const &sm, const std::size_t=0)
Definition: sequence_fls.hpp:144
char to_small(const Nucleotide n)
Definition: nucleotide_index.hpp:241
R : A, G.
Definition: nucleotide_index.hpp:31
U.
Definition: nucleotide_index.hpp:30
Definition: nucleotide_index.hpp:29
boost::multi_array< int, 2 > scoring_matrix_t
Definition: nucleotide_index.hpp:264
scoring_matrix_t const & scoring_matrix()
Definition: nucleotide_index.hpp:307
Is_ambiguous()
Definition: nucleotide_index.hpp:331
Definition: find_shared.hpp:22
bool is_ambiguous(const char c)
Definition: nucleotide_index.hpp:106
Nucleotide
Definition: nucleotide_index.hpp:24
scoring_matrix_t make_sm()
Definition: nucleotide_index.hpp:269
K : G, T, U.
Definition: nucleotide_index.hpp:33
M : A, C.
Definition: nucleotide_index.hpp:34
V : A C G.
Definition: nucleotide_index.hpp:40
Definition: nucleotide_index.hpp:330
Definition: nucleotide_index.hpp:27
Main namespace of vdj_pipe library.
Definition: keywords_variable.hpp:11
D : A G T.
Definition: nucleotide_index.hpp:38
Definition: nucleotide_index.hpp:25
boost::error_info< struct errinfo_str1_, std::string > str1_t
Definition: exception.hpp:25
S : C, G.
Definition: nucleotide_index.hpp:35
Definition: nucleotide_index.hpp:26
Nucleotide complement(const Nucleotide n)
Definition: nucleotide_index.hpp:153
const std::size_t n
Definition: vector_set_test.cpp:26
Definition: nucleotide_index.hpp:28
W : A, T, U.
Definition: nucleotide_index.hpp:36
Nucleotide nucleotide_index(const char c)
Definition: nucleotide_index.hpp:45
Definition: exception.hpp:23
char to_capital(const Nucleotide n)
Definition: nucleotide_index.hpp:216
boost::error_info< struct errinfo_message_, std::string > msg_t
Definition: exception.hpp:24
std::string sanitize(const char c)
Definition: sanitize_string.cpp:53
H : A C T.
Definition: nucleotide_index.hpp:39
B : C G T.
Definition: nucleotide_index.hpp:37
Y : C, T, U.
Definition: nucleotide_index.hpp:32
#define VDJ_PIPE_N_NUCLEOTIDES
Definition: nucleotide_index.hpp:20