vdj_pipe
pipeline for processing DNA sequence data
sequence_fls.hpp
Go to the documentation of this file.
1 
7 #ifndef SEQUENCE_FLS_HPP_
8 #define SEQUENCE_FLS_HPP_
9 #include <algorithm>
10 #include <ostream>
11 #include "boost/assert.hpp"
12 #include "boost/cstdint.hpp"
13 #include "boost/integer/static_log2.hpp"
15 #include "vdj_pipe/exception.hpp"
17 
18 namespace vdj_pipe{
19 
24 template<unsigned N = 4, typename S = boost::uint_least32_t> class Seq_fls {
25  static const unsigned BITS_PER_CHAR = boost::static_log2<N>::value;
26  static const unsigned MAX_CHARS = sizeof(S) * 8 / BITS_PER_CHAR;
27  static const unsigned MASK = ~(~0 << BITS_PER_CHAR);
28 
29 public:
30  static const unsigned alphabet_size = N;
31  static std::size_t length() {return MAX_CHARS;}
32  typedef Seq_fls self_type;
33 
34  Seq_fls() {}
35 
36  explicit Seq_fls(const boost::string_ref seq) : val_(0) {
37  assign(seq);
38  }
39 
40  Seq_fls(self_type const& ss) : val_(ss.val_) {}
41  std::size_t size() const {return MAX_CHARS;}
42  self_type& operator=(self_type const& ss) {
43  val_ = ss.val_;
44  return *this;
45  }
46 
47  self_type& operator=(const boost::string_ref seq) {
48  val_ = 0;
49  assign(seq);
50  return *this;
51  }
52 
53  bool operator==(self_type const& sf) const {return val_ == sf.val_;}
54  bool operator!=(self_type const& sf) const {return !(*this == sf);}
55 
56  Nucleotide operator[](const std::size_t i) const {
57  BOOST_ASSERT(i < MAX_CHARS);
58  return Nucleotide((val_>> i * BITS_PER_CHAR) & MASK);
59  }
60 
61  void set(const unsigned i, const Nucleotide n) {
62  BOOST_ASSERT(i < MAX_CHARS);
63  BOOST_ASSERT(n < alphabet_size);
64  val_ &= ~(MASK << i * BITS_PER_CHAR);
65  val_ |= n << i * BITS_PER_CHAR;
66  }
67 
68  std::string to_string(std::size_t n = self_type::length()) const {
69  std::string s(n, 'X');
70  n = std::min(self_type::length(), n);
71  for(std::size_t i = 0; i != n; ++i) {
72  s[i] = to_capital(this->operator[](i));
73  }
74  return s;
75  }
76 
77  std::ostream& print(
78  std::ostream& os,
79  std::size_t n = self_type::length()
80  ) const {
81  n = std::min(self_type::length(), n);
82  for(std::size_t i = 0; i != n; ++i) {
83  os << to_capital(this->operator[](i));
84  }
85  return os;
86  }
87 
88 private:
89  S val_;
90 
91  void assign(const boost::string_ref seq) {
92  if( seq.size() > size() ) BOOST_THROW_EXCEPTION(
94  << base_exception::msg_t("sequence too long")
95  << base_exception::int1_t(seq.size())
96  << base_exception::int2_t(size())
97  );
98  for(unsigned i = seq.size(); i != 0; --i) {
99  const Nucleotide n = nucleotide_index(seq[i-1]);
100  if( n >= alphabet_size ) {
101  std::string c("\"");
102  c += seq[i-1];
103  c += '\"';
104  BOOST_THROW_EXCEPTION(
106  << base_exception::msg_t("character not supported")
107  << base_exception::str1_t(sanitize(seq[i-1]))
108  );
109  }
110  val_ <<= BITS_PER_CHAR;
111  val_ |= n;
112  }
113  }
114 
115  friend inline std::size_t hash_value(self_type const& seq) {return seq.val_;}
116 
117  friend std::ostream& operator<<(std::ostream& os, self_type const& seq) {
118  return seq.print(os);
119  }
120 
128  friend bool assign(self_type& seq1, const boost::string_ref seq2) {
129  if( seq2.size() > length() ) return false;
130  seq1.val_ = 0;
131  for(unsigned i = seq2.size(); i != 0; --i) {
132  const Nucleotide n = nucleotide_index(seq2[i-1]);
133  if( n >= alphabet_size ) return false;
134  seq1.val_ <<= BITS_PER_CHAR;
135  seq1.val_ |= n;
136  }
137  return true;
138  }
139 };
140 
143 template<class Seq> inline int
145  Seq const& s1,
146  const boost::string_ref s2,
147  scoring_matrix_t const& sm,
148  const std::size_t = 0
149 ) {
150  int i = 0;
151  for(std::size_t n = 0; n != s1.size() && n != s2.size(); ++n) {
152  const Nucleotide n1 = s1[n];
153  const Nucleotide n2 = nucleotide_index(s2[n]);
154  i += identity(n1, n2, sm);
155  }
156  return i;
157 }
158 
161 template<unsigned N, typename S> inline int
163  const Seq_fls<N,S> seq1,
164  const Seq_fls<N,S> seq2,
165  scoring_matrix_t const& sm,
166  const unsigned len = N
167 ) {
168  int i = 0;
169  for( unsigned j = 0; j != len; ++j ) {
170  i += identity(seq1[j], seq2[j], sm);
171  }
172  return i;
173 }
174 
175 }//namespace vdj_pipe
176 #endif /* SEQUENCE_FLS_HPP_ */
store short sequence in an integer
Definition: sequence_fls.hpp:24
int identity(Seq const &s1, const boost::string_ref s2, scoring_matrix_t const &sm, const std::size_t=0)
Definition: sequence_fls.hpp:144
friend bool assign(self_type &seq1, const boost::string_ref seq2)
Definition: sequence_fls.hpp:128
Definition: find_shared.hpp:22
Nucleotide
Definition: nucleotide_index.hpp:24
Main namespace of vdj_pipe library.
Definition: sequence_file.hpp:14
Definition: exception.hpp:23