vdj_pipe
pipeline for processing DNA sequence data
sequence_map_full.hpp
Go to the documentation of this file.
1 
7 #ifndef SEQUENCE_MAP_FULL_HPP_
8 #define SEQUENCE_MAP_FULL_HPP_
9 #include <vector>
10 #include "boost/foreach.hpp"
11 #include "boost/utility/string_ref.hpp"
18 
19 namespace vdj_pipe{
20 
23 class Seq_map_full {
24 public:
25  typedef std::vector<Nucleotide> seq_type;
27 
28 private:
31 
32  static seq_type to_vector(const boost::string_ref s) {
33  seq_type sv(s.size());
34  for(std::size_t n = 0; n != sv.size(); ++n) {
35  sv[n] = nucleotide_index(s[n]);
36  }
37  return sv;
38  }
39 
40 public:
42  typedef iterator const_iterator;
43  struct Err : public base_exception{};
44 
45  static std::string to_string(seq_type const& sv) {
46  std::string s(sv.size(), 'X');
47  for(std::size_t n = 0; n != sv.size(); ++n) {
48  s[n] = to_capital(sv[n]);
49  }
50  return s;
51  }
52 
53  explicit Seq_map_full(
54  const bool allow_duplicate_seq = false
55  )
56  : seqs_(Mid_id(1)),
57  names_(Mid_id(1)),
58  allow_duplicate_seq_(allow_duplicate_seq),
59  min_len_(std::numeric_limits<std::size_t>::max()),
60  max_len_(0)
61  {}
62 
63  iterator begin() const {return iterator(seqs_.min_id());}
64  iterator end() const {return ++iterator(seqs_.max_id());}
65  seq_type seq(const Mid_id id) const {return seqs_[id];}
66  std::string seq_string(const Mid_id id) const {return to_string(seqs_[id]);}
67  std::string const& name(const Mid_id id) const {return names_[id];}
68  Mid_id max_id() const {return seqs_.max_id();}
69  std::size_t min_len() const {return min_len_;}
70  std::size_t max_len() const {return max_len_;}
71  Mid_id const* find_exact(seq_type const& seq) const {return seqs_.find(seq);}
72 
73  Mid_id const* find_exact(const boost::string_ref seq) const {
74  return find_exact(to_vector(seq));
75  }
76 
77  template<class Seq> bool find_closest(
78  const Seq s,
79  match_type& m,
80  scoring_matrix_t const& sm
81  ) const {
82  if( Mid_id const* id = find_exact(s) ) {
83  return m.combine(*id, identity(seq(*id), s, sm));
84  }
85 
86  bool found = false;
87  BOOST_FOREACH(const Mid_id id, *this) {
88  const int n = identity(seq(id), s, sm);
89  if( m.combine(id, n) ) found = true;
90  }
91  return found;
92  }
93 
94  Mid_id insert(std::string name, std::string const& seq) {
95  if( name.empty() ) name = seq;
96 
97  BOOST_ASSERT(seqs_.size() == names_.size());
98  const seq_type s = to_vector(seq);
99  if( Mid_id const* id = seqs_.find(s) ) {
100  if( allow_duplicate_seq_ ) {
101  if( names_[*id] != name ) BOOST_THROW_EXCEPTION(
102  Err()
103  << Err::msg_t("same sequence, different name")
104  << Err::str1_t(sanitize(seq))
105  << Err::str2_t(sanitize(names_[*id]))
106  << Err::str3_t(sanitize(name))
107  );
108  return *id;
109  }
110  BOOST_THROW_EXCEPTION(
111  Err()
112  << Err::msg_t("duplicate match element sequence")
113  << Err::str1_t(sanitize(seq))
114  << Err::str2_t(sanitize(name))
115  );
116  }
117 
118  const std::pair<Mid_id,bool> p1 = seqs_.insert(s);
119  const Mid_id id = names_.insert(name);
120  unused_variable(id);
121  BOOST_ASSERT(id);
122  BOOST_ASSERT(p1.first == id);
123  if( min_len_ > seq.size() ) min_len_ = seq.size();
124  if( max_len_ < seq.size() ) max_len_ = seq.size();
125  return p1.first;
126  }
127 
128 private:
129  seq_map seqs_;
130  name_map names_;
132  std::size_t min_len_, max_len_;
133 };
134 
135 }//namespace vdj_pipe
136 #endif /* SEQUENCE_MAP_FULL_HPP_ */
std::string const & name(const Mid_id id) const
Definition: sequence_map_full.hpp:67
Best_match_pair< Mid_id, int > match_type
Definition: sequence_map_full.hpp:26
Definition: sequence_map_full.hpp:23
iterator begin() const
Definition: sequence_map_full.hpp:63
iterator const_iterator
Definition: sequence_map_full.hpp:42
std::pair< id_type, bool > insert(obj_type const &obj)
Definition: id_bimap.hpp:160
int identity(Seq const &s1, const boost::string_ref s2, scoring_matrix_t const &sm, const std::size_t=0)
Definition: sequence_fls.hpp:144
id_type min_id() const
Definition: id_bimap.hpp:119
Definition: best_match_pair.hpp:16
std::size_t min_len() const
Definition: sequence_map_full.hpp:69
std::size_t size() const
Definition: id_map.hpp:37
std::string seq_string(const Mid_id id) const
Definition: sequence_map_full.hpp:66
Definition: find_shared.hpp:22
name_map names_
Definition: sequence_map_full.hpp:130
STL namespace.
void unused_variable(T const &)
Definition: unused_variable.hpp:14
std::size_t max_len_
Definition: sequence_map_full.hpp:132
Id_iterator< Mid_id > iterator
Definition: sequence_map_full.hpp:41
Mid_id max_id() const
Definition: sequence_map_full.hpp:68
static seq_type to_vector(const boost::string_ref s)
Definition: sequence_map_full.hpp:32
iterator end() const
Definition: sequence_map_full.hpp:64
static std::string to_string(seq_type const &sv)
Definition: sequence_map_full.hpp:45
bool find_closest(const Seq s, match_type &m, scoring_matrix_t const &sm) const
Definition: sequence_map_full.hpp:77
id_type max_id() const
Definition: id_bimap.hpp:120
detail::Id_bimap< Mid_id, seq_type > seq_map
Definition: sequence_map_full.hpp:29
id_type const * find(ObjCompat const &obj) const
Definition: id_bimap.hpp:151
Main namespace of vdj_pipe library.
Definition: keywords_variable.hpp:11
Seq_map_full(const bool allow_duplicate_seq=false)
Definition: sequence_map_full.hpp:53
Mid_id const * find_exact(const boost::string_ref seq) const
Definition: sequence_map_full.hpp:73
std::size_t max_len() const
Definition: sequence_map_full.hpp:70
boost::error_info< struct errinfo_str2_, std::string > str2_t
Definition: exception.hpp:26
seq_type seq(const Mid_id id) const
Definition: sequence_map_full.hpp:65
Mid_id const * find_exact(seq_type const &seq) const
Definition: sequence_map_full.hpp:71
boost::error_info< struct errinfo_str1_, std::string > str1_t
Definition: exception.hpp:25
Definition: sequence_map_full.hpp:43
Mid_id insert(std::string name, std::string const &seq)
Definition: sequence_map_full.hpp:94
const std::size_t n
Definition: vector_set_test.cpp:26
detail::Id_map< Mid_id, std::string > name_map
Definition: sequence_map_full.hpp:30
std::size_t min_len_
Definition: sequence_map_full.hpp:132
Nucleotide nucleotide_index(const char c)
Definition: nucleotide_index.hpp:45
Definition: exception.hpp:23
boost::error_info< struct errinfo_str3_, std::string > str3_t
Definition: exception.hpp:27
char to_capital(const Nucleotide n)
Definition: nucleotide_index.hpp:216
Definition: id_iterator.hpp:16
boost::error_info< struct errinfo_message_, std::string > msg_t
Definition: exception.hpp:24
std::string sanitize(const char c)
Definition: sanitize_string.cpp:53
std::size_t size() const
Definition: id_bimap.hpp:115
id_type insert(value_type const &obj)
Definition: id_map.hpp:83
std::vector< Nucleotide > seq_type
Definition: sequence_map_full.hpp:25
bool allow_duplicate_seq_
Definition: sequence_map_full.hpp:131
seq_map seqs_
Definition: sequence_map_full.hpp:129
bool combine(id_type const &id, const score_type score)
Definition: best_match_pair.hpp:49