vdj_pipe
pipeline for processing DNA sequence data
sequence_map_short.hpp
Go to the documentation of this file.
1 
7 #ifndef SEQUENCE_MAP_SHORT_HPP_
8 #define SEQUENCE_MAP_SHORT_HPP_
9 #include "boost/utility/string_ref.hpp"
10 #include "boost/foreach.hpp"
11 
17 #include "vdj_pipe/exception.hpp"
18 #include "vdj_pipe/object_ids.hpp"
20 
21 namespace vdj_pipe{
22 
26 public:
29 
30 private:
33 
34 public:
36  typedef iterator const_iterator;
37  struct Err : public base_exception{};
38 
39  explicit Seq_map_short(
40  const bool allow_duplicate_seq
41  )
42  : mid_size_(0),
43  seqs_(Mid_id(1)),
44  names_(Mid_id(1)),
45  allow_duplicate_seq_(allow_duplicate_seq)
46  {}
47 
48  iterator begin() const {return iterator(seqs_.min_id());}
49  iterator end() const {return ++iterator(seqs_.max_id());}
50  seq_type seq(const Mid_id id) const {return seqs_[id];}
51  std::string seq_string(const Mid_id id) const {return seqs_[id].to_string(mid_size_);}
52  std::string const& name(const Mid_id id) const {return names_[id];}
53  std::size_t seq_size() const {return mid_size_;}
54  Mid_id max_id() const {return seqs_.max_id();}
55  Mid_id const* find_exact(const seq_type seq) const {return seqs_.find(seq);}
56 
57  Mid_id const* find_exact(const boost::string_ref seq) const {
58  seq_type seq1;
59  if( assign(seq1, seq) ) return find_exact(seq1);
60  return 0;
61  }
62 
63  template<class Seq> bool find_closest(
64  const Seq s,
65  match_type& m,
66  scoring_matrix_t const& sm
67  ) const {
68  if( Mid_id const* id = find_exact(s) ) {
69  return m.combine(*id, identity(Adenine, Adenine, sm) * seq_size());
70  }
71 
72  bool found = false;
73  BOOST_FOREACH(const Mid_id id, *this) {
74  const int n = identity(seq(id), s, sm, seq_size());
75  if( m.combine(id, n) ) found = true;
76  }
77  return found;
78  }
79 
/**
 * Store a match element sequence under a name and return its ID.
 *
 * @param name label for the sequence; when empty, the sequence text is used
 * @param seq  sequence text; the first accepted sequence fixes the length
 *             that every later sequence must have
 * @return ID of the newly stored sequence, or the ID of the existing entry
 *         when duplicates are allowed and the sequence is already stored
 *         under the same name
 * @throw Err if the first sequence is longer than seq_type::length(), if seq
 *        is empty or its length differs from the established one, or if the
 *        sequence is already stored and either duplicates are not allowed or
 *        the stored name differs
 *
 * NOTE(review): the embedded listing numbers skip 87 and 97 inside the two
 * throw expressions below -- a line of each may be missing from this copy;
 * verify against the upstream header.
 */
80  Mid_id insert(std::string name, std::string const& seq) {
81  if( name.empty() ) name = seq;
// no length established yet: this sequence defines it, provided it fits
82  if( ! mid_size_ ) {
83  if( seq_type::length() < seq.size() ) BOOST_THROW_EXCEPTION(
84  Err()
85  << Err::msg_t("match element sequence too long")
86  << Err::str1_t(sanitize(seq))
88  );
89  mid_size_ = seq.size();
90  }
91 
// reject empty sequences and any length other than the established one
92  if( ! seq.size() || seq.size() != mid_size_ ) BOOST_THROW_EXCEPTION(
93  Err()
94  << Err::msg_t("wrong match element sequence length")
95  << Err::str1_t(sanitize(seq))
96  << Err::str2_t(sanitize(name))
98  << Err::int2_t(seq.size())
99  );
100  BOOST_ASSERT(seqs_.size() == names_.size());
101  const seq_type s(seq);
// sequence already stored: return the existing ID or reject the insert
102  if( Mid_id const* id = seqs_.find(s) ) {
103  if( allow_duplicate_seq_ ) {
// a duplicate is only tolerated when it carries the stored name
104  if( names_[*id] != name ) BOOST_THROW_EXCEPTION(
105  Err()
106  << Err::msg_t("same sequence, different name")
107  << Err::str1_t(sanitize(seq))
108  << Err::str2_t(sanitize(names_[*id]))
109  << Err::str3_t(sanitize(name))
110  );
111  return *id;
112  }
113  BOOST_THROW_EXCEPTION(
114  Err()
115  << Err::msg_t("duplicate match element sequence")
116  << Err::str1_t(sanitize(seq))
117  << Err::str2_t(sanitize(name))
118  );
119  }
120 
// new sequence: add it to both maps, which are expected to hand out the same ID
121  const std::pair<Mid_id,bool> p1 = seqs_.insert(s);
122  const Mid_id id = names_.insert(name);
// id is otherwise only read by the assertions below; presumably this
// silences an unused-variable warning when BOOST_ASSERT compiles to nothing
123  unused_variable(id);
124  BOOST_ASSERT(id);
125  BOOST_ASSERT(p1.first == id);
126  return p1.first;
127  }
128 
129 private:
130  std::size_t mid_size_;
131  seq_map seqs_;
132  name_map names_;
134 };
135 
136 }//namespace vdj_pipe
137 #endif /* SEQUENCE_MAP_SHORT_HPP_ */
store short sequence in an integer
Definition: sequence_fls.hpp:24
std::string const & name(const Mid_id id) const
Definition: sequence_map_short.hpp:52
std::pair< id_type, bool > insert(obj_type const &obj)
Definition: id_bimap.hpp:160
seq_type seq(const Mid_id id) const
Definition: sequence_map_short.hpp:50
int identity(Seq const &s1, const boost::string_ref s2, scoring_matrix_t const &sm, const std::size_t=0)
Definition: sequence_fls.hpp:144
std::size_t seq_size() const
Definition: sequence_map_short.hpp:53
id_type min_id() const
Definition: id_bimap.hpp:119
Id_iterator< Mid_id > iterator
Definition: sequence_map_short.hpp:35
name_map names_
Definition: sequence_map_short.hpp:132
detail::Id_bimap< Mid_id, seq_type > seq_map
Definition: sequence_map_short.hpp:31
static std::size_t length()
Definition: sequence_fls.hpp:31
Definition: best_match_pair.hpp:16
Definition: sequence_map_short.hpp:37
std::size_t size() const
Definition: id_map.hpp:37
Definition: find_shared.hpp:22
Mid_id const * find_exact(const boost::string_ref seq) const
Definition: sequence_map_short.hpp:57
void unused_variable(T const &)
Definition: unused_variable.hpp:14
std::size_t mid_size_
Definition: sequence_map_short.hpp:130
Seq_map_short(const bool allow_duplicate_seq)
Definition: sequence_map_short.hpp:39
Seq_fls< 4, boost::uint_fast32_t > seq_type
Definition: sequence_map_short.hpp:27
std::string seq_string(const Mid_id id) const
Definition: sequence_map_short.hpp:51
boost::error_info< struct errinfo_int1_, int > int1_t
Definition: exception.hpp:28
iterator begin() const
Definition: sequence_map_short.hpp:48
id_type max_id() const
Definition: id_bimap.hpp:120
seq_map seqs_
Definition: sequence_map_short.hpp:131
id_type const * find(ObjCompat const &obj) const
Definition: id_bimap.hpp:151
Main namespace of vdj_pipe library.
Definition: keywords_variable.hpp:11
const std::string seq1
Definition: match_step_run.cpp:43
bool allow_duplicate_seq_
Definition: sequence_map_short.hpp:133
boost::error_info< struct errinfo_str2_, std::string > str2_t
Definition: exception.hpp:26
boost::error_info< struct errinfo_int2_, int > int2_t
Definition: exception.hpp:29
Mid_id insert(std::string name, std::string const &seq)
Definition: sequence_map_short.hpp:80
Mid_id max_id() const
Definition: sequence_map_short.hpp:54
iterator end() const
Definition: sequence_map_short.hpp:49
Definition: nucleotide_index.hpp:25
boost::error_info< struct errinfo_str1_, std::string > str1_t
Definition: exception.hpp:25
const std::size_t n
Definition: vector_set_test.cpp:26
Definition: exception.hpp:23
boost::error_info< struct errinfo_str3_, std::string > str3_t
Definition: exception.hpp:27
Definition: id_iterator.hpp:16
boost::error_info< struct errinfo_message_, std::string > msg_t
Definition: exception.hpp:24
std::string sanitize(const char c)
Definition: sanitize_string.cpp:53
Best_match_pair< Mid_id, int > match_type
Definition: sequence_map_short.hpp:28
std::size_t size() const
Definition: id_bimap.hpp:115
Definition: sequence_map_short.hpp:25
Mid_id const * find_exact(const seq_type seq) const
Definition: sequence_map_short.hpp:55
id_type insert(value_type const &obj)
Definition: id_map.hpp:83
detail::Id_map< Mid_id, std::string > name_map
Definition: sequence_map_short.hpp:32
iterator const_iterator
Definition: sequence_map_short.hpp:36
bool combine(id_type const &id, const score_type score)
Definition: best_match_pair.hpp:49
bool find_closest(const Seq s, match_type &m, scoring_matrix_t const &sm) const
Definition: sequence_map_short.hpp:63