vdj_pipe
pipeline for processing DNA sequence data
sequence_map_aligned.hpp
Go to the documentation of this file.
1 
7 #ifndef SEQUENCE_MAP_ALIGNED_HPP_
8 #define SEQUENCE_MAP_ALIGNED_HPP_
9 #include <limits>
10 #include <stdint.h>
11 #include <vector>
12 
13 #include "boost/assert.hpp"
14 #include "boost/foreach.hpp"
15 
16 #include "external/ssw_cpp.h"
19 #include "vdj_pipe/exception.hpp"
20 #include "vdj_pipe/gdst/gdst.hpp"
25 
26 namespace vdj_pipe{
27 
34 public:
36  typedef iterator const_iterator;
37  struct Err : public base_exception{};
38 
39  struct Match {
41  : id_(),
42  score_(std::numeric_limits<int>::min()),
43  n_mismatches_(std::numeric_limits<unsigned>::max()),
44  identity_(0.0),
46  cigar_()
47  {}
48 
49  Seq_id id_;
50  int score_;
51  unsigned n_mismatches_;
52  double identity_;
54  std::vector<uint32_t> cigar_;
55  };
56 
58  : st_(),
59  idm_(Seq_id(1))
60  {}
61 
62  iterator begin() const {return iterator(st_.sequence_map().min_id());}
63  iterator end() const {return ++iterator(st_.sequence_map().max_id());}
64  Seq_id max_id() const {return st_.sequence_map().max_id();}
65  std::size_t size() const {return st_.sequence_map().size();}
66 
67  void load_file(std::string const& fasta, const bool reverse) {
68  for(Parser_fasta pf((File_input(fasta))); pf.has_next(); pf.next_record() ) {
69  const Seq_record sr = pf.get_record();
70  insert(sr.id_, sr.seq_, reverse);
71  }
72  }
73 
74  void insert(std::string const& id, std::string seq, const bool reverse) {
75  check(seq);
76  std::pair<Seq_id,bool> p = idm_.insert(id);
77  if( ! p.second ) BOOST_THROW_EXCEPTION(
78  Err()
79  << Err::msg_t("duplicate sequence")
80  << Err::str1_t(sanitize(id))
81  << Err::str2_t(sanitize(seq))
82  );
83  if( reverse ) {
84  st_.sequence_map().insert(p.first) = Seq_entry(complement(seq));
85  } else {
86  st_.sequence_map().insert(p.first) = Seq_entry(seq);
87  }
88 
89  st_.insert(p.first);
90  }
91 
92  Seq_entry const& seq(const Seq_id sid) const {
93  BOOST_ASSERT(st_.sequence_map().find(sid));
94  return st_.sequence_map()[sid];
95  }
96 
97  std::string const& seq_id(const Seq_id sid) const {
98  BOOST_ASSERT(st_.sequence_map().find(sid));
99  return idm_[sid];
100  }
101 
103  const boost::string_ref seq,
104  const std::size_t min_match,
105  const bool track_mismatches
106  ) const {
107  detail::Vector_set<Seq_id> vs = find_overlaping(seq, min_match);
108  Match m;
109  BOOST_FOREACH(const Seq_id sid, vs) {
110  StripedSmithWaterman::Aligner aligner;
111  StripedSmithWaterman::Filter filter;
112  StripedSmithWaterman::Alignment al;
113  Seq_entry const& se = st_.sequence_map()[sid];
114  aligner.Align(seq.data(), se.sequence().data(), se.size(), filter, &al);
115  if(
116  (
117  track_mismatches &&
118  ( m.n_mismatches_ > (unsigned)al.mismatches )
119  ) ||
120  ( ! track_mismatches && ( m.score_ < al.sw_score ) )
121  ) {
122  m.id_ = sid;
123  m.score_ = al.sw_score;
124  m.n_mismatches_ = (unsigned)al.mismatches;
125  m.cigar_ = al.cigar;
126  m.si_ = sequence_interval(al.query_begin, al.query_end);
127  double len =
128  0.5 *
129  (al.ref_end - al.ref_begin + al.query_end - al.query_begin);
130  m.identity_ = (len - al.mismatches) / len;
131  }
132  }
133  return m;
134  }
135 
137  const boost::string_ref seq,
138  const std::size_t min_match
139  ) const {
141  for(
142  Unambiguous_interval_iter uii(seq, min_match);
143  uii.has_subseq();
144  uii.next()
145  ) {
146  st_.find_overlaping(uii.subseq(), vs, min_match);
147  }
148  return vs;
149  }
150 
151 private:
152  suffix_tree st_;
153  id_map idm_;
154 
155  static void check(const boost::string_ref seq) {
156  for(std::size_t n = 0; n != seq.size(); ++n) {
157  if( is_ambiguous(seq[n]) ) BOOST_THROW_EXCEPTION(
158  Err()
159  << Err::msg_t("undefined nucleotide")
160  << Err::str1_t(sanitize(seq, 500))
161  << Err::str2_t(sanitize(seq[n]))
162  << Err::int1_t(n)
163  );
164  }
165  }
166 };
167 
168 
169 }//namespace vdj_pipe
170 #endif /* SEQUENCE_MAP_ALIGNED_HPP_ */
std::pair< id_type, bool > insert(obj_type const &obj)
Definition: id_bimap.hpp:160
Seq_id id_
Definition: sequence_map_aligned.hpp:49
Match best_match(const boost::string_ref seq, const std::size_t min_match, const bool track_mismatches) const
Definition: sequence_map_aligned.hpp:102
sequence_interval si_
Definition: sequence_map_aligned.hpp:53
bool has_next() const
Definition: parser_line.hpp:31
std::string const & seq_id(const Seq_id sid) const
Definition: sequence_map_aligned.hpp:97
iterator begin() const
Definition: sequence_map_aligned.hpp:62
std::vector< uint32_t > cigar_
Definition: sequence_map_aligned.hpp:54
unsigned size() const
Definition: sequence_entry.hpp:29
Definition: sequence_map_aligned.hpp:39
std::size_t size() const
Definition: id_map.hpp:37
iterator const_iterator
Definition: sequence_map_aligned.hpp:36
static void check(const boost::string_ref seq)
Definition: sequence_map_aligned.hpp:155
void find_overlaping(const boost::string_ref seq, detail::Vector_set< Seq_id > &vs, std::size_t min_overlap=0) const
Definition: gdst.cpp:136
STL namespace.
Definition: sequence_map_aligned.hpp:30
detail::Id_bimap< Seq_id, std::string > id_map
Definition: sequence_map_aligned.hpp:33
void load_file(std::string const &fasta, const bool reverse)
Definition: sequence_map_aligned.hpp:67
bool is_ambiguous(const char c)
Definition: nucleotide_index.hpp:106
sequence seq_
Definition: sequence_record.hpp:32
value_type const * find(const id_type id) const
Definition: id_map.hpp:73
unsigned n_mismatches_
Definition: sequence_map_aligned.hpp:51
boost::error_info< struct errinfo_int1_, int > int1_t
Definition: exception.hpp:28
seq_map const & sequence_map() const
Definition: gdst.hpp:102
Main namespace of vdj_pipe library.
Definition: keywords_variable.hpp:11
File target is supposed to exist at construction time.
Definition: file.hpp:93
Seq_map_aligned()
Definition: sequence_map_aligned.hpp:57
Definition: sequence_record.hpp:28
Match()
Definition: sequence_map_aligned.hpp:40
bool has_subseq() const
Definition: interval_iterator.hpp:53
Definition: id_map.hpp:20
id_type min_id() const
Definition: id_map.hpp:41
std::string const & sequence() const
Definition: sequence_entry.hpp:30
sequence_interval sequence_interval_invalid()
Definition: sequence_interval.hpp:44
boost::error_info< struct errinfo_str2_, std::string > str2_t
Definition: exception.hpp:26
boost::error_info< struct errinfo_str1_, std::string > str1_t
Definition: exception.hpp:25
Seq_id max_id() const
Definition: sequence_map_aligned.hpp:64
void insert(std::string const &id, std::string seq, const bool reverse)
Definition: sequence_map_aligned.hpp:74
std::string id_
Definition: sequence_record.hpp:20
Definition: parser_fasta.hpp:16
Nucleotide complement(const Nucleotide n)
Definition: nucleotide_index.hpp:153
const std::size_t n
Definition: vector_set_test.cpp:26
Definition: sequence_map_aligned.hpp:37
iterator end() const
Definition: sequence_map_aligned.hpp:63
suffix_tree st_
Definition: sequence_map_aligned.hpp:152
std::size_t size() const
Definition: sequence_map_aligned.hpp:65
boost::numeric::interval< int, boost::numeric::interval_lib::policies< boost::numeric::interval_lib::rounded_math< int >, detail::Interval_checking_policy< int > > > sequence_interval
Definition: sequence_interval.hpp:40
Definition: exception.hpp:23
detail::Vector_set< Seq_id > find_overlaping(const boost::string_ref seq, const std::size_t min_match) const
Definition: sequence_map_aligned.hpp:136
Definition: id_iterator.hpp:16
id_map idm_
Definition: sequence_map_aligned.hpp:153
boost::error_info< struct errinfo_message_, std::string > msg_t
Definition: exception.hpp:24
std::string sanitize(const char c)
Definition: sanitize_string.cpp:53
Seq_entry const & seq(const Seq_id sid) const
Definition: sequence_map_aligned.hpp:92
id_type max_id() const
Definition: id_map.hpp:42
suffix_tree::seq_map seq_map
Definition: sequence_map_aligned.hpp:32
gdst::Gdst suffix_tree
Definition: sequence_map_aligned.hpp:31
Id_iterator< Seq_id > iterator
Definition: sequence_map_aligned.hpp:35
Generalized DNA suffix tree.
Definition: gdst.hpp:66
double identity_
Definition: sequence_map_aligned.hpp:52
id_type insert(value_type const &obj)
Definition: id_map.hpp:83
Definition: interval_iterator.hpp:16
int score_
Definition: sequence_map_aligned.hpp:50
void insert(const Seq_id sid)
Definition: gdst.cpp:79
Definition: sequence_entry.hpp:19