vdj_pipe
pipeline for processing DNA sequence data
gdst.hpp
Go to the documentation of this file.
1 
7 #ifndef GDST_HPP_
8 #define GDST_HPP_
9 #include "boost/assert.hpp"
10 #include "boost/foreach.hpp"
11 #include "boost/shared_ptr.hpp"
12 
13 #include "vdj_pipe/config.hpp"
17 #include "vdj_pipe/exception.hpp"
20 #include "vdj_pipe/gdst/nodes.hpp"
23 #include "vdj_pipe/object_ids.hpp"
25 
26 namespace vdj_pipe{ namespace gdst{
27 class Gdst_stats;
28 class Common_substrings;
29 class Ukkonen_inserter;
30 
33 struct VDJ_PIPE_DECL Match {
34  explicit Match(
35  const Branch_id bid = Branch_id(),
36  const unsigned min_d = 0,
37  const unsigned i = 0
38  )
39  :curr_(bid),
40  min_d_(min_d),
41  i_(i),
42  at_pattern_end_(),
43  pn_(),
44  at_tree_end_(),
45  next_(),
46  mismatch_(),
47  en_()
48  {}
49 
50  void next(const boost::string_ref patt, Gdst const& st);
51  bool check(const boost::string_ref patt, Gdst const& st);
52 
53  Branch_id curr_;
54  unsigned min_d_;
55  unsigned i_;
58  bool at_tree_end_;
59  Branch_id next_;
60  bool mismatch_;
62 };
63 
66 class VDJ_PIPE_DECL Gdst {
70  typedef boost::string_ref seq_type;
71  friend class Gdst_stats;
72  friend class Common_substrings;
73  friend class Ukkonen_inserter;
74 
75 public:
76  struct Err : public base_exception{};
77 
79  typedef boost::shared_ptr<seq_map> seq_map_ptr;
81  typedef std::vector<match_type> match_vector;
82 
83  Gdst()
84  : ss_(new seq_map(Seq_id(1))),
85  bm_(Branch_id(1)),
86  cm_(Children_id(1)),
87  lm_(Leaf_id(1)),
88  root_(bm_.insert(Branch(0)))
89  {}
90 
91  explicit Gdst(seq_map_ptr ss)
92  : ss_(ss),
93  bm_(Branch_id(1)),
94  cm_(Children_id(1)),
95  lm_(Leaf_id(1)),
96  root_(bm_.insert(Branch(0)))
97  {}
98 
99  std::size_t size() const {return bm_.size();}
100  Depth_iter depth_first() const {return Depth_iter(*this, root_);}
101  Branch_id root() const {return root_;}
102  seq_map const& sequence_map() const {return *ss_;}
103  seq_map& sequence_map() {return *ss_;}
104 
105  Branch const& operator[](const Branch_id bid) const {
106  BOOST_ASSERT(bid);
107  return bm_[bid];
108  }
109 
110  Leaf const& operator[](const Leaf_id lid) const {
111  BOOST_ASSERT(lid);
112  return lm_[lid];
113  }
114 
115  Seq_entry const& operator[](const Seq_id sid) const {
116  return (*ss_)[sid];
117  }
118 
119  Leaf const& leaf(const Branch_id bid) const {
120  BOOST_ASSERT(bid);
121  static const Leaf empty = Leaf();
122  if( const Leaf_id lid = bm_[bid].leaf_ ) return lm_[lid];
123  return empty;
124  }
125 
126  const seq_type suffix(const Seq_id sid, const unsigned len) const {
127  BOOST_ASSERT(sid);
128  BOOST_ASSERT((*ss_)[sid].size());
129  BOOST_ASSERT((*ss_)[sid].size() >= len);
130  const seq_type seq = (*ss_)[sid].sequence();
131  return seq.substr(seq.length() - len);
132  }
133 
134  const seq_type suffix(const Leaf_id lid, const unsigned len) const {
135  BOOST_ASSERT(lid);
136  BOOST_ASSERT(lm_[lid].size());
137  return suffix(lm_[lid].front(), len);
138  }
139 
140  const seq_type suffix(Branch const& b) const {
141  if( b.leaf_ ) return suffix(b.leaf_, b.n_);
142  BOOST_ASSERT(b.sb_);
143  Branch const& suffix_branch = bm_[b.sb_];
144  BOOST_ASSERT(suffix_branch.leaf_);
145  return suffix(suffix_branch.leaf_, suffix_branch.n_);
146  }
147 
148  const seq_type suffix(const Branch_id bid) const {
149  BOOST_ASSERT(bid);
150  return suffix(bm_[bid]);
151  }
152 
153  Nucleotide letter(
154  const Branch_id bid,
155  const unsigned i
156  ) const {
157  BOOST_ASSERT(i < bm_[bid].n_);
158  const seq_type seq = suffix(bid);
159  return nucleotide_index(seq[i]);
160  }
161 
162  Nucleotide letter(
163  const Branch_id bid,
164  const Nucleotide n,
165  const unsigned i
166  ) const {
167  BOOST_ASSERT(bid);
168  BOOST_ASSERT(bm_[bid].n_ < i);
169  const Branch_id c = child(bid, n);
170  BOOST_ASSERT(c);
171  BOOST_ASSERT(i < bm_[c].n_);
172  const seq_type seq = suffix(c);
173  return nucleotide_index(seq[i]);
174  }
175 
176  Branch_id child(const Branch_id bid1, const Nucleotide n) const {
177  BOOST_ASSERT(bid1);
178  if( n >= 4 ) BOOST_THROW_EXCEPTION(
179  Err()
180  << Err::msg_t("invalid nucleotide (only ACGT supported)")
181  << Err::str1_t(sanitize(to_capital(n)))
182  );
183  Branch const& b1 = bm_[bid1];
184  return b1.c_ ? cm_[b1.c_][n] : Branch_id();
185  }
186 
187  Branch_id child(const Branch_id bid1, const seq_type seq) const {
188  BOOST_ASSERT(bid1);
189  Branch const& b1 = bm_[bid1];
190  if( ! b1.c_ ) return Branch_id();
191  BOOST_ASSERT(seq.length() > b1.n_);
192  const Nucleotide n = nucleotide_index(seq[b1.n_]);
193  return cm_[b1.c_][n];
194  }
195 
196  void child(
197  const Branch_id bid1,
198  const Nucleotide n,
199  const Branch_id bid2
200  ) {
201  BOOST_ASSERT(bid1);
202  BOOST_ASSERT(bid2);
203  BOOST_ASSERT(bm_[bid1].n_ < bm_[bid2].n_);
204  Branch& b1 = bm_[bid1];
205  if( ! b1.c_ ) b1.c_ = cm_.insert(Children());
206  cm_[b1.c_][n] = bid2;
207  }
208 
215  Common_subseq find_longest(
216  const seq_type seq,
217  std::size_t min_len = 0
218  ) const;
219 
226  void find_overlaping(
227  const boost::string_ref seq,
229  std::size_t min_overlap = 0
230  ) const;
231 
234  Match find(
235  const seq_type seq,
236  const Branch_id bid,
237  unsigned min_d = 0
238  ) const {
239  Match m(bid, min_d);
240  for( ; m.check(seq, *this); m.next(seq, *this) );
241  return m;
242  }
243 
244  void insert(const Seq_id sid);
245 
246 private:
247  seq_map_ptr ss_;
248  branch_map bm_;
249  children_map cm_;
250  leaf_map lm_;
251  Branch_id root_;
252 
253  void collect_sequences(
254  const gdst::Branch_id nid,
256  ) const {
257  for( gdst::Depth_iter di(*this, nid); ! di.at_end(); di.next() ) {
258  gdst::Leaf const& l = leaf(di.id());
259  vs.insert(l.begin(), l.end());
260  }
261  }
262 
263  void suffix_link(const Branch_id bid1, const Branch_id bid2) {
264  BOOST_ASSERT(bid1);
265  BOOST_ASSERT(bid2);
266  BOOST_ASSERT( ! bm_[bid1].sl_);
267  BOOST_ASSERT( bm_[bid1].n_ == bm_[bid2].n_ + 1);
268  bm_[bid1].sl_ = bid2;
269  }
270 
271  Branch_id suffix_link(const Branch_id bid) const {
272  BOOST_ASSERT(bid);
273  return bm_[bid].sl_;
274  }
275 
276  unsigned edge_length(const Branch_id bid, const Nucleotide n) const {
277  const Branch_id c = child(bid, n);
278  BOOST_ASSERT(c);
279  BOOST_ASSERT(bm_[bid].n_ < bm_[c].n_);
280  return bm_[c].n_ - bm_[bid].n_;
281  }
282 
283  unsigned edge_length(const Branch_id bid, const seq_type suff) const {
284  BOOST_ASSERT(bid);
285  BOOST_ASSERT(bm_[bid].n_ < suff.size());
286  return edge_length(bid, nucleotide_index(suff[bm_[bid].n_]));
287  }
288 
289  void check_edge(
290  Branch_id& an,
291  unsigned& aei,
292  unsigned& al,
293  const boost::string_ref seq
294  ) const {
295  BOOST_ASSERT(an);
296  for( ; ; ) {
297  BOOST_ASSERT(aei < seq.size());
298  const Nucleotide ae = nucleotide_index(seq[aei]);
299  const Branch_id c = child(an, ae);
300  BOOST_ASSERT(c);
301  BOOST_ASSERT(bm_[an].n_ < bm_[c].n_);
302  const unsigned l = bm_[c].n_ - bm_[an].n_;
303  if( al < l ) return;
304  an = c;
305  aei += l;
306  al -= l;
307  }
308  }
309 
310  void add_to_leaf(const Branch_id bid, const Seq_id sid) {
311  Branch& b = bm_[bid];
312  if( ! b.leaf_ ) {
313  b.leaf_ = lm_.insert(Leaf());
314  }
315  lm_[b.leaf_].insert(sid);
316  }
317 
318  Branch_id leaf_from_branch(
319  const Branch_id bid1,
320  const Nucleotide n1,
321  const unsigned i,
322  const Seq_id sid
323  ) {
324  const Branch_id bid2 = bm_.insert(Branch(i));
325  add_to_leaf(bid2, sid);
326  bm_[bid2].sb_ = bid2; //its own suffix branch
327  child(bid1, n1, bid2);
328  return bid2;
329  }
330 
331  Branch_id split_edge(
332  const Branch_id bid1,
333  const Nucleotide n1,
334  const unsigned i,
335  const Nucleotide n2
336  ) {
337  BOOST_ASSERT(bm_[bid1].n_ < i && "new child index should be greater");
338  const Branch_id bid2 = bm_.insert(Branch(i));
339  const Branch_id bid3 = child(bid1, n1);
340  BOOST_ASSERT(i < bm_[bid3].n_ && "old child index should be greater");
341  child(bid1, n1, bid2);
342  child(bid2, n2, bid3);
343  return bid2;
344  }
345 };
346 
347 }//namespace gdst
348 }//namespace vdj_pipe
349 #endif /* GDST_HPP_ */
Definition: nodes.hpp:26
Common subsequence.
Definition: common_subsequence.hpp:18
bool mismatch_
Definition: gdst.hpp:60
Nucleotide
Definition: nucleotide_index.hpp:24
Branch_id curr_
Definition: gdst.hpp:53
Leaf_id leaf_
Definition: nodes.hpp:67
Definition: nodes.hpp:45
unsigned n_
Definition: nodes.hpp:57
Main namespace of vdj_pipe library.
Definition: sequence_file.hpp:14
Definition: gdst.hpp:76
Branch_id sb_
Definition: nodes.hpp:64
Nucleotide en_
Definition: gdst.hpp:61
Definition: gdst_stats.hpp:17
bool at_pattern_end_
Definition: gdst.hpp:56
Definition: depth_first_iterator.hpp:20
Definition: sequence_position.hpp:15
bool at_tree_end_
Definition: gdst.hpp:58
Nucleotide pn_
Definition: gdst.hpp:57
unsigned min_d_
Definition: gdst.hpp:54
Definition: exception.hpp:23
unsigned i_
Definition: gdst.hpp:55
Generalized DNA suffix tree.
Definition: gdst.hpp:66
Store suffixes Each leaf is attached to a branch and stores suffixes that have the length of the bran...
Definition: nodes.hpp:22
Results of matching pattern to suffix tree.
Definition: gdst.hpp:33
Definition: sequence_entry.hpp:19