vdj_pipe
pipeline for processing DNA sequence data
gdst_ukkonen_inserter.hpp
Go to the documentation of this file.
1 
7 #ifndef GDST_UKKONEN_INSERTER_HPP_
8 #define GDST_UKKONEN_INSERTER_HPP_
9 #include "boost/assert.hpp"
11 #include "vdj_pipe/gdst/gdst.hpp"
12 
13 namespace vdj_pipe{ namespace gdst{
14 
21 public:
22  static void insert(Gdst& st, const Seq_id sid) {
23  for( Ukkonen_inserter ui(st, sid); ui.has_next(); ui.next() ) ui.insert();
24  }
25 
26  Ukkonen_inserter(Gdst& st, const Seq_id sid)
27  : st_(st),
28  sid_(sid),
29  suff_(st_[sid].sequence()),
30  active_node_(st_.root()),
32  md_(0),
33  prev_branch_(),
34  prev_leaf_()
35  {}
36 
37  void next() {suff_.remove_prefix(1);}
38  bool has_next() const {return ! suff_.empty();}
39 
40  void insert() {
41  for( ; ; ) {
42  if( prev_branch_ && st_[prev_branch_].n_ == active_length_ + 1 ) {
44  prev_branch_ = Branch_id();
45  }
46 
47  if( ! md_ && active_length_ == suff_.size() ) {
50  if( ! st_[active_node_].sl_ ) prev_leaf_ = active_node_;
51  else prev_leaf_ = Branch_id();
52  break;
53  }
54 
55  BOOST_ASSERT(active_length_ < suff_.size());
56  if( ! md_ ) {
57  if( active_child() ) {
58  md_ = 1;
59  continue;
60  }
61  const Branch_id leaf = leaf_from_branch(active_node_, active_edge());
62  if( prev_leaf_ ) set_leaf_sl(leaf);
63  prev_leaf_ = leaf;
64  prev_branch_ = Branch_id();
65  break;
66  }
67 
68  const unsigned l = edge_length();
69  if( l <= md_ ) {
70  md_ -= l;
72  continue;
73  }
74 
75  const Nucleotide n2 = letter(active_length_ + md_);
76  if( active_length_ + md_ == suff_.size() ) {
77  const Branch_id leaf = split_active_edge(active_length_ + md_, n2);
78  st_.bm_[leaf].sb_ = leaf;
79  add_to_leaf(leaf);
80  if( prev_leaf_ ) set_leaf_sl(leaf);
81  prev_leaf_ = leaf;
82  prev_branch_ = Branch_id();
83  break;
84  }
85 
87  if( n1 == n2 ) {
88  ++md_;
89  continue;
90  }
91 
92  const Branch_id branch = split_active_edge(active_length_ + md_, n2);
93  const Branch_id leaf = leaf_from_branch(branch, n1);
94  st_.bm_[branch].sb_ = leaf;
95  if( prev_branch_ ) set_branch_sl(branch);
96  if( prev_leaf_ ) set_leaf_sl(leaf);
97  if( st_[branch].n_ == 1) {
98  st_.bm_[branch].sl_ = st_.root();
99  prev_branch_ = Branch_id();
100  }
101  else prev_branch_ = branch;
102  if( st_[leaf].n_ == 1) {
103  st_.bm_[leaf].sl_ = st_.root();
104  prev_leaf_ = Branch_id();
105  }
106  else prev_leaf_ = leaf;
107  break;
108  }
109 
110  if( active_node_ == st_.root() ) {
111  if( md_ ) --md_;
112  } else {
113  if( const Branch_id bid = st_[active_node_].sl_ ) set_active_node(bid);
114  else set_active_node(st_.root());
115  }
116  }
117 
118 private:
120  const Seq_id sid_;
121  boost::string_ref suff_;
122  Branch_id active_node_;
123  unsigned active_length_;
124  unsigned md_;
126 
128  BOOST_ASSERT(active_length_ < suff_.size());
129  return nucleotide_index(suff_[active_length_]);
130  }
131 
132  Branch_id active_child() const {
133  return st_.child(active_node_, active_edge());
134  }
135 
136  void active_child(const Branch_id bid) {
137  BOOST_ASSERT(active_length_ < suff_.size());
138  const Nucleotide n = nucleotide_index(suff_[active_length_]);
139  st_.child(active_node_, n, bid);
140  }
141 
142  void set_branch_sl(const Branch_id bid) {
143  BOOST_ASSERT(bid);
144  BOOST_ASSERT(prev_branch_);
145  BOOST_ASSERT( ! st_[prev_branch_].sl_);
146  BOOST_ASSERT( st_[prev_branch_].n_ == st_[bid].n_ + 1);
147  st_.bm_[prev_branch_].sl_ = bid;
148  }
149 
150  void set_leaf_sl(const Branch_id bid) {
151  BOOST_ASSERT(bid);
152  BOOST_ASSERT( ! st_[prev_leaf_].sl_);
153  BOOST_ASSERT( st_[prev_leaf_].n_ == st_[bid].n_ + 1);
154  st_.bm_[prev_leaf_].sl_ = bid;
155  }
156 
157  void add_to_leaf(const Branch_id bid) {
158  Branch& b = st_.bm_[bid];
159  if( ! b.leaf_ ) {
160  b.leaf_ = st_.lm_.insert(Leaf());
161  }
162  st_.lm_[b.leaf_].insert(sid_);
163  }
164 
165  Branch_id leaf_from_branch(
166  const Branch_id bid,
167  const Nucleotide n1
168  ) {
169  const Branch_id bid2 = st_.bm_.insert(Branch(suff_.size()));
170  add_to_leaf(bid2);
171  st_.bm_[bid2].sb_ = bid2; //its own suffix branch
172  st_.child(bid, n1, bid2);
173  return bid2;
174  }
175 
176  void set_active_node(const Branch_id bid) {
177  active_node_ = bid;
178  active_length_ = st_[active_node_].n_;
179  }
180 
181  Branch_id split_active_edge(
182  const unsigned i,
183  const Nucleotide n2
184  ) {
185  BOOST_ASSERT(active_length_ < i && "new child index should be greater");
186  const Branch_id bid2 = st_.bm_.insert(Branch(i));
187  const Branch_id bid3 = active_child();
188  BOOST_ASSERT(i < st_[bid3].n_ && "old child index should be greater");
189  active_child(bid2);
190  st_.child(bid2, n2, bid3);
191  return bid2;
192  }
193 
194  unsigned edge_length() const {
195  const Branch_id c = active_child();
196  BOOST_ASSERT(c);
197  BOOST_ASSERT(st_[active_node_].n_ < st_[c].n_);
198  return st_[c].n_ - st_[active_node_].n_;
199  }
200 
201  Nucleotide letter(const unsigned i) const {
202  BOOST_ASSERT(active_node_);
203  BOOST_ASSERT(active_length_ < i);
204  const Branch_id c = active_child();
205  BOOST_ASSERT(c);
206  BOOST_ASSERT(i < st_[c].n_);
207  return st_.letter(c, i);
208  }
209 };
210 
211 }//namespace gdst
212 }//namespace vdj_pipe
213 #endif /* GDST_UKKONEN_INSERTER_HPP_ */
Branch_id prev_leaf_
Definition: gdst_ukkonen_inserter.hpp:125
unsigned edge_length() const
Definition: gdst_ukkonen_inserter.hpp:194
static void insert(Gdst &st, const Seq_id sid)
Definition: gdst_ukkonen_inserter.hpp:22
void add_to_leaf(const Branch_id bid)
Definition: gdst_ukkonen_inserter.hpp:157
Branch_id active_child() const
Definition: gdst_ukkonen_inserter.hpp:132
Nucleotide
Definition: nucleotide_index.hpp:24
Branch_id prev_branch_
Definition: gdst_ukkonen_inserter.hpp:125
boost::string_ref suff_
Definition: gdst_ukkonen_inserter.hpp:121
Branch_id child(const Branch_id bid1, const Nucleotide n) const
Definition: gdst.hpp:176
Nucleotide active_edge() const
Definition: gdst_ukkonen_inserter.hpp:127
leaf_map lm_
Definition: gdst.hpp:250
unsigned active_length_
Definition: gdst_ukkonen_inserter.hpp:123
Leaf_id leaf_
Definition: nodes.hpp:67
void set_active_node(const Branch_id bid)
Definition: gdst_ukkonen_inserter.hpp:176
Definition: nodes.hpp:45
Main namespace of vdj_pipe library.
Definition: keywords_variable.hpp:11
unsigned md_
Definition: gdst_ukkonen_inserter.hpp:124
According to: http://www.cise.ufl.edu/~sahni/dsaaj/enrich/c16/suffix.htm http://stackoverflow.com/questions/9452701.
Definition: gdst_ukkonen_inserter.hpp:20
void insert()
Definition: gdst_ukkonen_inserter.hpp:40
void set_branch_sl(const Branch_id bid)
Definition: gdst_ukkonen_inserter.hpp:142
Branch_id split_active_edge(const unsigned i, const Nucleotide n2)
Definition: gdst_ukkonen_inserter.hpp:181
Nucleotide letter(const unsigned i) const
Definition: gdst_ukkonen_inserter.hpp:201
branch_map bm_
Definition: gdst.hpp:248
bool has_next() const
Definition: gdst_ukkonen_inserter.hpp:38
void active_child(const Branch_id bid)
Definition: gdst_ukkonen_inserter.hpp:136
Branch_id root() const
Definition: gdst.hpp:101
Branch_id active_node_
Definition: gdst_ukkonen_inserter.hpp:122
const std::size_t n
Definition: vector_set_test.cpp:26
Ukkonen_inserter(Gdst &st, const Seq_id sid)
Definition: gdst_ukkonen_inserter.hpp:26
const Seq_id sid_
Definition: gdst_ukkonen_inserter.hpp:120
Nucleotide nucleotide_index(const char c)
Definition: nucleotide_index.hpp:45
void next()
Definition: gdst_ukkonen_inserter.hpp:37
Gdst & st_
Definition: gdst_ukkonen_inserter.hpp:119
Generalized DNA suffix tree.
Definition: gdst.hpp:66
void set_leaf_sl(const Branch_id bid)
Definition: gdst_ukkonen_inserter.hpp:150
Branch_id leaf_from_branch(const Branch_id bid, const Nucleotide n1)
Definition: gdst_ukkonen_inserter.hpp:165
id_type insert(value_type const &obj)
Definition: id_map.hpp:83
Store suffixes Each leaf is attached to a branch and stores suffixes that have the length of the bran...
Definition: nodes.hpp:22
Nucleotide letter(const Branch_id bid, const unsigned i) const
Definition: gdst.hpp:153