9 #include "boost/assert.hpp" 10 #include "boost/foreach.hpp" 11 #include "boost/shared_ptr.hpp" 28 class Common_substrings;
29 class Ukkonen_inserter;
35 const Branch_id bid = Branch_id(),
36 const unsigned min_d = 0,
50 void next(
const boost::string_ref patt,
Gdst const& st);
51 bool check(
const boost::string_ref patt,
Gdst const& st);
70 typedef boost::string_ref seq_type;
72 friend class Common_substrings;
73 friend class Ukkonen_inserter;
79 typedef boost::shared_ptr<seq_map> seq_map_ptr;
81 typedef std::vector<match_type> match_vector;
84 : ss_(
new seq_map(Seq_id(1))),
88 root_(bm_.insert(
Branch(0)))
91 explicit Gdst(seq_map_ptr ss)
96 root_(bm_.insert(
Branch(0)))
99 std::size_t size()
const {
return bm_.size();}
101 Branch_id root()
const {
return root_;}
102 seq_map
const& sequence_map()
const {
return *ss_;}
103 seq_map& sequence_map() {
return *ss_;}
105 Branch const& operator[](
const Branch_id bid)
const {
110 Leaf const& operator[](
const Leaf_id lid)
const {
115 Seq_entry const& operator[](
const Seq_id sid)
const {
119 Leaf const& leaf(
const Branch_id bid)
const {
122 if(
const Leaf_id lid = bm_[bid].leaf_ )
return lm_[lid];
126 const seq_type suffix(
const Seq_id sid,
const unsigned len)
const {
128 BOOST_ASSERT((*ss_)[sid].size());
129 BOOST_ASSERT((*ss_)[sid].size() >= len);
130 const seq_type seq = (*ss_)[sid].sequence();
131 return seq.substr(seq.length() - len);
134 const seq_type suffix(
const Leaf_id lid,
const unsigned len)
const {
136 BOOST_ASSERT(lm_[lid].size());
137 return suffix(lm_[lid].front(), len);
140 const seq_type suffix(
Branch const& b)
const {
143 Branch const& suffix_branch = bm_[b.
sb_];
144 BOOST_ASSERT(suffix_branch.
leaf_);
145 return suffix(suffix_branch.
leaf_, suffix_branch.
n_);
148 const seq_type suffix(
const Branch_id bid)
const {
150 return suffix(bm_[bid]);
157 BOOST_ASSERT(i < bm_[bid].n_);
158 const seq_type seq = suffix(bid);
159 return nucleotide_index(seq[i]);
168 BOOST_ASSERT(bm_[bid].n_ < i);
169 const Branch_id c = child(bid, n);
171 BOOST_ASSERT(i < bm_[c].n_);
172 const seq_type seq = suffix(c);
173 return nucleotide_index(seq[i]);
176 Branch_id child(
const Branch_id bid1,
const Nucleotide n)
const {
178 if( n >= 4 ) BOOST_THROW_EXCEPTION(
180 << Err::msg_t(
"invalid nucleotide (only ACGT supported)")
181 << Err::str1_t(sanitize(to_capital(n)))
183 Branch const& b1 = bm_[bid1];
184 return b1.c_ ? cm_[b1.c_][n] : Branch_id();
187 Branch_id child(
const Branch_id bid1,
const seq_type seq)
const {
189 Branch const& b1 = bm_[bid1];
190 if( ! b1.c_ )
return Branch_id();
191 BOOST_ASSERT(seq.length() > b1.
n_);
193 return cm_[b1.c_][n];
197 const Branch_id bid1,
203 BOOST_ASSERT(bm_[bid1].n_ < bm_[bid2].n_);
205 if( ! b1.c_ ) b1.c_ = cm_.insert(
Children());
206 cm_[b1.c_][n] = bid2;
217 std::size_t min_len = 0
226 void find_overlaping(
227 const boost::string_ref seq,
229 std::size_t min_overlap = 0
240 for( ; m.check(seq, *
this); m.next(seq, *
this) );
244 void insert(
const Seq_id sid);
253 void collect_sequences(
254 const gdst::Branch_id nid,
259 vs.insert(l.begin(), l.end());
263 void suffix_link(
const Branch_id bid1,
const Branch_id bid2) {
266 BOOST_ASSERT( ! bm_[bid1].sl_);
267 BOOST_ASSERT( bm_[bid1].n_ == bm_[bid2].n_ + 1);
268 bm_[bid1].sl_ = bid2;
271 Branch_id suffix_link(
const Branch_id bid)
const {
276 unsigned edge_length(
const Branch_id bid,
const Nucleotide n)
const {
277 const Branch_id c = child(bid, n);
279 BOOST_ASSERT(bm_[bid].n_ < bm_[c].n_);
280 return bm_[c].n_ - bm_[bid].n_;
283 unsigned edge_length(
const Branch_id bid,
const seq_type suff)
const {
285 BOOST_ASSERT(bm_[bid].n_ < suff.size());
286 return edge_length(bid, nucleotide_index(suff[bm_[bid].n_]));
293 const boost::string_ref seq
297 BOOST_ASSERT(aei < seq.size());
298 const Nucleotide ae = nucleotide_index(seq[aei]);
299 const Branch_id c = child(an, ae);
301 BOOST_ASSERT(bm_[an].n_ < bm_[c].n_);
302 const unsigned l = bm_[c].n_ - bm_[an].n_;
310 void add_to_leaf(
const Branch_id bid,
const Seq_id sid) {
315 lm_[b.
leaf_].insert(sid);
318 Branch_id leaf_from_branch(
319 const Branch_id bid1,
324 const Branch_id bid2 = bm_.insert(
Branch(i));
325 add_to_leaf(bid2, sid);
326 bm_[bid2].sb_ = bid2;
327 child(bid1, n1, bid2);
331 Branch_id split_edge(
332 const Branch_id bid1,
337 BOOST_ASSERT(bm_[bid1].n_ < i &&
"new child index should be greater");
338 const Branch_id bid2 = bm_.insert(
Branch(i));
339 const Branch_id bid3 = child(bid1, n1);
340 BOOST_ASSERT(i < bm_[bid3].n_ &&
"old child index should be greater");
341 child(bid1, n1, bid2);
342 child(bid2, n2, bid3);
Common subsequence.
Definition: common_subsequence.hpp:18
bool mismatch_
Definition: gdst.hpp:60
Nucleotide
Definition: nucleotide_index.hpp:24
Branch_id curr_
Definition: gdst.hpp:53
Leaf_id leaf_
Definition: nodes.hpp:67
unsigned n_
Definition: nodes.hpp:57
Main namespace of vdj_pipe library.
Definition: sequence_file.hpp:14
Branch_id sb_
Definition: nodes.hpp:64
Nucleotide en_
Definition: gdst.hpp:61
Definition: gdst_stats.hpp:17
bool at_pattern_end_
Definition: gdst.hpp:56
Definition: depth_first_iterator.hpp:20
Definition: sequence_position.hpp:15
bool at_tree_end_
Definition: gdst.hpp:58
Nucleotide pn_
Definition: gdst.hpp:57
unsigned min_d_
Definition: gdst.hpp:54
Definition: exception.hpp:23
unsigned i_
Definition: gdst.hpp:55
Generalized DNA suffix tree.
Definition: gdst.hpp:66
Store suffixes Each leaf is attached to a branch and stores suffixes that have the length of the bran...
Definition: nodes.hpp:22
Results of matching pattern to suffix tree.
Definition: gdst.hpp:33
Definition: sequence_entry.hpp:19