9 #include "boost/assert.hpp" 10 #include "boost/foreach.hpp" 11 #include "boost/shared_ptr.hpp" 28 class Common_substrings;
29 class Ukkonen_inserter;
35 const Branch_id bid = Branch_id(),
36 const unsigned min_d = 0,
50 void next(
const boost::string_ref patt,
Gdst const& st);
51 bool check(
const boost::string_ref patt,
Gdst const& st);
72 friend class Common_substrings;
84 : ss_(new seq_map(Seq_id(1))),
88 root_(bm_.insert(
Branch(0)))
91 explicit Gdst(seq_map_ptr ss)
96 root_(bm_.insert(
Branch(0)))
99 std::size_t
size()
const {
return bm_.size();}
101 Branch_id
root()
const {
return root_;}
122 if(
const Leaf_id lid = bm_[bid].leaf_ )
return lm_[lid];
126 const seq_type
suffix(
const Seq_id sid,
const unsigned len)
const {
128 BOOST_ASSERT((*ss_)[sid].size());
129 BOOST_ASSERT((*ss_)[sid].size() >= len);
130 const seq_type seq = (*ss_)[sid].sequence();
131 return seq.substr(seq.length() - len);
134 const seq_type
suffix(
const Leaf_id lid,
const unsigned len)
const {
136 BOOST_ASSERT(lm_[lid].size());
137 return suffix(lm_[lid].front(), len);
143 Branch const& suffix_branch = bm_[b.
sb_];
144 BOOST_ASSERT(suffix_branch.
leaf_);
145 return suffix(suffix_branch.
leaf_, suffix_branch.
n_);
148 const seq_type
suffix(
const Branch_id bid)
const {
150 return suffix(bm_[bid]);
157 BOOST_ASSERT(i < bm_[bid].n_);
158 const seq_type seq = suffix(bid);
168 BOOST_ASSERT(bm_[bid].n_ < i);
169 const Branch_id c = child(bid, n);
171 BOOST_ASSERT(i < bm_[c].n_);
172 const seq_type seq = suffix(c);
178 if( n >= 4 ) BOOST_THROW_EXCEPTION(
180 <<
Err::msg_t(
"invalid nucleotide (only ACGT supported)")
183 Branch const& b1 = bm_[bid1];
184 return b1.
c_ ? cm_[b1.
c_][
n] : Branch_id();
187 Branch_id
child(
const Branch_id bid1,
const seq_type seq)
const {
189 Branch const& b1 = bm_[bid1];
190 if( ! b1.
c_ )
return Branch_id();
191 BOOST_ASSERT(seq.length() > b1.
n_);
193 return cm_[b1.
c_][
n];
197 const Branch_id bid1,
203 BOOST_ASSERT(bm_[bid1].n_ < bm_[bid2].n_);
206 cm_[b1.
c_][
n] = bid2;
217 std::size_t min_len = 0
226 void find_overlaping(
227 const boost::string_ref seq,
229 std::size_t min_overlap = 0
240 for( ; m.
check(seq, *
this); m.
next(seq, *
this) );
244 void insert(
const Seq_id sid);
254 const gdst::Branch_id nid,
266 BOOST_ASSERT( ! bm_[bid1].sl_);
267 BOOST_ASSERT( bm_[bid1].n_ == bm_[bid2].n_ + 1);
268 bm_[bid1].sl_ = bid2;
277 const Branch_id c = child(bid, n);
279 BOOST_ASSERT(bm_[bid].n_ < bm_[c].n_);
280 return bm_[c].n_ - bm_[bid].n_;
283 unsigned edge_length(
const Branch_id bid,
const seq_type suff)
const {
285 BOOST_ASSERT(bm_[bid].n_ < suff.
size());
293 const boost::string_ref seq
297 BOOST_ASSERT(aei < seq.size());
299 const Branch_id c = child(an, ae);
301 BOOST_ASSERT(bm_[an].n_ < bm_[c].n_);
302 const unsigned l = bm_[c].n_ - bm_[an].n_;
319 const Branch_id bid1,
325 add_to_leaf(bid2, sid);
326 bm_[bid2].sb_ = bid2;
327 child(bid1, n1, bid2);
332 const Branch_id bid1,
337 BOOST_ASSERT(bm_[bid1].n_ < i &&
"new child index should be greater");
339 const Branch_id bid3 = child(bid1, n1);
340 BOOST_ASSERT(i < bm_[bid3].n_ &&
"old child index should be greater");
341 child(bid1, n1, bid2);
342 child(bid2, n2, bid3);
Seq_entry const & operator[](const Seq_id sid) const
Definition: gdst.hpp:115
const seq_type suffix(const Seq_id sid, const unsigned len) const
Definition: gdst.hpp:126
const seq_type suffix(const Branch_id bid) const
Definition: gdst.hpp:148
bool check(const boost::string_ref patt, Gdst const &st)
Definition: gdst.cpp:46
detail::Id_map< Branch_id, Branch > branch_map
Definition: gdst.hpp:67
Gdst()
Definition: gdst.hpp:83
Leaf const & operator[](const Leaf_id lid) const
Definition: gdst.hpp:110
std::size_t size() const
Definition: id_map.hpp:37
bool at_end() const
Definition: depth_first_iterator.hpp:36
Branch_id child(const Branch_id bid1, const seq_type seq) const
Definition: gdst.hpp:187
Branch const & operator[](const Branch_id bid) const
Definition: gdst.hpp:105
std::size_t size() const
Definition: gdst.hpp:99
Common subsequence.
Definition: common_subsequence.hpp:18
unsigned edge_length(const Branch_id bid, const Nucleotide n) const
Definition: gdst.hpp:276
boost::string_ref seq_type
Definition: gdst.hpp:70
bool mismatch_
Definition: gdst.hpp:60
Nucleotide
Definition: nucleotide_index.hpp:24
Match(const Branch_id bid=Branch_id(), const unsigned min_d=0, const unsigned i=0)
Definition: gdst.hpp:34
void suffix_link(const Branch_id bid1, const Branch_id bid2)
Definition: gdst.hpp:263
Branch_id child(const Branch_id bid1, const Nucleotide n) const
Definition: gdst.hpp:176
Branch_id curr_
Definition: gdst.hpp:53
leaf_map lm_
Definition: gdst.hpp:250
seq_map const & sequence_map() const
Definition: gdst.hpp:102
Leaf_id leaf_
Definition: nodes.hpp:67
void check_edge(Branch_id &an, unsigned &aei, unsigned &al, const boost::string_ref seq) const
Definition: gdst.hpp:289
children_map cm_
Definition: gdst.hpp:249
unsigned n_
Definition: nodes.hpp:57
Branch_id next_
Definition: gdst.hpp:59
Main namespace of vdj_pipe library.
Definition: keywords_variable.hpp:11
Branch_id sb_
Definition: nodes.hpp:64
void insert(Iter i1, Iter i2)
Definition: vector_set.hpp:40
According to: http://www.cise.ufl.edu/~sahni/dsaaj/enrich/c16/suffix.htm http://stackoverflow.com/questions/9452701.
Definition: gdst_ukkonen_inserter.hpp:20
Gdst(seq_map_ptr ss)
Definition: gdst.hpp:91
void next(const boost::string_ref patt, Gdst const &st)
Definition: gdst.cpp:17
#define VDJ_PIPE_DECL
Definition: config.hpp:23
Depth_iter depth_first() const
Definition: gdst.hpp:100
Branch_id root_
Definition: gdst.hpp:251
Nucleotide en_
Definition: gdst.hpp:61
Definition: gdst_stats.hpp:17
bool at_pattern_end_
Definition: gdst.hpp:56
branch_map bm_
Definition: gdst.hpp:248
detail::Id_map< Seq_id, Seq_entry > seq_map
Definition: gdst.hpp:78
boost::shared_ptr< seq_map > seq_map_ptr
Definition: gdst.hpp:79
Definition: depth_first_iterator.hpp:20
Branch_id suffix_link(const Branch_id bid) const
Definition: gdst.hpp:271
std::vector< match_type > match_vector
Definition: gdst.hpp:81
Definition: sequence_position.hpp:15
bool at_tree_end_
Definition: gdst.hpp:58
boost::error_info< struct errinfo_str1_, std::string > str1_t
Definition: exception.hpp:25
Branch_id root() const
Definition: gdst.hpp:101
Branch_id split_edge(const Branch_id bid1, const Nucleotide n1, const unsigned i, const Nucleotide n2)
Definition: gdst.hpp:331
Nucleotide pn_
Definition: gdst.hpp:57
detail::Id_map< Leaf_id, Leaf > leaf_map
Definition: gdst.hpp:68
const seq_type suffix(const Leaf_id lid, const unsigned len) const
Definition: gdst.hpp:134
iterator end()
Definition: vector_set.hpp:29
const seq_type suffix(Branch const &b) const
Definition: gdst.hpp:140
const std::size_t n
Definition: vector_set_test.cpp:26
unsigned edge_length(const Branch_id bid, const seq_type suff) const
Definition: gdst.hpp:283
unsigned min_d_
Definition: gdst.hpp:54
Nucleotide nucleotide_index(const char c)
Definition: nucleotide_index.hpp:45
Definition: exception.hpp:23
char to_capital(const Nucleotide n)
Definition: nucleotide_index.hpp:216
seq_map & sequence_map()
Definition: gdst.hpp:103
Nucleotide letter(const Branch_id bid, const Nucleotide n, const unsigned i) const
Definition: gdst.hpp:162
boost::error_info< struct errinfo_message_, std::string > msg_t
Definition: exception.hpp:24
Branch_id leaf_from_branch(const Branch_id bid1, const Nucleotide n1, const unsigned i, const Seq_id sid)
Definition: gdst.hpp:318
seq_map_ptr ss_
Definition: gdst.hpp:247
std::string sanitize(const char c)
Definition: sanitize_string.cpp:53
void collect_sequences(const gdst::Branch_id nid, detail::Vector_set< Seq_id > &vs) const
Definition: gdst.hpp:253
void child(const Branch_id bid1, const Nucleotide n, const Branch_id bid2)
Definition: gdst.hpp:196
Leaf const & leaf(const Branch_id bid) const
Definition: gdst.hpp:119
iterator begin()
Definition: vector_set.hpp:28
unsigned i_
Definition: gdst.hpp:55
Generalized DNA suffix tree.
Definition: gdst.hpp:66
void add_to_leaf(const Branch_id bid, const Seq_id sid)
Definition: gdst.hpp:310
Match find(const seq_type seq, const Branch_id bid, unsigned min_d=0) const
Definition: gdst.hpp:234
detail::Id_map< Children_id, Children > children_map
Definition: gdst.hpp:69
id_type insert(value_type const &obj)
Definition: id_map.hpp:83
Children_id c_
Definition: nodes.hpp:59
Store suffixes Each leaf is attached to a branch and stores suffixes that have the length of the bran...
Definition: nodes.hpp:22
Seq_pos< Seq_id > match_type
Definition: gdst.hpp:80
Nucleotide letter(const Branch_id bid, const unsigned i) const
Definition: gdst.hpp:153
Results of matching pattern to suffix tree.
Definition: gdst.hpp:33
Definition: sequence_entry.hpp:19