vdj_pipe
pipeline for processing DNA sequence data
gdst_search.hpp
Go to the documentation of this file.
1 
7 #ifndef GDST_SEARCH_HPP_
8 #define GDST_SEARCH_HPP_
9 #include <string>
10 #include <vector>
11 #include "boost/algorithm/string/predicate.hpp" //all
12 #include "boost/algorithm/string/classification.hpp" //operator!
13 #include "boost/assert.hpp"
14 #include "boost/foreach.hpp"
16 #include "vdj_pipe/gdst/gdst.hpp"
18 
19 namespace vdj_pipe{
20 
24 template<class MinLength> inline void remove_subsequences(
25  Seq_store& ss,
26  gdst::Gdst& st,
27  MinLength const& min_len
28 ) {
29  if( ss.empty() ) return;
30  const unsigned shortest = ss[ss.by_size().front()].size();
31  std::vector<Seq_id> sidv;
32  for( unsigned len = ss[ss.by_size().back()].size(); len >= shortest; --len ) {
33  BOOST_FOREACH(const Seq_id sid, ss.by_size(len)) sidv.push_back(sid);
34  BOOST_FOREACH(const Seq_id sid, sidv) {
35  std::string const& seq = ss[sid].sequence();
36  if( ! all(seq, ! Is_ambiguous()) ) continue;
37  const gdst::Common_subseq cs =
38  st.find_longest(seq, min_len(seq.size()));
39  if( cs.seq_.empty() ) {
40  st.insert(sid);
41  } else {
42  ss.remove_subsequence(sid, cs.seq_);
43  }
44  }
45  sidv.clear();
46  }
47 }
48 
49 }//namespace vdj_pipe
50 #endif /* GDST_SEARCH_HPP_ */
void remove_subsequences(Seq_store &ss, gdst::Gdst &st, MinLength const &min_len)
Identify unique sequences in the sequence store, insert them into the suffix tree, and remove non-unique ones...
Definition: gdst_search.hpp:24
bool empty() const
Definition: sequence_store.hpp:98
Common subsequence.
Definition: common_subsequence.hpp:18
size_range by_size() const
Definition: sequence_store.hpp:103
detail::Vector_set< super_seq > seq_
Definition: common_subsequence.hpp:27
Definition: nucleotide_index.hpp:330
Main namespace of vdj_pipe library.
Definition: keywords_variable.hpp:11
std::size_t size() const
Definition: sequence_store.hpp:95
void remove_subsequence(const Seq_id sid, Super_seqs const &ss)
Definition: sequence_store.hpp:145
Common_subseq find_longest(const seq_type seq, std::size_t min_len=0) const
Definition: gdst.cpp:85
Generalized DNA suffix tree.
Definition: gdst.hpp:66
Store sequence and related information.
Definition: sequence_store.hpp:30
void insert(const Seq_id sid)
Definition: gdst.cpp:79