vdj_pipe
pipeline for processing DNA sequence data
find_shared_impl.hpp
Go to the documentation of this file.
1 
7 #ifndef FIND_SHARED_IMPL_HPP_
8 #define FIND_SHARED_IMPL_HPP_
9 #include <algorithm>
10 #include <functional>
11 #include <ostream>
12 #include <string>
13 #include <vector>
14 #include "boost/algorithm/string/classification.hpp" //operator!
15 #include "boost/algorithm/string/predicate.hpp" //all
16 #include "boost/assert.hpp"
17 #include "boost/container/flat_map.hpp"
18 #include "boost/foreach.hpp"
19 #include "boost/multi_index_container.hpp"
20 #include "boost/multi_index/hashed_index.hpp"
21 #include "boost/multi_index/member.hpp"
22 #include "boost/multi_index/ordered_index.hpp"
23 #include "boost/range.hpp"
25 #include "vdj_pipe/exception.hpp"
26 #include "vdj_pipe/file_stream.hpp"
27 #include "vdj_pipe/gdst_search.hpp"
29 #include "vdj_pipe/object_ids.hpp"
30 
31 namespace vdj_pipe{
32 
35 struct Read_ginfo {
36  std::string id_str_;
37  Read_id rid_;
38  Mapped_id mid_;
39 };
40 
43 class Group_count {
44 
45 public:
47 
48  explicit Group_count(const std::size_t n_groups)
49  : v_(n_groups, 0)
50  {}
51 
52  unsigned group_count() const {
53  return std::count_if(v_.begin(), v_.end(), Identity<unsigned>());
54  }
55 
56  void operator()(const Mapped_id mid) {++v_[mid()];}
57  unsigned operator[](const Mapped_id mid) const {return v_[mid()];}
58 
59  unsigned copy_count() const {
60  return accumulate(v_.begin(), v_.end(), 0U);
61  }
62 
63  void operator()(Group_count const& gc) {
64  transform(
65  v_.begin(),
66  v_.end(),
67  gc.v_.begin(),
68  v_.begin(),
69  std::plus<unsigned>()
70  );
71 
72  }
73 
74  std::vector<unsigned> v_;
75 };
76 
80  typedef boost::multi_index_container<
81  Read_ginfo,
82  boost::multi_index::indexed_by<
83  boost::multi_index::hashed_unique<
84  boost::multi_index::tag<struct id_tag>,
85  boost::multi_index::member<
86  Read_ginfo, Read_id, &Read_ginfo::rid_
87  >
88  >,
89  boost::multi_index::hashed_unique<
90  boost::multi_index::tag<struct id_str_tag>,
91  boost::multi_index::member<
92  Read_ginfo, std::string, &Read_ginfo::id_str_
93  >
94  >,
95  boost::multi_index::ordered_non_unique<
96  boost::multi_index::tag<struct mapped_tag>,
97  boost::multi_index::member<
98  Read_ginfo, Mapped_id, &Read_ginfo::mid_
99  >
100  >
101  >
103 
104  typedef store_t::index<id_tag>::type id_index;
105  typedef id_index::const_iterator id_iterator;
106 
107  typedef store_t::index<id_str_tag>::type id_str_index;
108  typedef id_str_index::const_iterator id_str_iterator;
109 
110  typedef store_t::index<mapped_tag>::type mid_index;
111  typedef mid_index::const_iterator mid_iterator;
112  typedef boost::iterator_range<mid_iterator> mid_range;
113 
114 public:
115 
116  struct Err : public base_exception {};
117 
118  std::size_t size() const {return m_.size();}
119 
120  Read_id insert(std::string const& id, const Mapped_id mid) {
121  const Read_id rid(m_.size() + 1);
122  const Read_ginfo rg = {id, rid, mid};
123  m_.insert(rg);
124  return rid;
125  }
126 
127  mid_range by_mid(const Mapped_id mid) const {
128  return boost::make_iterator_range(m_.get<mapped_tag>().equal_range(mid));
129  }
130 
131  Read_ginfo const& operator[](const Read_id rid) const {
132  return *m_.get<id_tag>().find(rid);
133  }
134 
135 private:
137 };
138 
143  : gc_(), gem_(0), gme_(0), tem_(0), tme_(0), mm5_(0), mm3_(0)
144  {}
145 
146  Sequence_gstats(Group_count const& gc, const Mapped_id mid)
147  : gc_(gc), gem_(gc[mid]), gme_(0), tem_(gc.copy_count()), tme_(0),
148  mm5_(0), mm3_(0)
149  {}
150 
151  static std::ostream& print_header(std::ostream& os, const char delim) {
152  os
153  /* ISSUE-41:
154  << "group_copies" << delim
155  << "group_exact_copies" << delim
156  << "group_mismatched_ends" << delim
157  << "5'-mismatch" << delim
158  << "3'-mismatch" << delim
159  << "total_copies" << delim
160  << "total_exact_copies" << delim */
161  << "exact_copies"
162  ;
163  return os;
164  }
165 
166  std::ostream& print(
167  std::ostream& os,
168  const char delim,
169  const Mapped_id mid
170  ) const {
171  os
172  /* ISSUE-41:
173  << gc_[mid] << delim
174  << gem_ << delim
175  << gme_ << delim
176  << (int)mm5_ << delim
177  << (int)mm3_ << delim
178  << gc_.copy_count() << delim
179  << tem_ << delim
180  << tme_ */
181  << gc_[mid]
182  ;
183  return os;
184  }
185 
187  unsigned gem_;
188  unsigned gme_;
189  unsigned tem_;
190  unsigned tme_;
191  unsigned char mm5_;
192  unsigned char mm3_;
193 };
194 
198  typedef boost::container::flat_map<Seq_id, Sequence_gstats> seq_map;
199  typedef seq_map::value_type pair;
200  typedef std::vector<unsigned> xstats;
201 
202 public:
204  Seq_store& ss,
205  Read_ginfo_store const& rgs,
206  const unsigned n_mids,
207  const Mapped_id mid,
208  Get_match_length const& gml
209  )
210  : st_(ss.sequence_map()),
211  m_(),
212  ss_(ss),
213  rgs_(rgs),
214  n_mids_(n_mids),
215  mid_(mid),
216  gml_(gml)
217  {
218  if( ss.empty() ) return;
219  const unsigned shortest = ss_[ss_.by_size().front()].size();
220  for(
221  unsigned len = ss_[ss_.by_size().back()].size();
222  len >= shortest;
223  --len
224  ) {
225  BOOST_FOREACH(const Seq_id sid, ss_.by_size(len)) {
226  Group_count gc(n_mids_);
227  BOOST_FOREACH(const sub_seq sp, ss_.maps_from(sid)) {
228  gc(rgs_[sp.id_].mid_);
229  }
230  if( gc[mid_] ) find_duplicates(sid, gc);
231  }
232  }
233  }
234 
235  unsigned n_reads() const {return m_.size();}
236 
238  File_ostream& fos,
239  const unsigned min_duplicates,
240  const bool consensus_trim,
241  const std::string group_name
242  ) const {
243  BOOST_FOREACH(pair const& p, m_) {
244  const Read_id rid = find_id(p.first);
245  if( ! min_duplicates || min_duplicates < p.second.gc_[mid_] ) {
246  boost::string_ref seq = ss_[p.first].sequence();
247  if( consensus_trim ) {
248  seq.remove_prefix(p.second.mm5_);
249  seq.remove_suffix(p.second.mm3_);
250  }
251  //std::cout << min_duplicates << ' ' << rgs_[rid].id_str_ << ' ' << p.second.gc_[mid_] << ' ' << p.second.gem_ << '\n';
252  std::ostringstream stm;
253  stm << p.second.gc_[mid_];
254  std::string descr = rgs_[rid].id_str_ + "|MID=" + group_name + "|DUPCOUNT=" + stm.str();
255  fos.write(descr, seq);
256  }
257  }
258  }
259 
261  File_ostream& fos,
262  const unsigned min_duplicates
263  ) const {
264  BOOST_FOREACH(pair const& p, m_) {
265  BOOST_ASSERT(p.second.gc_.group_count());
266  BOOST_ASSERT(p.second.gc_[mid_]);
267  if(
268  p.second.gc_.group_count() > 1 ||
269  ( min_duplicates && p.second.gc_[mid_] <= min_duplicates )
270  ) continue;
271  const Read_id rid = find_id(p.first);
272  std::ostringstream stm;
273  stm << p.second.gc_[mid_];
274  std::string descr = rgs_[rid].id_str_ + "|DUPCOUNT=" + stm.str();
275  fos.write(descr, ss_[p.first].sequence());
276  }
277  }
278 
279  void write_redundancy(std::ostream& os, const char delim) const {
280  os << "read_id" << delim;
281  Sequence_gstats::print_header(os, delim) << '\n';
282 
283  BOOST_FOREACH(pair const& p, m_) {
284  const Read_id rid = find_id(p.first);
285  os << rgs_[rid].id_str_ << delim;
286  p.second.print(os, delim, mid_) << '\n';
287  }
288  }
289 
290  unsigned find_shared(xstats& xs1, xstats& xs2) {
291  xs1.resize(n_mids_, 0);
292  xs2.resize(n_mids_, 0);
293  BOOST_FOREACH(const Seq_id sid, ss_) {
294  Group_count gc(n_mids_);
295  BOOST_FOREACH(const sub_seq sp, ss_.maps_from(sid)) {
296  gc(rgs_[sp.id_].mid_);
297  }
298 
299  if( gc[mid_] || find_shared(sid, gc) ) {
300  transform(
301  xs2.begin(),
302  xs2.end(),
303  gc.v_.begin(),
304  xs2.begin(),
305  std::plus<unsigned>()
306  );
307  }
308  }
309 
310  unsigned unique = 0;
311  BOOST_FOREACH(pair const& p, m_) {
312  bool is_unique = true;
313  for(std::size_t n = 0; n != n_mids_; ++n) {
314  if( p.second.gc_.v_[n] ) ++xs1[n];
315  if( is_unique && n != mid_() && p.second.gc_.v_[n] ) {
316  is_unique = false;
317  }
318  }
319  unique += is_unique;
320  }
321  return unique;
322  }
323 
324  void group_redundancy_histogram(xstats& h) const {
325  BOOST_FOREACH(pair const& p, m_) {
326  const std::size_t n = p.second.gc_[mid_];
327  if( h.size() <= n ) h.resize(n + 1, 0);
328  ++h[n];
329  }
330  }
331 
332  void redundancy_histogram(xstats& h) const {
333  BOOST_FOREACH(pair const& p, m_) {
334  const std::size_t n = p.second.gc_.copy_count();
335  if( h.size() <= n ) h.resize(n + 1, 0);
336  ++h[n];
337  }
338  }
339 
340 private:
342  seq_map m_;
343  Seq_store const& ss_;
345  unsigned n_mids_;
346  Mapped_id mid_;
348 
350  const unsigned seq_len,
351  const unsigned match_pos,
352  const unsigned match_len,
353  super_seq const& super,
354  Group_count const& gc
355  ) {
356  Sequence_gstats& stats = m_[super.id_];
357  stats.gc_(gc);
358  if(
360  match_pos,
361  seq_len,
362  super.pos_,
363  ss_[super.id_].size(),
364  match_len
365  )
366  ) {
367  stats.gme_ += gc[mid_];
368  stats.tme_ += gc.copy_count();
369  }
370  if( stats.mm5_ < match_pos ) stats.mm5_ = match_pos;
371  const unsigned match_end = seq_len - match_pos - match_len;
372  if( stats.mm3_ < match_end ) stats.mm3_ = match_end;
373  }
374 
381  std::string const& seq,
382  Group_count const& gc
383  ) {
384  BOOST_ASSERT( all(seq, ! Is_ambiguous()) );
385  const gdst::Common_subseq cs = st_.find_longest(seq, gml_(seq.size()));
386  if( cs.seq_.empty() ) return false;
387  BOOST_FOREACH(super_seq const& super, cs.seq_) {
388  BOOST_ASSERT(m_.find(super.id_) != m_.end());
389  store_stats(seq.size(), cs.start_, cs.len_, super, gc);
390  }
391  return true;
392  }
393 
399  const Seq_id sid,
400  Group_count const& gc
401  ) {
402  std::string const& seq = ss_[sid].sequence();
403  if( ! all(seq, ! Is_ambiguous()) ) return false;
404  return check_duplicates(seq, gc);
405  }
406 
407  Read_id find_id(const Seq_id sid) const {
408  BOOST_FOREACH(const sub_seq sp, ss_.maps_from(sid)) {
409  const Read_id rid = sp.id_;
410  if( rgs_[rid].mid_ == mid_ ) return rid;
411  }
412  BOOST_ASSERT(false);
413  return Read_id();
414  }
415 
416  bool is_only_group(const Seq_id sid, const Mapped_id mid) const {
417  bool found = false;
418  BOOST_FOREACH(const sub_seq sp, ss_.maps_from(sid)) {
419  const Read_id rid = sp.id_;
420  if( rgs_[rid].mid_ == mid ) found = true;
421  if( rgs_[rid].mid_ == mid_ ) return false;
422  }
423  return found;
424  }
425 
427  const Seq_id sid,
428  Group_count const& gc
429  ) {
430  std::string const& seq = ss_[sid].sequence();
431  if( all(seq, ! Is_ambiguous()) ) {
432  if( check_duplicates(seq, gc) ) return;
433  st_.insert(sid);
434  }
435  BOOST_ASSERT(m_.find(sid) == m_.end());
436  m_.emplace(sid, Sequence_gstats(gc, mid_));
437  }
438 };
439 
440 }//namespace vdj_pipe
441 #endif /* FIND_SHARED_IMPL_HPP_ */
void operator()(const Mapped_id mid)
Definition: find_shared_impl.hpp:56
std::size_t size() const
Definition: find_shared_impl.hpp:118
Read ID and group ID.
Definition: find_shared_impl.hpp:35
Seq_store const & ss_
Definition: find_shared_impl.hpp:343
mid_index::const_iterator mid_iterator
Definition: find_shared_impl.hpp:111
Group_count(const std::size_t n_groups)
Definition: find_shared_impl.hpp:48
Group_unique(Seq_store &ss, Read_ginfo_store const &rgs, const unsigned n_mids, const Mapped_id mid, Get_match_length const &gml)
Definition: find_shared_impl.hpp:203
unsigned gme_
Definition: find_shared_impl.hpp:188
unsigned copy_count() const
Definition: find_shared_impl.hpp:59
void find_duplicates(const Seq_id sid, Group_count const &gc)
Definition: find_shared_impl.hpp:426
Read_id insert(std::string const &id, const Mapped_id mid)
Definition: find_shared_impl.hpp:120
unsigned gem_
Definition: find_shared_impl.hpp:187
Id id_
Definition: sequence_position.hpp:20
mid_range by_mid(const Mapped_id mid) const
Definition: find_shared_impl.hpp:127
unsigned start_
match starting position in pattern
Definition: common_subsequence.hpp:23
Definition: find_shared_impl.hpp:197
Definition: find_shared_impl.hpp:141
void write(std::string const &descr, Seq_record::sequence const &seq)
Definition: file_stream.cpp:173
bool empty() const
Definition: sequence_store.hpp:98
unsigned len_
match length
Definition: common_subsequence.hpp:24
Common subsequence.
Definition: common_subsequence.hpp:18
static std::ostream & print_header(std::ostream &os, const char delim)
Definition: find_shared_impl.hpp:151
Read_id find_id(const Seq_id sid) const
Definition: find_shared_impl.hpp:407
unsigned pos_
Definition: sequence_position.hpp:21
std::vector< unsigned > xstats
Definition: find_shared_impl.hpp:200
gdst::Gdst st_
Definition: find_shared_impl.hpp:341
void operator()(Group_count const &gc)
Definition: find_shared_impl.hpp:63
void write_redundancy(std::ostream &os, const char delim) const
Definition: find_shared_impl.hpp:279
detail::Vector_set< super_seq > seq_
Definition: common_subsequence.hpp:27
unsigned tme_
Definition: find_shared_impl.hpp:190
Get_match_length const & gml_
Definition: find_shared_impl.hpp:347
id_index::const_iterator id_iterator
Definition: find_shared_impl.hpp:105
Read_id rid_
Definition: find_shared_impl.hpp:37
store_t m_
Definition: find_shared_impl.hpp:136
boost::container::flat_map< Seq_id, Sequence_gstats > seq_map
Definition: find_shared_impl.hpp:198
boost::multi_index_container< Read_ginfo, boost::multi_index::indexed_by< boost::multi_index::hashed_unique< boost::multi_index::tag< struct id_tag >, boost::multi_index::member< Read_ginfo, Read_id,&Read_ginfo::rid_ > >, boost::multi_index::hashed_unique< boost::multi_index::tag< struct id_str_tag >, boost::multi_index::member< Read_ginfo, std::string,&Read_ginfo::id_str_ > >, boost::multi_index::ordered_non_unique< boost::multi_index::tag< struct mapped_tag >, boost::multi_index::member< Read_ginfo, Mapped_id,&Read_ginfo::mid_ > > > > store_t
Definition: find_shared_impl.hpp:102
Read_ginfo const & operator[](const Read_id rid) const
Definition: find_shared_impl.hpp:131
Definition: nucleotide_index.hpp:330
unsigned n_reads() const
Definition: find_shared_impl.hpp:235
Mapped_id mid_
Definition: find_shared_impl.hpp:38
Read ID and group ID.
Definition: find_shared_impl.hpp:79
void write_group_unique(File_ostream &fos, const unsigned min_duplicates, const bool consensus_trim, const std::string group_name) const
Definition: find_shared_impl.hpp:237
Main namespace of vdj_pipe library.
Definition: keywords_variable.hpp:11
bool is_only_group(const Seq_id sid, const Mapped_id mid) const
Definition: find_shared_impl.hpp:416
Mapped_id mid_
Definition: find_shared_impl.hpp:346
unsigned tem_
Definition: find_shared_impl.hpp:189
Sequence_gstats()
Definition: find_shared_impl.hpp:142
Group_count()
Definition: find_shared_impl.hpp:46
store_t::index< id_str_tag >::type id_str_index
Definition: find_shared_impl.hpp:107
std::size_t size() const
Definition: sequence_store.hpp:95
Definition: find_shared_impl.hpp:116
Group_count gc_
Definition: find_shared_impl.hpp:186
id_str_index::const_iterator id_str_iterator
Definition: find_shared_impl.hpp:108
unsigned group_count() const
Definition: find_shared_impl.hpp:52
void group_redundancy_histogram(xstats &h) const
Definition: find_shared_impl.hpp:324
Sequence_gstats(Group_count const &gc, const Mapped_id mid)
Definition: find_shared_impl.hpp:146
void write_unique(File_ostream &fos, const unsigned min_duplicates) const
Definition: find_shared_impl.hpp:260
std::string id_str_
Definition: find_shared_impl.hpp:36
void redundancy_histogram(xstats &h) const
Definition: find_shared_impl.hpp:332
unsigned operator[](const Mapped_id mid) const
Definition: find_shared_impl.hpp:57
void store_stats(const unsigned seq_len, const unsigned match_pos, const unsigned match_len, super_seq const &super, Group_count const &gc)
Definition: find_shared_impl.hpp:349
Definition: sequence_position.hpp:15
boost::iterator_range< mid_iterator > mid_range
Definition: find_shared_impl.hpp:112
bool find_shared(const Seq_id sid, Group_count const &gc)
Definition: find_shared_impl.hpp:398
std::vector< unsigned > v_
Definition: find_shared_impl.hpp:74
unsigned find_shared(xstats &xs1, xstats &xs2)
Definition: find_shared_impl.hpp:290
unsigned n_mids_
Definition: find_shared_impl.hpp:345
const std::size_t n
Definition: vector_set_test.cpp:26
store_t::index< mapped_tag >::type mid_index
Definition: find_shared_impl.hpp:110
store_t::index< id_tag >::type id_index
Definition: find_shared_impl.hpp:104
Common_subseq find_longest(const seq_type seq, std::size_t min_len=0) const
Definition: gdst.cpp:85
Definition: exception.hpp:23
seq_map::value_type pair
Definition: find_shared_impl.hpp:199
std::ostream & print(std::ostream &os, const char delim, const Mapped_id mid) const
Definition: find_shared_impl.hpp:166
unsigned char mm5_
Definition: find_shared_impl.hpp:191
Definition: file_stream.hpp:24
Definition: find_shared_impl.hpp:43
bool check_duplicates(std::string const &seq, Group_count const &gc)
Definition: find_shared_impl.hpp:380
Set a fraction of sequence length to match.
Definition: min_match_length.hpp:93
seq_map m_
Definition: find_shared_impl.hpp:342
std::string transform(const boost::string_ref seq, sequence_interval const &si, const bool reverse)
Reverse-complement (optionally) a portion of a sequence.
Definition: sequence_transform.hpp:34
sub_seq_set const & maps_from(const Seq_id sid) const
Definition: sequence_store.hpp:165
Generalized DNA suffix tree.
Definition: gdst.hpp:66
unsigned char mm3_
Definition: find_shared_impl.hpp:192
bool mismatching_end(const unsigned start1, const unsigned len1, const unsigned start2, const unsigned len2, const unsigned match_len)
Definition: common_subsequence.hpp:38
Store sequence and related information.
Definition: sequence_store.hpp:30
Read_ginfo_store const & rgs_
Definition: find_shared_impl.hpp:344
why is it not in STL?
Definition: identity_predicate.hpp:15
void insert(const Seq_id sid)
Definition: gdst.cpp:79