7 #ifndef FIND_SHARED_IMPL_HPP_ 8 #define FIND_SHARED_IMPL_HPP_ 14 #include "boost/algorithm/string/classification.hpp" 15 #include "boost/algorithm/string/predicate.hpp" 16 #include "boost/assert.hpp" 17 #include "boost/container/flat_map.hpp" 18 #include "boost/foreach.hpp" 19 #include "boost/multi_index_container.hpp" 20 #include "boost/multi_index/hashed_index.hpp" 21 #include "boost/multi_index/member.hpp" 22 #include "boost/multi_index/ordered_index.hpp" 23 #include "boost/range.hpp" 57 unsigned operator[](
const Mapped_id mid)
const {
return v_[mid()];}
60 return accumulate(v_.begin(), v_.end(), 0U);
74 std::vector<unsigned>
v_;
80 typedef boost::multi_index_container<
82 boost::multi_index::indexed_by<
83 boost::multi_index::hashed_unique<
84 boost::multi_index::tag<struct id_tag>,
85 boost::multi_index::member<
89 boost::multi_index::hashed_unique<
90 boost::multi_index::tag<struct id_str_tag>,
91 boost::multi_index::member<
95 boost::multi_index::ordered_non_unique<
96 boost::multi_index::tag<struct mapped_tag>,
97 boost::multi_index::member<
118 std::size_t
size()
const {
return m_.size();}
120 Read_id
insert(std::string
const&
id,
const Mapped_id mid) {
121 const Read_id rid(m_.size() + 1);
122 const Read_ginfo rg = {id, rid, mid};
127 mid_range
by_mid(
const Mapped_id mid)
const {
128 return boost::make_iterator_range(m_.get<mapped_tag>().equal_range(mid));
132 return *m_.get<id_tag>().find(rid);
143 : gc_(), gem_(0), gme_(0), tem_(0), tme_(0), mm5_(0), mm3_(0)
147 : gc_(gc), gem_(gc[mid]), gme_(0), tem_(gc.copy_count()), tme_(0),
151 static std::ostream&
print_header(std::ostream& os,
const char delim) {
198 typedef boost::container::flat_map<Seq_id, Sequence_gstats>
seq_map;
199 typedef seq_map::value_type
pair;
206 const unsigned n_mids,
210 : st_(ss.sequence_map()),
218 if( ss.
empty() )
return;
219 const unsigned shortest = ss_[ss_.by_size().front()].size();
221 unsigned len = ss_[ss_.by_size().back()].size();
225 BOOST_FOREACH(
const Seq_id sid, ss_.by_size(len)) {
227 BOOST_FOREACH(
const sub_seq sp, ss_.maps_from(sid)) {
228 gc(rgs_[sp.
id_].mid_);
230 if( gc[
mid_] ) find_duplicates(sid, gc);
239 const unsigned min_duplicates,
240 const bool consensus_trim,
241 const std::string group_name
243 BOOST_FOREACH(pair
const& p, m_) {
244 const Read_id rid = find_id(p.first);
245 if( ! min_duplicates || min_duplicates < p.second.gc_[
mid_] ) {
246 boost::string_ref seq = ss_[p.first].sequence();
247 if( consensus_trim ) {
248 seq.remove_prefix(p.second.mm5_);
249 seq.remove_suffix(p.second.mm3_);
252 std::ostringstream stm;
253 stm << p.second.gc_[
mid_];
254 std::string descr = rgs_[rid].id_str_ +
"|MID=" + group_name +
"|DUPCOUNT=" + stm.str();
255 fos.
write(descr, seq);
262 const unsigned min_duplicates
264 BOOST_FOREACH(pair
const& p, m_) {
265 BOOST_ASSERT(p.second.gc_.group_count());
266 BOOST_ASSERT(p.second.gc_[
mid_]);
268 p.second.gc_.group_count() > 1 ||
269 ( min_duplicates && p.second.gc_[
mid_] <= min_duplicates )
271 const Read_id rid = find_id(p.first);
272 std::ostringstream stm;
273 stm << p.second.gc_[
mid_];
274 std::string descr = rgs_[rid].id_str_ +
"|DUPCOUNT=" + stm.str();
275 fos.
write(descr, ss_[p.first].sequence());
280 os <<
"read_id" << delim;
283 BOOST_FOREACH(pair
const& p, m_) {
284 const Read_id rid = find_id(p.first);
285 os << rgs_[rid].id_str_ << delim;
286 p.second.print(os, delim,
mid_) <<
'\n';
291 xs1.resize(n_mids_, 0);
292 xs2.resize(n_mids_, 0);
293 BOOST_FOREACH(
const Seq_id sid, ss_) {
295 BOOST_FOREACH(
const sub_seq sp, ss_.maps_from(sid)) {
296 gc(rgs_[sp.
id_].mid_);
299 if( gc[
mid_] || find_shared(sid, gc) ) {
305 std::plus<unsigned>()
311 BOOST_FOREACH(pair
const& p, m_) {
312 bool is_unique =
true;
313 for(std::size_t
n = 0;
n != n_mids_; ++
n) {
314 if( p.second.gc_.v_[
n] ) ++xs1[
n];
315 if( is_unique &&
n !=
mid_() && p.second.gc_.v_[
n] ) {
325 BOOST_FOREACH(pair
const& p, m_) {
326 const std::size_t
n = p.second.gc_[
mid_];
327 if( h.size() <=
n ) h.resize(n + 1, 0);
333 BOOST_FOREACH(pair
const& p, m_) {
334 const std::size_t
n = p.second.gc_.copy_count();
335 if( h.size() <=
n ) h.resize(n + 1, 0);
350 const unsigned seq_len,
351 const unsigned match_pos,
352 const unsigned match_len,
370 if( stats.
mm5_ < match_pos ) stats.
mm5_ = match_pos;
371 const unsigned match_end = seq_len - match_pos - match_len;
372 if( stats.
mm3_ < match_end ) stats.
mm3_ = match_end;
381 std::string
const& seq,
386 if( cs.
seq_.empty() )
return false;
388 BOOST_ASSERT(m_.find(super.
id_) != m_.end());
389 store_stats(seq.size(), cs.
start_, cs.
len_, super, gc);
402 std::string
const& seq = ss_[sid].sequence();
404 return check_duplicates(seq, gc);
409 const Read_id rid = sp.
id_;
410 if( rgs_[rid].mid_ == mid_ )
return rid;
419 const Read_id rid = sp.
id_;
420 if( rgs_[rid].mid_ == mid ) found =
true;
421 if( rgs_[rid].mid_ == mid_ )
return false;
430 std::string
const& seq = ss_[sid].sequence();
432 if( check_duplicates(seq, gc) )
return;
435 BOOST_ASSERT(m_.find(sid) == m_.end());
void operator()(const Mapped_id mid)
Definition: find_shared_impl.hpp:56
std::size_t size() const
Definition: find_shared_impl.hpp:118
Read ID and group ID.
Definition: find_shared_impl.hpp:35
Seq_store const & ss_
Definition: find_shared_impl.hpp:343
mid_index::const_iterator mid_iterator
Definition: find_shared_impl.hpp:111
Group_count(const std::size_t n_groups)
Definition: find_shared_impl.hpp:48
Group_unique(Seq_store &ss, Read_ginfo_store const &rgs, const unsigned n_mids, const Mapped_id mid, Get_match_length const &gml)
Definition: find_shared_impl.hpp:203
unsigned gme_
Definition: find_shared_impl.hpp:188
unsigned copy_count() const
Definition: find_shared_impl.hpp:59
void find_duplicates(const Seq_id sid, Group_count const &gc)
Definition: find_shared_impl.hpp:426
Read_id insert(std::string const &id, const Mapped_id mid)
Definition: find_shared_impl.hpp:120
unsigned gem_
Definition: find_shared_impl.hpp:187
Id id_
Definition: sequence_position.hpp:20
mid_range by_mid(const Mapped_id mid) const
Definition: find_shared_impl.hpp:127
unsigned start_
match starting position in pattern
Definition: common_subsequence.hpp:23
Definition: find_shared_impl.hpp:197
Definition: find_shared_impl.hpp:141
void write(std::string const &descr, Seq_record::sequence const &seq)
Definition: file_stream.cpp:173
bool empty() const
Definition: sequence_store.hpp:98
unsigned len_
match length
Definition: common_subsequence.hpp:24
Common subsequence.
Definition: common_subsequence.hpp:18
static std::ostream & print_header(std::ostream &os, const char delim)
Definition: find_shared_impl.hpp:151
Read_id find_id(const Seq_id sid) const
Definition: find_shared_impl.hpp:407
unsigned pos_
Definition: sequence_position.hpp:21
std::vector< unsigned > xstats
Definition: find_shared_impl.hpp:200
gdst::Gdst st_
Definition: find_shared_impl.hpp:341
void operator()(Group_count const &gc)
Definition: find_shared_impl.hpp:63
void write_redundancy(std::ostream &os, const char delim) const
Definition: find_shared_impl.hpp:279
detail::Vector_set< super_seq > seq_
Definition: common_subsequence.hpp:27
unsigned tme_
Definition: find_shared_impl.hpp:190
Get_match_length const & gml_
Definition: find_shared_impl.hpp:347
id_index::const_iterator id_iterator
Definition: find_shared_impl.hpp:105
Read_id rid_
Definition: find_shared_impl.hpp:37
store_t m_
Definition: find_shared_impl.hpp:136
boost::container::flat_map< Seq_id, Sequence_gstats > seq_map
Definition: find_shared_impl.hpp:198
boost::multi_index_container< Read_ginfo, boost::multi_index::indexed_by< boost::multi_index::hashed_unique< boost::multi_index::tag< struct id_tag >, boost::multi_index::member< Read_ginfo, Read_id,&Read_ginfo::rid_ > >, boost::multi_index::hashed_unique< boost::multi_index::tag< struct id_str_tag >, boost::multi_index::member< Read_ginfo, std::string,&Read_ginfo::id_str_ > >, boost::multi_index::ordered_non_unique< boost::multi_index::tag< struct mapped_tag >, boost::multi_index::member< Read_ginfo, Mapped_id,&Read_ginfo::mid_ > > > > store_t
Definition: find_shared_impl.hpp:102
Read_ginfo const & operator[](const Read_id rid) const
Definition: find_shared_impl.hpp:131
Definition: nucleotide_index.hpp:330
unsigned n_reads() const
Definition: find_shared_impl.hpp:235
Mapped_id mid_
Definition: find_shared_impl.hpp:38
Read ID and group ID.
Definition: find_shared_impl.hpp:79
void write_group_unique(File_ostream &fos, const unsigned min_duplicates, const bool consensus_trim, const std::string group_name) const
Definition: find_shared_impl.hpp:237
Main namespace of vdj_pipe library.
Definition: keywords_variable.hpp:11
bool is_only_group(const Seq_id sid, const Mapped_id mid) const
Definition: find_shared_impl.hpp:416
Mapped_id mid_
Definition: find_shared_impl.hpp:346
unsigned tem_
Definition: find_shared_impl.hpp:189
Sequence_gstats()
Definition: find_shared_impl.hpp:142
Group_count()
Definition: find_shared_impl.hpp:46
store_t::index< id_str_tag >::type id_str_index
Definition: find_shared_impl.hpp:107
std::size_t size() const
Definition: sequence_store.hpp:95
Definition: find_shared_impl.hpp:116
Group_count gc_
Definition: find_shared_impl.hpp:186
id_str_index::const_iterator id_str_iterator
Definition: find_shared_impl.hpp:108
unsigned group_count() const
Definition: find_shared_impl.hpp:52
void group_redundancy_histogram(xstats &h) const
Definition: find_shared_impl.hpp:324
Sequence_gstats(Group_count const &gc, const Mapped_id mid)
Definition: find_shared_impl.hpp:146
void write_unique(File_ostream &fos, const unsigned min_duplicates) const
Definition: find_shared_impl.hpp:260
std::string id_str_
Definition: find_shared_impl.hpp:36
void redundancy_histogram(xstats &h) const
Definition: find_shared_impl.hpp:332
unsigned operator[](const Mapped_id mid) const
Definition: find_shared_impl.hpp:57
void store_stats(const unsigned seq_len, const unsigned match_pos, const unsigned match_len, super_seq const &super, Group_count const &gc)
Definition: find_shared_impl.hpp:349
Definition: sequence_position.hpp:15
boost::iterator_range< mid_iterator > mid_range
Definition: find_shared_impl.hpp:112
bool find_shared(const Seq_id sid, Group_count const &gc)
Definition: find_shared_impl.hpp:398
std::vector< unsigned > v_
Definition: find_shared_impl.hpp:74
unsigned find_shared(xstats &xs1, xstats &xs2)
Definition: find_shared_impl.hpp:290
unsigned n_mids_
Definition: find_shared_impl.hpp:345
const std::size_t n
Definition: vector_set_test.cpp:26
store_t::index< mapped_tag >::type mid_index
Definition: find_shared_impl.hpp:110
store_t::index< id_tag >::type id_index
Definition: find_shared_impl.hpp:104
Common_subseq find_longest(const seq_type seq, std::size_t min_len=0) const
Definition: gdst.cpp:85
Definition: exception.hpp:23
seq_map::value_type pair
Definition: find_shared_impl.hpp:199
std::ostream & print(std::ostream &os, const char delim, const Mapped_id mid) const
Definition: find_shared_impl.hpp:166
unsigned char mm5_
Definition: find_shared_impl.hpp:191
Definition: file_stream.hpp:24
Definition: find_shared_impl.hpp:43
bool check_duplicates(std::string const &seq, Group_count const &gc)
Definition: find_shared_impl.hpp:380
Set a fraction of sequence length to match.
Definition: min_match_length.hpp:93
seq_map m_
Definition: find_shared_impl.hpp:342
std::string transform(const boost::string_ref seq, sequence_interval const &si, const bool reverse)
Reverse-complement (optionally) a portion of a sequence.
Definition: sequence_transform.hpp:34
sub_seq_set const & maps_from(const Seq_id sid) const
Definition: sequence_store.hpp:165
Generalized DNA suffix tree.
Definition: gdst.hpp:66
unsigned char mm3_
Definition: find_shared_impl.hpp:192
bool mismatching_end(const unsigned start1, const unsigned len1, const unsigned start2, const unsigned len2, const unsigned match_len)
Definition: common_subsequence.hpp:38
Store sequence and related information.
Definition: sequence_store.hpp:30
Read_ginfo_store const & rgs_
Definition: find_shared_impl.hpp:344
why is it not in STL?
Definition: identity_predicate.hpp:15
void insert(const Seq_id sid)
Definition: gdst.cpp:79