vdj_pipe
pipeline for processing DNA sequence data
sequence_map_parse.hpp
Go to the documentation of this file.
1 
7 #ifndef SEQUENCE_MAP_PARSE_HPP_
8 #define SEQUENCE_MAP_PARSE_HPP_
9 #include <fstream>
10 #include <map>
11 #include <string>
12 #include <sstream>
13 #include <utility>
14 #include <vector>
15 
16 #include "boost/assert.hpp"
17 #include "boost/foreach.hpp"
18 #include "boost/property_tree/ptree.hpp"
19 #include "boost/tokenizer.hpp"
20 namespace bpt = boost::property_tree;
21 
22 #include "sequence_map_types.hpp"
23 #include "vdj_pipe/exception.hpp"
27 
28 namespace vdj_pipe{ namespace detail{
29 
32 inline void parse_sequences(bpt::ptree const& pt, name_seq_vector& nsv) {
33  std::string const& seq = pt.data();
34  if( seq.size() ) {
35  nsv.push_back(match_seq("", seq));
36  return;
37  }
38 
39  BOOST_FOREACH(bpt::ptree::value_type const& vt, pt) {
40  parse_sequences(vt.second, nsv);
41  }
42 }
43 
46 inline void parse_combinations(
47  bpt::ptree const& pt,
48  Input_manager const& in,
49  string_table& st
50 ) {
51  std::string const& fn = pt.data();
52  if( fn.size() ) {
53  const std::string in_path = in.path(fn);
54  std::ifstream ifs(in_path.c_str());
55 
56  typedef base_exception Err;
57  if( ! ifs.good() ) BOOST_THROW_EXCEPTION(
58  Err()
59  << Err::msg_t("cannot read")
60  << Err::str1_t(sanitize(in_path))
61  );
62 
63  std::string line_str;
64  for( int line_n = 1; getline(ifs, line_str); ++line_n ) {
65  if( line_str.empty() || line_str[0] == '#' ) continue;
66  typedef boost::tokenizer<> tokenizer;
67  tokenizer tok(line_str);
68  string_vector sv;
69  std::size_t n = 0;
70  BOOST_FOREACH(std::string const& s, tok) {
71  sv.push_back(s);
72  }
73  if( sv.empty() ) continue;
74  if( n == 0 ) n = sv.size();
75  else {
76  if( n != sv.size() ) BOOST_THROW_EXCEPTION(
77  Err()
78  << Err::msg_t("mismatched number of columns")
79  << Err::str1_t(sanitize(in_path))
80  << Err::int1_t(sv.size())
81  << Err::int2_t(n)
82  << Err::line_t(line_n)
83  );
84  }
85  st.push_back(sv);
86  }
87  return;
88  }
89 
90  BOOST_FOREACH(bpt::ptree::value_type const& vt, pt) {
91  parse_combinations(vt.second, in, st);
92  }
93 }
94 
97 inline void parse_seq_files(
98  bpt::ptree const& pt,
99  Input_manager const& in,
100  name_seq_vector& nsv
101 ) {
102  std::string const& fn = pt.data();
103  if( fn.size() ) {
104  const std::string in_path = in.path(fn);
105  for(Parser_fasta pf((File_input(in_path))); pf.has_next(); pf.next_record()) {
106  const Seq_record sr = pf.get_record();
107  nsv.push_back(match_seq(sr.id_, sr.seq_));
108  }
109  return;
110  }
111 
112  BOOST_FOREACH(bpt::ptree::value_type const& vt, pt) {
113  parse_seq_files(vt.second, in, nsv);
114  }
115 }
116 
120  name_seq_vector const& nsv,
121  string_table const& st
122 ) {
123  name_seq_vector sv2;
124 
125  typedef std::map<std::string, std::size_t> map_t;
126  map_t m;
127  for( std::size_t n = 0; n != nsv.size(); ++n) {
128  m.insert(std::make_pair(nsv[n].first, n));
129  }
130 
131  typedef base_exception Err;
132  BOOST_FOREACH(string_vector const& sv, st) {
133  BOOST_ASSERT(sv.size() > 2);
134  std::ostringstream os;
135  for( std::size_t n = 1; n != sv.size(); ++n ) {
136  std::string const& name = sv[n];
137  map_t::const_iterator i = m.find(name);
138  if( i == m.end() ) BOOST_THROW_EXCEPTION(
139  Err()
140  << Err::msg_t("sequence name not found")
141  << Err::str1_t(sanitize(name))
142  );
143  os << nsv[i->second].second;
144  }
145  sv2.push_back(std::make_pair(sv[0], os.str()));
146  }
147  return sv2;
148 }
149 
150 }//namespace detail
151 }//namespace vdj_pipe
152 #endif /* SEQUENCE_MAP_PARSE_HPP_ */
bool has_next() const
Definition: parser_line.hpp:31
void parse_sequences(bpt::ptree const &pt, name_seq_vector &nsv)
Definition: sequence_map_parse.hpp:32
sequence seq_
Definition: sequence_record.hpp:32
void parse_seq_files(bpt::ptree const &pt, Input_manager const &in, name_seq_vector &nsv)
Definition: sequence_map_parse.hpp:97
name_seq_vector combine(name_seq_vector const &nsv, string_table const &st)
Definition: sequence_map_parse.hpp:119
std::vector< std::string > string_vector
Definition: sequence_map_types.hpp:16
Main namespace of vdj_pipe library.
Definition: keywords_variable.hpp:11
File target is supposed to exist at construction time.
Definition: file.hpp:93
Definition: sequence_record.hpp:28
std::vector< string_vector > string_table
Definition: sequence_map_types.hpp:17
boost::tokenizer< boost::char_separator< char > > tokenizer
Definition: me_factory.cpp:34
static std::string path(std::string const &root, std::string const &fn)
Definition: input_manager.cpp:48
void parse_combinations(bpt::ptree const &pt, Input_manager const &in, string_table &st)
Definition: sequence_map_parse.hpp:46
std::string id_
Definition: sequence_record.hpp:20
Definition: parser_fasta.hpp:16
Definition: input_manager.hpp:22
const std::size_t n
Definition: vector_set_test.cpp:26
Definition: exception.hpp:23
std::string sanitize(const char c)
Definition: sanitize_string.cpp:53
std::pair< std::string, std::string > match_seq
Definition: sequence_map_types.hpp:14
bpt::ptree ptree
Definition: processing_step_utils.hpp:19
std::vector< match_seq > name_seq_vector
Definition: sequence_map_types.hpp:15