vdj_pipe
pipeline for processing DNA sequence data
parser_fastq.hpp
Go to the documentation of this file.
1 
7 #ifndef PARSER_FASTQ_HPP_
8 #define PARSER_FASTQ_HPP_
9 #include <vector>
10 #include "boost/foreach.hpp"
13 
14 namespace vdj_pipe{
15 
20  enum State {Def, Seq, Qual, End};
21 public:
22  typedef Seq_qual_record record;
23  typedef record::sequence sequence;
24  typedef record::quality quality;
25  static const char qual_offset = -33;
26 
27  explicit Parser_fastq(File_input const& fi, const char offset = qual_offset)
28  : detail::Parser_line(fi), offset_(offset), state_(End)
29  {
30  if( fi.format() != format::Fastq ) BOOST_THROW_EXCEPTION(
31  Err()
32  << Err::msg_t("wrong file format for FASTQ parser")
33  << Err::str1_t(sanitize(fi.path()))
34  << Err::int1_t(fi.format())
35  );
36  next_record();
37  }
38 
39  explicit Parser_fastq(
40  std::istream& is,
41  const compression::Compression compr = compression::none,
42  const char offset = qual_offset
43  )
44  : detail::Parser_line(is, compr), offset_(offset), state_(End)
45  {
46  next_record();
47  }
48 
49  void next_record() {
50  if( ! has_next() ) return;
51  switch (state_) {
52  case Def:
53  skip_line();
54  if( ! has_next() ) return;
55  /* no break */
56  case Seq:
57  skip_line();
58  if( ! has_next() ) return;
59  Parser_line::seek_line('+');
60  if( ! has_next() ) return;
61  /* no break */
62  case Qual:
63  skip_line();
64  if( ! has_next() ) return;
65  skip_line();
66  if( ! has_next() ) return;
67  /* no break */
68  case End:
69  state_ = Def;
70  }
71  Parser_line::seek_line('@');
72  }
73 
74  const boost::string_ref get_id() {
75  if( state_ != Def ) next_record();
76  if( ! has_next() ) return "";
77  state_ = Seq;
78  return Parser_line::get_id('@');
79  }
80 
81  const boost::string_ref get_defstr() {
82  if( state_ != Def ) next_record();
83  if( ! has_next() ) return "";
84  state_ = Seq;
85  return Parser_line::get_defstr('@');
86  }
87 
88  sequence get_sequence() {
89  if( ! has_next() ) return "";
90  switch (state_) {
91  case Qual:
92  case End:
93  next_record();
94  if( ! has_next() ) return "";
95  /* no break */
96  case Def:
97  skip_line();
98  if( ! has_next() ) return "";
99  /* no break */
100  case Seq: break;
101  }
102  state_ = Qual;
103  return Parser_line::get_sequence('+');
104  }
105 
106  quality get_qual() {
107  quality q;
108  get_qual(back_inserter(q));
109  return q;
110  }
111 
112  template<class InsertIter> void get_qual(InsertIter i) {
113  if( ! has_next() ) return;
114  switch (state_) {
115  case End:
116  next_record();
117  if( ! has_next() ) return;
118  /* no break */
119  case Def:
120  skip_line();
121  if( ! has_next() ) return;
122  /* no break */
123  case Seq:
124  skip_line();
125  if( ! has_next() ) return;
126  Parser_line::seek_line('+');
127  if( ! has_next() ) return;
128  /* no break */
129  case Qual:
130  skip_line();
131  if( ! has_next() ) return;
132  }
133  getline(fis_.istream(), str_);
134  ++line_;
135  BOOST_FOREACH(const char c, str_) {
136  if( c < '!' || c > '~') BOOST_THROW_EXCEPTION(
137  Err()
138  << Err::msg_t("invalid quality character")
139  << Err::str1_t(sanitize(c))
140  << Err::line_t(line_num() - 1)
141  );
142  i = c + offset_;
143  }
144  state_ = End;
145  }
146 
147  record get_record() {
148  record qr;
149  set_meta(qr, '@');
150  state_ = Seq;
151  qr.seq_ = get_sequence();
152  get_qual(back_inserter(qr.qual_));
153 
154  if( qr.seq_.size() != qr.qual_.size() ) BOOST_THROW_EXCEPTION(
155  Err()
156  << Err::msg_t("sequence-quality size mismatch")
157  << Err::str1_t(sanitize(qr.id_, 60))
158  << Err::int1_t(qr.seq_.size())
159  << Err::int2_t(qr.qual_.size())
160  << Err::line_t(line_num() - 1)
161  );
162  return qr;
163  }
164 
165 private:
166  const char offset_;
167  State state_;
168 };
169 
170 }//namespace vdj_pipe
171 #endif /* PARSER_FASTQ_HPP_ */
Definition: sequence_record.hpp:35
Parser for FASTQ files.
Definition: parser_fastq.hpp:19
Compression
File compression types.
Definition: file_properties.hpp:19
Definition: sequence_record.hpp:77
Definition: parser_line.hpp:26
Main namespace of vdj_pipe library.
Definition: sequence_file.hpp:14
File target is supposed to exist at construction time.
Definition: file.hpp:93
sequence seq_
Definition: sequence_record.hpp:82
std::string id_
Definition: sequence_record.hpp:20
quality qual_
Definition: sequence_record.hpp:85
Basic line-based parser; use to derive other parsers.
Definition: parser_line.hpp:23