SeqAn3 3.1.0-rc.2
The Modern C++ library for sequence analysis.
format_bam.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <seqan3/std/bit>
16#include <iterator>
17#include <seqan3/std/ranges>
18#include <string>
19#include <vector>
20
33
34namespace seqan3
35{
36
47class format_bam : private detail::format_sam_base
48{
49public:
53 // string_buffer is of type std::string and has some problems with pre-C++11 ABI
54 format_bam() = default;
55 format_bam(format_bam const &) = default;
56 format_bam & operator=(format_bam const &) = default;
57 format_bam(format_bam &&) = default;
59 ~format_bam() = default;
60
62
65 {
66 { "bam" }
67 };
68
69protected:
70 template <typename stream_type, // constraints checked by file
71 typename seq_legal_alph_type,
72 typename ref_seqs_type,
73 typename ref_ids_type,
74 typename seq_type,
75 typename id_type,
76 typename offset_type,
77 typename ref_seq_type,
78 typename ref_id_type,
79 typename ref_offset_type,
80 typename align_type,
81 typename cigar_type,
82 typename flag_type,
83 typename mapq_type,
84 typename qual_type,
85 typename mate_type,
86 typename tag_dict_type,
87 typename e_value_type,
88 typename bit_score_type>
89 void read_alignment_record(stream_type & stream,
90 sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
91 ref_seqs_type & ref_seqs,
93 seq_type & seq,
94 qual_type & qual,
95 id_type & id,
96 offset_type & offset,
97 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
98 ref_id_type & ref_id,
99 ref_offset_type & ref_offset,
100 align_type & align,
101 cigar_type & cigar_vector,
102 flag_type & flag,
103 mapq_type & mapq,
104 mate_type & mate,
105 tag_dict_type & tag_dict,
106 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
107 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
108
109 template <typename stream_type,
110 typename header_type,
111 typename seq_type,
112 typename id_type,
113 typename ref_seq_type,
114 typename ref_id_type,
115 typename align_type,
116 typename cigar_type,
117 typename qual_type,
118 typename mate_type,
119 typename tag_dict_type>
120 void write_alignment_record([[maybe_unused]] stream_type & stream,
121 [[maybe_unused]] sam_file_output_options const & options,
122 [[maybe_unused]] header_type && header,
123 [[maybe_unused]] seq_type && seq,
124 [[maybe_unused]] qual_type && qual,
125 [[maybe_unused]] id_type && id,
126 [[maybe_unused]] int32_t const offset,
127 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
128 [[maybe_unused]] ref_id_type && ref_id,
129 [[maybe_unused]] std::optional<int32_t> ref_offset,
130 [[maybe_unused]] align_type && align,
131 [[maybe_unused]] cigar_type && cigar_vector,
132 [[maybe_unused]] sam_flag const flag,
133 [[maybe_unused]] uint8_t const mapq,
134 [[maybe_unused]] mate_type && mate,
135 [[maybe_unused]] tag_dict_type && tag_dict,
136 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
137 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score));
138
139private:
141 bool header_was_read{false};
142
144 std::string string_buffer{};
145
147 struct alignment_record_core
148 { // naming corresponds to official SAM/BAM specifications
149 int32_t block_size;
150 int32_t refID;
151 int32_t pos;
152 uint32_t l_read_name:8;
153 uint32_t mapq:8;
154 uint32_t bin:16;
155 uint32_t n_cigar_op:16;
156 sam_flag flag;
157 int32_t l_seq;
158 int32_t next_refID;
159 int32_t next_pos;
160 int32_t tlen;
161 };
162
164 static constexpr std::array<uint8_t, 256> char_to_sam_rank
165 {
166 [] () constexpr
167 {
169
170 using index_t = std::make_unsigned_t<char>;
171
172 // ret['M'] = 0; set anyway by initialization
173 ret[static_cast<index_t>('I')] = 1;
174 ret[static_cast<index_t>('D')] = 2;
175 ret[static_cast<index_t>('N')] = 3;
176 ret[static_cast<index_t>('S')] = 4;
177 ret[static_cast<index_t>('H')] = 5;
178 ret[static_cast<index_t>('P')] = 6;
179 ret[static_cast<index_t>('=')] = 7;
180 ret[static_cast<index_t>('X')] = 8;
181
182 return ret;
183 }()
184 };
185
187 static uint16_t reg2bin(int32_t beg, int32_t end) noexcept
188 {
189 --end;
190 if (beg >> 14 == end >> 14) return ((1 << 15) - 1) / 7 + (beg >> 14);
191 if (beg >> 17 == end >> 17) return ((1 << 12) - 1) / 7 + (beg >> 17);
192 if (beg >> 20 == end >> 20) return ((1 << 9) - 1) / 7 + (beg >> 20);
193 if (beg >> 23 == end >> 23) return ((1 << 6) - 1) / 7 + (beg >> 23);
194 if (beg >> 26 == end >> 26) return ((1 << 3) - 1) / 7 + (beg >> 26);
195 return 0;
196 }
197
204 template <typename stream_view_type, std::integral number_type>
205 void read_integral_byte_field(stream_view_type && stream_view, number_type & target)
206 {
207 std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(target), reinterpret_cast<char *>(&target));
208 }
209
215 template <typename stream_view_type>
216 void read_float_byte_field(stream_view_type && stream_view, float & target)
217 {
218 std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(int32_t), reinterpret_cast<char *>(&target));
219 }
220
221 template <typename stream_view_type, typename value_type>
222 void read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
223 stream_view_type && stream_view,
224 value_type const & SEQAN3_DOXYGEN_ONLY(value));
225
226 template <typename stream_view_type>
227 void read_sam_dict_field(stream_view_type && stream_view, sam_tag_dictionary & target);
228
229 template <typename cigar_input_type>
230 auto parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op) const;
231
232 static std::string get_tag_dict_str(sam_tag_dictionary const & tag_dict);
233};
234
236template <typename stream_type, // constraints checked by file
237 typename seq_legal_alph_type,
238 typename ref_seqs_type,
239 typename ref_ids_type,
240 typename seq_type,
241 typename id_type,
242 typename offset_type,
243 typename ref_seq_type,
244 typename ref_id_type,
245 typename ref_offset_type,
246 typename align_type,
247 typename cigar_type,
248 typename flag_type,
249 typename mapq_type,
250 typename qual_type,
251 typename mate_type,
252 typename tag_dict_type,
253 typename e_value_type,
254 typename bit_score_type>
255inline void format_bam::read_alignment_record(stream_type & stream,
256 sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
257 ref_seqs_type & ref_seqs,
259 seq_type & seq,
260 qual_type & qual,
261 id_type & id,
262 offset_type & offset,
263 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
264 ref_id_type & ref_id,
265 ref_offset_type & ref_offset,
266 align_type & align,
267 cigar_type & cigar_vector,
268 flag_type & flag,
269 mapq_type & mapq,
270 mate_type & mate,
271 tag_dict_type & tag_dict,
272 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
273 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
274{
275 static_assert(detail::decays_to_ignore_v<ref_offset_type> ||
276 detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
277 "The ref_offset must be a specialisation of std::optional.");
278
279 static_assert(detail::decays_to_ignore_v<mapq_type> || std::same_as<mapq_type, uint8_t>,
280 "The type of field::mapq must be uint8_t.");
281
282 static_assert(detail::decays_to_ignore_v<flag_type> || std::same_as<flag_type, sam_flag>,
283 "The type of field::flag must be seqan3::sam_flag.");
284
285 auto stream_view = seqan3::detail::istreambuf(stream);
286
287 // these variables need to be stored to compute the ALIGNMENT
288 [[maybe_unused]] int32_t offset_tmp{};
289 [[maybe_unused]] int32_t soft_clipping_end{};
290 [[maybe_unused]] std::vector<cigar> tmp_cigar_vector{};
291 [[maybe_unused]] int32_t ref_length{0}, seq_length{0}; // length of aligned part for ref and query
292
293 // Header
294 // -------------------------------------------------------------------------------------------------------------
295 if (!header_was_read)
296 {
297 // magic BAM string
298 if (!std::ranges::equal(stream_view | detail::take_exactly_or_throw(4), std::string_view{"BAM\1"}))
299 throw format_error{"File is not in BAM format."};
300
301 int32_t l_text{}; // length of header text including \0 character
302 int32_t n_ref{}; // number of reference sequences
303 int32_t l_name{}; // 1 + length of reference name including \0 character
304 int32_t l_ref{}; // length of reference sequence
305
306 read_integral_byte_field(stream_view, l_text);
307
308 if (l_text > 0) // header text is present
309 read_header(stream_view | detail::take_exactly_or_throw(l_text), header, ref_seqs);
310
311 read_integral_byte_field(stream_view, n_ref);
312
313 for (int32_t ref_idx = 0; ref_idx < n_ref; ++ref_idx)
314 {
315 read_integral_byte_field(stream_view, l_name);
316
317 string_buffer.resize(l_name - 1);
318 std::ranges::copy_n(std::ranges::begin(stream_view), l_name - 1, string_buffer.data()); // copy without \0 character
319 std::ranges::next(std::ranges::begin(stream_view)); // skip \0 character
320
321 read_integral_byte_field(stream_view, l_ref);
322
323 if constexpr (detail::decays_to_ignore_v<ref_seqs_type>) // no reference information given
324 {
325 // If there was no header text, we parse reference sequences block as header information
326 if (l_text == 0)
327 {
328 auto & reference_ids = header.ref_ids();
329 // put the length of the reference sequence into ref_id_info
330 header.ref_id_info.emplace_back(l_ref, "");
331 // put the reference name into reference_ids
332 reference_ids.push_back(string_buffer);
333 // assign the reference name an ascending reference id (starts at index 0).
334 header.ref_dict.emplace(reference_ids.back(), reference_ids.size() - 1);
335 continue;
336 }
337 }
338
339 auto id_it = header.ref_dict.find(string_buffer);
340
341 // sanity checks of reference information to existing header object:
342 if (id_it == header.ref_dict.end()) // [unlikely]
343 {
344 throw format_error{detail::to_string("Unknown reference name '" + string_buffer +
345 "' found in BAM file header (header.ref_ids():",
346 header.ref_ids(), ").")};
347 }
348 else if (id_it->second != ref_idx) // [unlikely]
349 {
350 throw format_error{detail::to_string("Reference id '", string_buffer, "' at position ", ref_idx,
351 " does not correspond to the position ", id_it->second,
352 " in the header (header.ref_ids():", header.ref_ids(), ").")};
353 }
354 else if (std::get<0>(header.ref_id_info[id_it->second]) != l_ref) // [unlikely]
355 {
356 throw format_error{"Provided reference has unequal length as specified in the header."};
357 }
358 }
359
360 header_was_read = true;
361
362 if (std::ranges::begin(stream_view) == std::ranges::end(stream_view)) // no records follow
363 return;
364 }
365
366 // read alignment record into buffer
367 // -------------------------------------------------------------------------------------------------------------
368 alignment_record_core core;
369 std::ranges::copy(stream_view | detail::take_exactly_or_throw(sizeof(core)), reinterpret_cast<char *>(&core));
370
371 if (core.refID >= static_cast<int32_t>(header.ref_ids().size()) || core.refID < -1) // [[unlikely]]
372 {
373 throw format_error{detail::to_string("Reference id index '", core.refID, "' is not in range of ",
374 "header.ref_ids(), which has size ", header.ref_ids().size(), ".")};
375 }
376 else if (core.refID > -1) // not unmapped
377 {
378 ref_id = core.refID; // field::ref_id
379 }
380
381 flag = core.flag; // field::flag
382 mapq = core.mapq; // field::mapq
383
384 if (core.pos > -1) // [[likely]]
385 ref_offset = core.pos; // field::ref_offset
386
387 if constexpr (!detail::decays_to_ignore_v<mate_type>) // field::mate
388 {
389 if (core.next_refID > -1)
390 get<0>(mate) = core.next_refID;
391
392 if (core.next_pos > -1) // [[likely]]
393 get<1>(mate) = core.next_pos;
394
395 get<2>(mate) = core.tlen;
396 }
397
398 // read id
399 // -------------------------------------------------------------------------------------------------------------
400 auto id_view = stream_view | detail::take_exactly_or_throw(core.l_read_name - 1);
401 if constexpr (!detail::decays_to_ignore_v<id_type>)
402 read_forward_range_field(id_view, id); // field::id
403 else
404 detail::consume(id_view);
405 std::ranges::next(std::ranges::begin(stream_view)); // skip '\0'
406
407 // read cigar string
408 // -------------------------------------------------------------------------------------------------------------
409 if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
410 {
411 std::tie(tmp_cigar_vector, ref_length, seq_length) = parse_binary_cigar(stream_view, core.n_cigar_op);
412 transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
413 // the actual cigar_vector is swapped with tmp_cigar_vector at the end to avoid copying
414 }
415 else
416 {
417 detail::consume(stream_view | detail::take_exactly_or_throw(core.n_cigar_op * 4));
418 }
419
420 offset = offset_tmp;
421
422 // read sequence
423 // -------------------------------------------------------------------------------------------------------------
424 if (core.l_seq > 0) // sequence information is given
425 {
426 auto seq_stream = stream_view
427 | detail::take_exactly_or_throw(core.l_seq / 2) // one too short if uneven
429 {
430 return {dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(c) >> 4)),
431 dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(c) & 0x0f))};
432 });
433
434 if constexpr (detail::decays_to_ignore_v<seq_type>)
435 {
436 auto skip_sequence_bytes = [&] ()
437 {
438 detail::consume(seq_stream);
439 if (core.l_seq & 1)
440 std::ranges::next(std::ranges::begin(stream_view));
441 };
442
443 if constexpr (!detail::decays_to_ignore_v<align_type>)
444 {
446 "If you want to read ALIGNMENT but not SEQ, the alignment"
447 " object must store a sequence container at the second (query) position.");
448
449 if (!tmp_cigar_vector.empty()) // only parse alignment if cigar information was given
450 {
451 assert(core.l_seq == (seq_length + offset_tmp + soft_clipping_end)); // sanity check
452 using alph_t = std::ranges::range_value_t<decltype(get<1>(align))>;
453 constexpr auto from_dna16 = detail::convert_through_char_representation<alph_t, dna16sam>;
454
455 get<1>(align).reserve(seq_length);
456
457 auto tmp_iter = std::ranges::begin(seq_stream);
458 std::ranges::advance(tmp_iter, offset_tmp / 2); // skip soft clipped bases at the beginning
459
460 if (offset_tmp & 1)
461 {
462 get<1>(align).push_back(from_dna16[to_rank(get<1>(*tmp_iter))]);
463 ++tmp_iter;
464 --seq_length;
465 }
466
467 for (size_t i = (seq_length / 2); i > 0; --i)
468 {
469 get<1>(align).push_back(from_dna16[to_rank(get<0>(*tmp_iter))]);
470 get<1>(align).push_back(from_dna16[to_rank(get<1>(*tmp_iter))]);
471 ++tmp_iter;
472 }
473
474 if (seq_length & 1)
475 {
476 get<1>(align).push_back(from_dna16[to_rank(get<0>(*tmp_iter))]);
477 ++tmp_iter;
478 --soft_clipping_end;
479 }
480
481 std::ranges::advance(tmp_iter, (soft_clipping_end + !(seq_length & 1)) / 2);
482 }
483 else
484 {
485 skip_sequence_bytes();
486 get<1>(align) = std::remove_reference_t<decltype(get<1>(align))>{}; // assign empty container
487 }
488 }
489 else
490 {
491 skip_sequence_bytes();
492 }
493 }
494 else
495 {
496 using alph_t = std::ranges::range_value_t<decltype(seq)>;
497 constexpr auto from_dna16 = detail::convert_through_char_representation<alph_t, dna16sam>;
498
499 for (auto [d1, d2] : seq_stream)
500 {
501 seq.push_back(from_dna16[to_rank(d1)]);
502 seq.push_back(from_dna16[to_rank(d2)]);
503 }
504
505 if (core.l_seq & 1)
506 {
507 dna16sam d = dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(*std::ranges::begin(stream_view)) >> 4));
508 seq.push_back(from_dna16[to_rank(d)]);
509 std::ranges::next(std::ranges::begin(stream_view));
510 }
511
512 if constexpr (!detail::decays_to_ignore_v<align_type>)
513 {
514 assign_unaligned(get<1>(align),
515 seq | views::slice(static_cast<std::ranges::range_difference_t<seq_type>>(offset_tmp),
516 std::ranges::distance(seq) - soft_clipping_end));
517 }
518 }
519 }
520
521 // read qual string
522 // -------------------------------------------------------------------------------------------------------------
523 auto qual_view = stream_view | detail::take_exactly_or_throw(core.l_seq)
524 | std::views::transform([] (char chr)
525 {
526 return static_cast<char>(chr + 33);
527 });
528 if constexpr (!detail::decays_to_ignore_v<qual_type>)
529 read_forward_range_field(qual_view, qual);
530 else
531 detail::consume(qual_view);
532
533 // All remaining optional fields if any: SAM tags dictionary
534 // -------------------------------------------------------------------------------------------------------------
535 int32_t remaining_bytes = core.block_size - (sizeof(alignment_record_core) - 4/*block_size excluded*/) -
536 core.l_read_name - core.n_cigar_op * 4 - (core.l_seq + 1) / 2 - core.l_seq;
537 assert(remaining_bytes >= 0);
538 auto tags_view = stream_view | detail::take_exactly_or_throw(remaining_bytes);
539
540 while (tags_view.size() > 0)
541 {
542 if constexpr (!detail::decays_to_ignore_v<tag_dict_type>)
543 read_sam_dict_field(tags_view, tag_dict);
544 else
545 detail::consume(tags_view);
546 }
547
548 // DONE READING - wrap up
549 // -------------------------------------------------------------------------------------------------------------
550 if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
551 {
552 // Check cigar, if it matches ‘kSmN’, where ‘k’ equals lseq, ‘m’ is the reference sequence length in the
553 // alignment, and ‘S’ and ‘N’ are the soft-clipping and reference-clip, then the cigar string was larger
554 // than 65535 operations and is stored in the sam_tag_dictionary (tag GC).
555 if (core.l_seq != 0 && offset_tmp == core.l_seq)
556 {
557 if constexpr (detail::decays_to_ignore_v<tag_dict_type> | detail::decays_to_ignore_v<seq_type>)
558 { // maybe only throw in debug mode and otherwise return an empty alignment?
559 throw format_error{detail::to_string("The cigar string '", offset_tmp, "S", ref_length,
560 "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
561 "stored in the optional field CG. You need to read in the field::tags and "
562 "field::seq in order to access this information.")};
563 }
564 else
565 {
566 auto it = tag_dict.find("CG"_tag);
567
568 if (it == tag_dict.end())
569 throw format_error{detail::to_string("The cigar string '", offset_tmp, "S", ref_length,
570 "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
571 "stored in the optional field CG but this tag is not present in the given ",
572 "record.")};
573
574 auto cigar_view = std::views::all(std::get<std::string>(it->second));
575 std::tie(tmp_cigar_vector, ref_length, seq_length) = detail::parse_cigar(cigar_view);
576 offset_tmp = soft_clipping_end = 0;
577 transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
578 tag_dict.erase(it); // remove redundant information
579
580 if constexpr (!detail::decays_to_ignore_v<align_type>)
581 {
582 assign_unaligned(get<1>(align),
583 seq | views::slice(static_cast<std::ranges::range_difference_t<seq_type>>(offset_tmp),
584 std::ranges::distance(seq) - soft_clipping_end));
585 }
586 }
587 }
588 }
589
590 // Alignment object construction
591 if constexpr (!detail::decays_to_ignore_v<align_type>)
592 construct_alignment(align, tmp_cigar_vector, core.refID, ref_seqs, core.pos, ref_length); // inherited from SAM
593
594 if constexpr (!detail::decays_to_ignore_v<cigar_type>)
595 std::swap(cigar_vector, tmp_cigar_vector);
596}
597
599template <typename stream_type,
600 typename header_type,
601 typename seq_type,
602 typename id_type,
603 typename ref_seq_type,
604 typename ref_id_type,
605 typename align_type,
606 typename cigar_type,
607 typename qual_type,
608 typename mate_type,
609 typename tag_dict_type>
610inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & stream,
611 [[maybe_unused]] sam_file_output_options const & options,
612 [[maybe_unused]] header_type && header,
613 [[maybe_unused]] seq_type && seq,
614 [[maybe_unused]] qual_type && qual,
615 [[maybe_unused]] id_type && id,
616 [[maybe_unused]] int32_t const offset,
617 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
618 [[maybe_unused]] ref_id_type && ref_id,
619 [[maybe_unused]] std::optional<int32_t> ref_offset,
620 [[maybe_unused]] align_type && align,
621 [[maybe_unused]] cigar_type && cigar_vector,
622 [[maybe_unused]] sam_flag const flag,
623 [[maybe_unused]] uint8_t const mapq,
624 [[maybe_unused]] mate_type && mate,
625 [[maybe_unused]] tag_dict_type && tag_dict,
626 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
627 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score))
628{
629 // ---------------------------------------------------------------------
630 // Type Requirements (as static asserts for user friendliness)
631 // ---------------------------------------------------------------------
632 static_assert((std::ranges::forward_range<seq_type> &&
634 "The seq object must be a std::ranges::forward_range over "
635 "letters that model seqan3::alphabet.");
636
637 static_assert((std::ranges::forward_range<id_type> &&
639 "The id object must be a std::ranges::forward_range over "
640 "letters that model seqan3::alphabet.");
641
642 static_assert((std::ranges::forward_range<ref_seq_type> &&
644 "The ref_seq object must be a std::ranges::forward_range "
645 "over letters that model seqan3::alphabet.");
646
647 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
648 {
649 static_assert((std::ranges::forward_range<ref_id_type> ||
650 std::integral<std::remove_reference_t<ref_id_type>> ||
651 detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>),
652 "The ref_id object must be a std::ranges::forward_range "
653 "over letters that model seqan3::alphabet or an integral or a std::optional<integral>.");
654 }
655
657 "The align object must be a std::pair of two ranges whose "
658 "value_type is comparable to seqan3::gap");
659
660 static_assert((std::tuple_size_v<std::remove_cvref_t<align_type>> == 2 &&
661 std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<0>(align))>> &&
662 std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<1>(align))>>),
663 "The align object must be a std::pair of two ranges whose "
664 "value_type is comparable to seqan3::gap");
665
666 static_assert((std::ranges::forward_range<qual_type> &&
668 "The qual object must be a std::ranges::forward_range "
669 "over letters that model seqan3::alphabet.");
670
672 "The mate object must be a std::tuple of size 3 with "
673 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
674 "2) a std::integral or std::optional<std::integral>, and "
675 "3) a std::integral.");
676
677 static_assert(((std::ranges::forward_range<decltype(std::get<0>(mate))> ||
678 std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>> ||
679 detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<0>(mate))>, std::optional>) &&
680 (std::integral<std::remove_cvref_t<decltype(std::get<1>(mate))>> ||
681 detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<1>(mate))>, std::optional>) &&
682 std::integral<std::remove_cvref_t<decltype(std::get<2>(mate))>>),
683 "The mate object must be a std::tuple of size 3 with "
684 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
685 "2) a std::integral or std::optional<std::integral>, and "
686 "3) a std::integral.");
687
688 static_assert(std::same_as<std::remove_cvref_t<tag_dict_type>, sam_tag_dictionary>,
689 "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
690
691 if constexpr (detail::decays_to_ignore_v<header_type>)
692 {
693 throw format_error{"BAM can only be written with a header but you did not provide enough information! "
694 "You can either construct the output file with ref_ids and ref_seqs information and "
695 "the header will be created for you, or you can access the `header` member directly."};
696 }
697 else
698 {
699 // ---------------------------------------------------------------------
700 // logical Requirements
701 // ---------------------------------------------------------------------
702
703 if (ref_offset.has_value() && (ref_offset.value() + 1) < 0)
704 throw format_error{detail::to_string("The ref_offset object must be >= -1 but is: ", ref_offset)};
705
706 detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
707
708 // ---------------------------------------------------------------------
709 // Writing the BAM Header on first call
710 // ---------------------------------------------------------------------
711 if (!header_was_written)
712 {
713 stream << "BAM\1";
715 write_header(os, options, header); // write SAM header to temporary stream to query the size.
716 int32_t l_text{static_cast<int32_t>(os.str().size())};
717 std::ranges::copy_n(reinterpret_cast<char *>(&l_text), 4, stream_it); // write read id
718
719 stream << os.str();
720
721 int32_t n_ref{static_cast<int32_t>(header.ref_ids().size())};
722 std::ranges::copy_n(reinterpret_cast<char *>(&n_ref), 4, stream_it); // write read id
723
724 for (int32_t ridx = 0; ridx < n_ref; ++ridx)
725 {
726 int32_t l_name{static_cast<int32_t>(header.ref_ids()[ridx].size()) + 1}; // plus null character
727 std::ranges::copy_n(reinterpret_cast<char *>(&l_name), 4, stream_it); // write l_name
728 // write reference name:
729 std::ranges::copy(header.ref_ids()[ridx].begin(), header.ref_ids()[ridx].end(), stream_it);
730 stream_it = '\0';
731 // write reference sequence length:
732 std::ranges::copy_n(reinterpret_cast<char *>(&get<0>(header.ref_id_info[ridx])), 4, stream_it);
733 }
734
735 header_was_written = true;
736 }
737
738 // ---------------------------------------------------------------------
739 // Writing the Record
740 // ---------------------------------------------------------------------
741 int32_t ref_length{};
742
743 // if alignment is non-empty, replace cigar_vector.
744 // else, compute the ref_length from given cigar_vector which is needed to fill field `bin`.
745 if (!std::ranges::empty(cigar_vector))
746 {
747 int32_t dummy_seq_length{};
748 for (auto & [count, operation] : cigar_vector)
749 detail::update_alignment_lengths(ref_length, dummy_seq_length, operation.to_char(), count);
750 }
751 else if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align)))
752 {
753 ref_length = std::ranges::distance(get<1>(align));
754
755 // compute possible distance from alignment end to sequence end
756 // which indicates soft clipping at the end.
757 // This should be replaced by a free count_gaps function for
758 // aligned sequences which is more efficient if possible.
759 int32_t off_end{static_cast<int32_t>(std::ranges::distance(seq)) - offset};
760
761 for (auto chr : get<1>(align))
762 if (chr == gap{})
763 ++off_end;
764
765 off_end -= ref_length;
766 cigar_vector = detail::get_cigar_vector(align, offset, off_end);
767 }
768
769 if (cigar_vector.size() >= (1 << 16)) // must be written into the sam tag CG
770 {
771 tag_dict["CG"_tag] = detail::get_cigar_string(cigar_vector);
772 cigar_vector.resize(2);
773 cigar_vector[0] = cigar{static_cast<uint32_t>(std::ranges::distance(seq)), 'S'_cigar_operation};
774 cigar_vector[1] = cigar{static_cast<uint32_t>(std::ranges::distance(get<1>(align))), 'N'_cigar_operation};
775 }
776
777 std::string tag_dict_binary_str = get_tag_dict_str(tag_dict);
778
779 // Compute the value for the l_read_name field for the bam record.
780 // This value is stored including a trailing `0`, so at most 254 characters of the id can be stored, since
781 // the data type to store the value is uint8_t and 255 is the maximal size.
782 // If the id is empty a '*' is written instead, i.e. the written id is never an empty string and stores at least
783 // 2 bytes.
784 uint8_t read_name_size = std::min<uint8_t>(std::ranges::distance(id), 254) + 1;
785 read_name_size += static_cast<uint8_t>(read_name_size == 1); // need size two since empty id is stored as '*'.
786
787 alignment_record_core core
788 {
789 /* block_size */ 0, // will be initialised right after
790 /* refID */ -1, // will be initialised right after
791 /* pos */ ref_offset.value_or(-1),
792 /* l_read_name */ read_name_size,
793 /* mapq */ mapq,
794 /* bin */ reg2bin(ref_offset.value_or(-1), ref_length),
795 /* n_cigar_op */ static_cast<uint16_t>(cigar_vector.size()),
796 /* flag */ flag,
797 /* l_seq */ static_cast<int32_t>(std::ranges::distance(seq)),
798 /* next_refId */ -1, // will be initialised right after
799 /* next_pos */ get<1>(mate).value_or(-1),
800 /* tlen */ get<2>(mate)
801 };
802
803 auto check_and_assign_id_to = [&header] ([[maybe_unused]] auto & id_source,
804 [[maybe_unused]] auto & id_target)
805 {
806 using id_t = std::remove_reference_t<decltype(id_source)>;
807
808 if constexpr (!detail::decays_to_ignore_v<id_t>)
809 {
810 if constexpr (std::integral<id_t>)
811 {
812 id_target = id_source;
813 }
814 else if constexpr (detail::is_type_specialisation_of_v<id_t, std::optional>)
815 {
816 id_target = id_source.value_or(-1);
817 }
818 else
819 {
820 if (!std::ranges::empty(id_source)) // otherwise default will remain (-1)
821 {
822 auto id_it = header.ref_dict.end();
823
824 if constexpr (std::ranges::contiguous_range<decltype(id_source)> &&
825 std::ranges::sized_range<decltype(id_source)> &&
826 std::ranges::borrowed_range<decltype(id_source)>)
827 {
828 id_it = header.ref_dict.find(std::span{std::ranges::data(id_source),
829 std::ranges::size(id_source)});
830 }
831 else
832 {
833 using header_ref_id_type = std::remove_reference_t<decltype(header.ref_ids()[0])>;
834
835 static_assert(implicitly_convertible_to<decltype(id_source), header_ref_id_type>,
836 "The ref_id type is not convertible to the reference id information stored in the "
837 "reference dictionary of the header object.");
838
839 id_it = header.ref_dict.find(id_source);
840 }
841
842 if (id_it == header.ref_dict.end())
843 {
844 throw format_error{detail::to_string("Unknown reference name '", id_source, "' could "
845 "not be found in BAM header ref_dict: ",
846 header.ref_dict, ".")};
847 }
848
849 id_target = id_it->second;
850 }
851 }
852 }
853 };
854
855 // initialise core.refID
856 check_and_assign_id_to(ref_id, core.refID);
857
858 // initialise core.next_refID
859 check_and_assign_id_to(get<0>(mate), core.next_refID);
860
861 // initialise core.block_size
862 core.block_size = sizeof(core) - 4/*block_size excluded*/ +
863 core.l_read_name +
864 core.n_cigar_op * 4 + // each int32_t has 4 bytes
865 (core.l_seq + 1) / 2 + // bitcompressed seq
866 core.l_seq + // quality string
867 tag_dict_binary_str.size();
868
869 std::ranges::copy_n(reinterpret_cast<char *>(&core), sizeof(core), stream_it); // write core
870
871 if (std::ranges::empty(id)) // empty id is represented as * for backward compatibility
872 stream_it = '*';
873 else
874 std::ranges::copy_n(std::ranges::begin(id), core.l_read_name - 1, stream_it); // write read id
875 stream_it = '\0';
876
877 // write cigar
878 for (auto [cigar_count, op] : cigar_vector)
879 {
880 cigar_count = cigar_count << 4;
881 cigar_count |= static_cast<int32_t>(char_to_sam_rank[op.to_char()]);
882 std::ranges::copy_n(reinterpret_cast<char *>(&cigar_count), 4, stream_it);
883 }
884
885 // write seq (bit-compressed: dna16sam characters go into one byte)
886 using alph_t = std::ranges::range_value_t<seq_type>;
887 constexpr auto to_dna16 = detail::convert_through_char_representation<dna16sam, alph_t>;
888
889 auto sit = std::ranges::begin(seq);
890 for (int32_t sidx = 0; sidx < ((core.l_seq & 1) ? core.l_seq - 1 : core.l_seq); ++sidx, ++sit)
891 {
892 uint8_t compressed_chr = to_rank(to_dna16[to_rank(*sit)]) << 4;
893 ++sidx, ++sit;
894 compressed_chr |= to_rank(to_dna16[to_rank(*sit)]);
895 stream_it = static_cast<char>(compressed_chr);
896 }
897
898 if (core.l_seq & 1) // write one more
899 stream_it = static_cast<char>(to_rank(to_dna16[to_rank(*sit)]) << 4);
900
901 // write qual
902 if (std::ranges::empty(qual))
903 {
904 auto v = views::repeat_n(static_cast<char>(255), core.l_seq);
905 std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
906 }
907 else
908 {
909 if (std::ranges::distance(qual) != core.l_seq)
910 throw format_error{detail::to_string("Expected quality of same length as sequence with size ",
911 core.l_seq, ". Got quality with size ",
912 std::ranges::distance(qual), " instead.")};
913
914 auto v = qual | std::views::transform([] (auto chr) { return static_cast<char>(to_rank(chr)); });
915 std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
916 }
917
918 // write optional fields
919 stream << tag_dict_binary_str;
920 } // if constexpr (!detail::decays_to_ignore_v<header_type>)
921}
922
924template <typename stream_view_type, typename value_type>
925inline void format_bam::read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
926 stream_view_type && stream_view,
927 value_type const & SEQAN3_DOXYGEN_ONLY(value))
928{
929 int32_t count;
930 read_integral_byte_field(stream_view, count); // read length of vector
931 std::vector<value_type> tmp_vector;
932 tmp_vector.reserve(count);
933
934 while (count > 0)
935 {
936 value_type tmp{};
937 if constexpr(std::integral<value_type>)
938 {
939 read_integral_byte_field(stream_view, tmp);
940 }
941 else if constexpr(std::same_as<value_type, float>)
942 {
943 read_float_byte_field(stream_view, tmp);
944 }
945 else
946 {
947 constexpr bool always_false = std::is_same_v<value_type, void>;
948 static_assert(always_false, "format_bam::read_sam_dict_vector: unsupported value_type");
949 }
950 tmp_vector.push_back(std::move(tmp));
951 --count;
952 }
953 variant = std::move(tmp_vector);
954}
955
973template <typename stream_view_type>
974inline void format_bam::read_sam_dict_field(stream_view_type && stream_view, sam_tag_dictionary & target)
975{
976 /* Every BA< tag has the format "[TAG][TYPE_ID][VALUE]", where TAG is a two letter
977 name tag which is converted to a unique integer identifier and TYPE_ID is one character in [A,i,Z,H,B,f]
978 describing the type for the upcoming VALUES. If TYPE_ID=='B' it signals an array of
979 VALUE's and the inner value type is identified by the next character, one of [cCsSiIf], followed
980 by the length (int32_t) of the array, followed by the values.
981 */
982 auto it = std::ranges::begin(stream_view);
983 uint16_t tag = static_cast<uint16_t>(*it) << 8;
984 ++it; // skip char read before
985
986 tag += static_cast<uint16_t>(*it);
987 ++it; // skip char read before
988
989 char type_id = *it;
990 ++it; // skip char read before
991
992 switch (type_id)
993 {
994 case 'A' : // char
995 {
996 target[tag] = *it;
997 ++it; // skip char that has been read
998 break;
999 }
1000 // all integer sizes are possible
1001 case 'c' : // int8_t
1002 {
1003 int8_t tmp;
1004 read_integral_byte_field(stream_view, tmp);
1005 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1006 break;
1007 }
1008 case 'C' : // uint8_t
1009 {
1010 uint8_t tmp;
1011 read_integral_byte_field(stream_view, tmp);
1012 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1013 break;
1014 }
1015 case 's' : // int16_t
1016 {
1017 int16_t tmp;
1018 read_integral_byte_field(stream_view, tmp);
1019 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1020 break;
1021 }
1022 case 'S' : // uint16_t
1023 {
1024 uint16_t tmp;
1025 read_integral_byte_field(stream_view, tmp);
1026 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1027 break;
1028 }
1029 case 'i' : // int32_t
1030 {
1031 int32_t tmp;
1032 read_integral_byte_field(stream_view, tmp);
1033 target[tag] = std::move(tmp); // readable sam format only allows int32_t
1034 break;
1035 }
1036 case 'I' : // uint32_t
1037 {
1038 uint32_t tmp;
1039 read_integral_byte_field(stream_view, tmp);
1040 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1041 break;
1042 }
1043 case 'f' : // float
1044 {
1045 float tmp;
1046 read_float_byte_field(stream_view, tmp);
1047 target[tag] = tmp;
1048 break;
1049 }
1050 case 'Z' : // string
1051 {
1052 string_buffer.clear();
1053 while (!is_char<'\0'>(*it))
1054 {
1055 string_buffer.push_back(*it);
1056 ++it;
1057 }
1058 ++it; // skip \0
1059 target[tag] = string_buffer;
1060 break;
1061 }
1062 case 'H' : // byte array, represented as null-terminated string; specification requires even number of bytes
1063 {
1064 std::vector<std::byte> byte_array;
1065 std::byte value;
1066 while (!is_char<'\0'>(*it))
1067 {
1068 string_buffer.clear();
1069 string_buffer.push_back(*it);
1070 ++it;
1071
1072 if (*it == '\0')
1073 throw format_error{"Hexadecimal tag has an uneven number of digits!"};
1074
1075 string_buffer.push_back(*it);
1076 ++it;
1077 read_byte_field(string_buffer, value);
1078 byte_array.push_back(value);
1079 }
1080 ++it; // skip \0
1081 target[tag] = byte_array;
1082 break;
1083 }
1084 case 'B' : // Array. Value type depends on second char [cCsSiIf]
1085 {
1086 char array_value_type_id = *it;
1087 ++it; // skip char read before
1088
1089 switch (array_value_type_id)
1090 {
1091 case 'c' : // int8_t
1092 read_sam_dict_vector(target[tag], stream_view, int8_t{});
1093 break;
1094 case 'C' : // uint8_t
1095 read_sam_dict_vector(target[tag], stream_view, uint8_t{});
1096 break;
1097 case 's' : // int16_t
1098 read_sam_dict_vector(target[tag], stream_view, int16_t{});
1099 break;
1100 case 'S' : // uint16_t
1101 read_sam_dict_vector(target[tag], stream_view, uint16_t{});
1102 break;
1103 case 'i' : // int32_t
1104 read_sam_dict_vector(target[tag], stream_view, int32_t{});
1105 break;
1106 case 'I' : // uint32_t
1107 read_sam_dict_vector(target[tag], stream_view, uint32_t{});
1108 break;
1109 case 'f' : // float
1110 read_sam_dict_vector(target[tag], stream_view, float{});
1111 break;
1112 default:
1113 throw format_error{detail::to_string("The first character in the numerical id of a SAM tag ",
1114 "must be one of [cCsSiIf] but '", array_value_type_id, "' was given.")};
1115 }
1116 break;
1117 }
1118 default:
1119 throw format_error{detail::to_string("The second character in the numerical id of a "
1120 "SAM tag must be one of [A,i,Z,H,B,f] but '", type_id, "' was given.")};
1121 }
1122}
1123
1138template <typename cigar_input_type>
1139inline auto format_bam::parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op) const
1140{
1141 std::vector<cigar> operations{};
1142 char operation{'\0'};
1143 uint32_t count{};
1144 int32_t ref_length{}, seq_length{};
1145 uint32_t operation_and_count{}; // in BAM operation and count values are stored within one 32 bit integer
1146 constexpr char const * cigar_mapping = "MIDNSHP=X*******";
1147 constexpr uint32_t cigar_mask = 0x0f; // 0000000000001111
1148
1149 if (n_cigar_op == 0) // [[unlikely]]
1150 return std::tuple{operations, ref_length, seq_length};
1151
1152 // parse the rest of the cigar
1153 // -------------------------------------------------------------------------------------------------------------
1154 while (n_cigar_op > 0) // until stream is not empty
1155 {
1156 std::ranges::copy_n(std::ranges::begin(cigar_input),
1157 sizeof(operation_and_count),
1158 reinterpret_cast<char*>(&operation_and_count));
1159 operation = cigar_mapping[operation_and_count & cigar_mask];
1160 count = operation_and_count >> 4;
1161
1162 detail::update_alignment_lengths(ref_length, seq_length, operation, count);
1163 operations.emplace_back(count, cigar::operation{}.assign_char(operation));
1164 --n_cigar_op;
1165 }
1166
1167 return std::tuple{operations, ref_length, seq_length};
1168}
1169
1173inline std::string format_bam::get_tag_dict_str(sam_tag_dictionary const & tag_dict)
1174{
1175 std::string result{};
1176
1177 auto stream_variant_fn = [&result] (auto && arg) // helper to print an std::variant
1178 {
1179 // T is either char, int32_t, float, std::string, or a std::vector<some int>
1180 using T = std::remove_cvref_t<decltype(arg)>;
1181
1182 if constexpr (std::same_as<T, int32_t>)
1183 {
1184 // always choose the smallest possible representation [cCsSiI]
1185 size_t const absolute_arg = std::abs(arg);
1186 auto n = std::countr_zero(std::bit_ceil(absolute_arg + 1u) >> 1u) / 8u;
1187 bool const negative = arg < 0;
1188 n = n * n + 2 * negative; // for switch case order
1189
1190 switch (n)
1191 {
1192 case 0:
1193 {
1194 result[result.size() - 1] = 'C';
1195 result.append(reinterpret_cast<char const *>(&arg), 1);
1196 break;
1197 }
1198 case 1:
1199 {
1200 result[result.size() - 1] = 'S';
1201 result.append(reinterpret_cast<char const *>(&arg), 2);
1202 break;
1203 }
1204 case 2:
1205 {
1206 result[result.size() - 1] = 'c';
1207 int8_t tmp = static_cast<int8_t>(arg);
1208 result.append(reinterpret_cast<char const *>(&tmp), 1);
1209 break;
1210 }
1211 case 3:
1212 {
1213 result[result.size() - 1] = 's';
1214 int16_t tmp = static_cast<int16_t>(arg);
1215 result.append(reinterpret_cast<char const *>(&tmp), 2);
1216 break;
1217 }
1218 default:
1219 {
1220 result.append(reinterpret_cast<char const *>(&arg), 4); // always i
1221 break;
1222 }
1223 }
1224 }
1225 else if constexpr (std::same_as<T, std::string>)
1226 {
1227 result.append(reinterpret_cast<char const *>(arg.data()), arg.size() + 1/*+ null character*/);
1228 }
1229 else if constexpr (!std::ranges::range<T>) // char, float
1230 {
1231 result.append(reinterpret_cast<char const *>(&arg), sizeof(arg));
1232 }
1233 else // std::vector of some arithmetic_type type
1234 {
1235 int32_t sz{static_cast<int32_t>(arg.size())};
1236 result.append(reinterpret_cast<char *>(&sz), 4);
1237 result.append(reinterpret_cast<char const *>(arg.data()),
1238 arg.size() * sizeof(std::ranges::range_value_t<T>));
1239 }
1240 };
1241
1242 for (auto & [tag, variant] : tag_dict)
1243 {
1244 result.push_back(static_cast<char>(tag / 256));
1245 result.push_back(static_cast<char>(tag % 256));
1246
1247 result.push_back(detail::sam_tag_type_char[variant.index()]);
1248
1249 if (!is_char<'\0'>(detail::sam_tag_type_char_extra[variant.index()]))
1250 result.push_back(detail::sam_tag_type_char_extra[variant.index()]);
1251
1252 std::visit(stream_variant_fn, variant);
1253 }
1254
1255 return result;
1256}
1257
1258} // namespace seqan3
T begin(T... args)
Provides the C++20 <bit> header if it is not already available.
constexpr derived_type & assign_char(char_type const chr) noexcept
Assign from a character, implicitly converts invalid characters.
Definition: alphabet_base.hpp:165
constexpr derived_type & assign_rank(rank_type const c) noexcept
Assign from a numeric value.
Definition: alphabet_base.hpp:191
The seqan3::cigar semialphabet pairs a counter with a seqan3::cigar::operation letter.
Definition: cigar.hpp:60
exposition_only::cigar_operation operation
The (extended) cigar operation alphabet of M,D,I,H,N,P,S,X,=.
Definition: cigar.hpp:99
A 16 letter DNA alphabet, containing all IUPAC symbols minus the gap and plus an equality sign ('=')....
Definition: dna16sam.hpp:48
The BAM format.
Definition: format_bam.hpp:48
format_bam()=default
Defaulted.
void write_alignment_record(stream_type &stream, sam_file_output_options const &options, header_type &&header, seq_type &&seq, qual_type &&qual, id_type &&id, int32_t const offset, ref_seq_type &&ref_seq, ref_id_type &&ref_id, std::optional< int32_t > ref_offset, align_type &&align, cigar_type &&cigar_vector, sam_flag const flag, uint8_t const mapq, mate_type &&mate, tag_dict_type &&tag_dict, double e_value, double bit_score)
Write the given fields to the specified stream.
Definition: format_bam.hpp:610
void read_alignment_record(stream_type &stream, sam_file_input_options< seq_legal_alph_type > const &options, ref_seqs_type &ref_seqs, sam_file_header< ref_ids_type > &header, seq_type &seq, qual_type &qual, id_type &id, offset_type &offset, ref_seq_type &ref_seq, ref_id_type &ref_id, ref_offset_type &ref_offset, align_type &align, cigar_type &cigar_vector, flag_type &flag, mapq_type &mapq, mate_type &mate, tag_dict_type &tag_dict, e_value_type &e_value, bit_score_type &bit_score)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_bam.hpp:255
format_bam & operator=(format_bam &&)=default
Defaulted.
format_bam & operator=(format_bam const &)=default
Defaulted.
format_bam(format_bam &&)=default
Defaulted.
~format_bam()=default
Defaulted.
format_bam(format_bam const &)=default
Defaulted.
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_bam.hpp:65
The alphabet of a gap character '-'.
Definition: gap.hpp:39
Stores the header information of alignment files.
Definition: header.hpp:32
std::unordered_map< key_type, int32_t, std::hash< key_type >, detail::view_equality_fn > ref_dict
The mapping of reference id to position in the ref_ids() range and the ref_id_info range.
Definition: header.hpp:157
ref_ids_type & ref_ids()
The range of reference ids.
Definition: header.hpp:118
std::vector< std::tuple< int32_t, std::string > > ref_id_info
The reference information. (used by the SAM/BAM format)
Definition: header.hpp:154
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:334
T clear(T... args)
T data(T... args)
Provides seqan3::dna16sam.
T end(T... args)
Provides seqan3::detail::fast_ostreambuf_iterator.
Provides the seqan3::format_sam_base that can be inherited from.
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: concept.hpp:155
constexpr T bit_ceil(T x) noexcept
Calculates the smallest integral power of two that is not smaller than x.
Definition: bit:133
constexpr int countr_zero(T x) noexcept
Returns the number of consecutive 0 bits in the value of x, starting from the least significant bit (...
Definition: bit:199
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: sam_flag.hpp:74
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: traits.hpp:471
constexpr ptrdiff_t count
Count the occurrences of a type in a pack.
Definition: traits.hpp:169
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:151
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:183
constexpr auto repeat_n
A view factory that repeats a given value n times.
Definition: repeat_n.hpp:91
Provides the seqan3::sam_file_header class.
T index(T... args)
The generic alphabet concept that covers most data types used in ranges.
Resolves to std::ranges::implicitly_convertible_to<type1, type2>(). <dl class="no-api">This entity i...
A more refined container concept than seqan3::container.
Whether a type behaves like a tuple.
Auxiliary functions for the alignment IO.
Provides seqan3::detail::istreambuf.
T min(T... args)
The main SeqAn3 namespace.
Definition: cigar_operation_table.hpp:2
Provides seqan3::debug_stream and related types.
T push_back(T... args)
Adaptations of concepts from the Ranges TS.
T reserve(T... args)
T resize(T... args)
Provides seqan3::sam_file_input_options.
Provides helper data structures for the seqan3::sam_file_output.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
T size(T... args)
Provides seqan3::views::slice.
T str(T... args)
Thrown if information given to output format didn't match expectations.
Definition: exception.hpp:91
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:24
The options type defines various option members that influence the behavior of all or some formats.
Definition: output_options.hpp:23
T swap(T... args)
Provides seqan3::views::take_exactly and seqan3::views::take_exactly_or_throw.
T tie(T... args)
T visit(T... args)