70 template <
typename stream_type,
71 typename seq_legal_alph_type,
72 typename ref_seqs_type,
73 typename ref_ids_type,
77 typename ref_seq_type,
79 typename ref_offset_type,
86 typename tag_dict_type,
87 typename e_value_type,
88 typename bit_score_type>
91 ref_seqs_type & ref_seqs,
97 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
99 ref_offset_type & ref_offset,
101 cigar_type & cigar_vector,
105 tag_dict_type & tag_dict,
106 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
107 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
109 template <
typename stream_type,
110 typename header_type,
113 typename ref_seq_type,
114 typename ref_id_type,
119 typename tag_dict_type>
122 [[maybe_unused]] header_type && header,
123 [[maybe_unused]] seq_type && seq,
124 [[maybe_unused]] qual_type && qual,
125 [[maybe_unused]] id_type &&
id,
126 [[maybe_unused]] int32_t
const offset,
127 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
128 [[maybe_unused]] ref_id_type && ref_id,
130 [[maybe_unused]] align_type && align,
131 [[maybe_unused]] cigar_type && cigar_vector,
132 [[maybe_unused]]
sam_flag const flag,
133 [[maybe_unused]] uint8_t
const mapq,
134 [[maybe_unused]] mate_type && mate,
135 [[maybe_unused]] tag_dict_type && tag_dict,
136 [[maybe_unused]]
double SEQAN3_DOXYGEN_ONLY(e_value),
137 [[maybe_unused]]
double SEQAN3_DOXYGEN_ONLY(bit_score));
141 bool header_was_read{
false};
147 struct alignment_record_core
152 uint32_t l_read_name:8;
155 uint32_t n_cigar_op:16;
173 ret[
static_cast<index_t
>(
'I')] = 1;
174 ret[
static_cast<index_t
>(
'D')] = 2;
175 ret[
static_cast<index_t
>(
'N')] = 3;
176 ret[
static_cast<index_t
>(
'S')] = 4;
177 ret[
static_cast<index_t
>(
'H')] = 5;
178 ret[
static_cast<index_t
>(
'P')] = 6;
179 ret[
static_cast<index_t
>(
'=')] = 7;
180 ret[
static_cast<index_t
>(
'X')] = 8;
187 static uint16_t reg2bin(int32_t beg, int32_t end)
noexcept
190 if (beg >> 14 == end >> 14)
return ((1 << 15) - 1) / 7 + (beg >> 14);
191 if (beg >> 17 == end >> 17)
return ((1 << 12) - 1) / 7 + (beg >> 17);
192 if (beg >> 20 == end >> 20)
return ((1 << 9) - 1) / 7 + (beg >> 20);
193 if (beg >> 23 == end >> 23)
return ((1 << 6) - 1) / 7 + (beg >> 23);
194 if (beg >> 26 == end >> 26)
return ((1 << 3) - 1) / 7 + (beg >> 26);
204 template <
typename stream_view_type, std::
integral number_type>
205 void read_integral_byte_field(stream_view_type && stream_view, number_type & target)
207 std::ranges::copy_n(std::ranges::begin(stream_view),
sizeof(target),
reinterpret_cast<char *
>(&target));
215 template <
typename stream_view_type>
216 void read_float_byte_field(stream_view_type && stream_view,
float & target)
218 std::ranges::copy_n(std::ranges::begin(stream_view),
sizeof(int32_t),
reinterpret_cast<char *
>(&target));
221 template <
typename stream_view_type,
typename value_type>
223 stream_view_type && stream_view,
224 value_type
const & SEQAN3_DOXYGEN_ONLY(value));
226 template <
typename stream_view_type>
227 void read_sam_dict_field(stream_view_type && stream_view, sam_tag_dictionary & target);
229 template <
typename cigar_input_type>
230 auto parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op)
const;
232 static std::string get_tag_dict_str(sam_tag_dictionary
const & tag_dict);
236template <
typename stream_type,
237 typename seq_legal_alph_type,
238 typename ref_seqs_type,
239 typename ref_ids_type,
242 typename offset_type,
243 typename ref_seq_type,
244 typename ref_id_type,
245 typename ref_offset_type,
252 typename tag_dict_type,
253 typename e_value_type,
254 typename bit_score_type>
257 ref_seqs_type & ref_seqs,
262 offset_type & offset,
263 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
264 ref_id_type & ref_id,
265 ref_offset_type & ref_offset,
267 cigar_type & cigar_vector,
271 tag_dict_type & tag_dict,
272 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
273 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
275 static_assert(detail::decays_to_ignore_v<ref_offset_type> ||
276 detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
277 "The ref_offset must be a specialisation of std::optional.");
279 static_assert(detail::decays_to_ignore_v<mapq_type> || std::same_as<mapq_type, uint8_t>,
280 "The type of field::mapq must be uint8_t.");
282 static_assert(detail::decays_to_ignore_v<flag_type> || std::same_as<flag_type, sam_flag>,
283 "The type of field::flag must be seqan3::sam_flag.");
285 auto stream_view = seqan3::detail::istreambuf(stream);
288 [[maybe_unused]] int32_t offset_tmp{};
289 [[maybe_unused]] int32_t soft_clipping_end{};
291 [[maybe_unused]] int32_t ref_length{0}, seq_length{0};
295 if (!header_was_read)
298 if (!std::ranges::equal(stream_view | detail::take_exactly_or_throw(4),
std::string_view{
"BAM\1"}))
306 read_integral_byte_field(stream_view, l_text);
309 read_header(stream_view | detail::take_exactly_or_throw(l_text), header, ref_seqs);
311 read_integral_byte_field(stream_view, n_ref);
313 for (int32_t ref_idx = 0; ref_idx < n_ref; ++ref_idx)
315 read_integral_byte_field(stream_view, l_name);
317 string_buffer.
resize(l_name - 1);
318 std::ranges::copy_n(std::ranges::begin(stream_view), l_name - 1, string_buffer.
data());
319 std::ranges::next(std::ranges::begin(stream_view));
321 read_integral_byte_field(stream_view, l_ref);
323 if constexpr (detail::decays_to_ignore_v<ref_seqs_type>)
328 auto & reference_ids = header.
ref_ids();
332 reference_ids.push_back(string_buffer);
334 header.
ref_dict.emplace(reference_ids.back(), reference_ids.size() - 1);
339 auto id_it = header.
ref_dict.find(string_buffer);
344 throw format_error{detail::to_string(
"Unknown reference name '" + string_buffer +
345 "' found in BAM file header (header.ref_ids():",
348 else if (id_it->second != ref_idx)
350 throw format_error{detail::to_string(
"Reference id '", string_buffer,
"' at position ", ref_idx,
351 " does not correspond to the position ", id_it->second,
352 " in the header (header.ref_ids():", header.
ref_ids(),
").")};
354 else if (std::get<0>(header.
ref_id_info[id_it->second]) != l_ref)
356 throw format_error{
"Provided reference has unequal length as specified in the header."};
360 header_was_read =
true;
362 if (std::ranges::begin(stream_view) == std::ranges::end(stream_view))
368 alignment_record_core core;
369 std::ranges::copy(stream_view | detail::take_exactly_or_throw(
sizeof(core)),
reinterpret_cast<char *
>(&core));
371 if (core.refID >=
static_cast<int32_t
>(header.
ref_ids().size()) || core.refID < -1)
373 throw format_error{detail::to_string(
"Reference id index '", core.refID,
"' is not in range of ",
374 "header.ref_ids(), which has size ", header.
ref_ids().size(),
".")};
376 else if (core.refID > -1)
387 if constexpr (!detail::decays_to_ignore_v<mate_type>)
389 if (core.next_refID > -1)
390 get<0>(
mate) = core.next_refID;
392 if (core.next_pos > -1)
393 get<1>(
mate) = core.next_pos;
395 get<2>(
mate) = core.tlen;
400 auto id_view = stream_view | detail::take_exactly_or_throw(core.l_read_name - 1);
401 if constexpr (!detail::decays_to_ignore_v<id_type>)
402 read_forward_range_field(id_view,
id);
404 detail::consume(id_view);
405 std::ranges::next(std::ranges::begin(stream_view));
409 if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
411 std::tie(tmp_cigar_vector, ref_length, seq_length) = parse_binary_cigar(stream_view, core.n_cigar_op);
412 transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
417 detail::consume(stream_view | detail::take_exactly_or_throw(core.n_cigar_op * 4));
426 auto seq_stream = stream_view
427 | detail::take_exactly_or_throw(core.l_seq / 2)
434 if constexpr (detail::decays_to_ignore_v<seq_type>)
436 auto skip_sequence_bytes = [&] ()
438 detail::consume(seq_stream);
440 std::ranges::next(std::ranges::begin(stream_view));
443 if constexpr (!detail::decays_to_ignore_v<align_type>)
446 "If you want to read ALIGNMENT but not SEQ, the alignment"
447 " object must store a sequence container at the second (query) position.");
449 if (!tmp_cigar_vector.empty())
451 assert(core.l_seq == (seq_length + offset_tmp + soft_clipping_end));
452 using alph_t = std::ranges::range_value_t<decltype(get<1>(align))>;
453 constexpr auto from_dna16 = detail::convert_through_char_representation<alph_t, dna16sam>;
455 get<1>(align).reserve(seq_length);
457 auto tmp_iter = std::ranges::begin(seq_stream);
458 std::ranges::advance(tmp_iter, offset_tmp / 2);
462 get<1>(align).push_back(from_dna16[
to_rank(get<1>(*tmp_iter))]);
467 for (
size_t i = (seq_length / 2); i > 0; --i)
469 get<1>(align).push_back(from_dna16[
to_rank(get<0>(*tmp_iter))]);
470 get<1>(align).push_back(from_dna16[
to_rank(get<1>(*tmp_iter))]);
476 get<1>(align).push_back(from_dna16[
to_rank(get<0>(*tmp_iter))]);
481 std::ranges::advance(tmp_iter, (soft_clipping_end + !(seq_length & 1)) / 2);
485 skip_sequence_bytes();
491 skip_sequence_bytes();
496 using alph_t = std::ranges::range_value_t<
decltype(
seq)>;
497 constexpr auto from_dna16 = detail::convert_through_char_representation<alph_t, dna16sam>;
499 for (
auto [d1, d2] : seq_stream)
509 std::ranges::next(std::ranges::begin(stream_view));
512 if constexpr (!detail::decays_to_ignore_v<align_type>)
514 assign_unaligned(get<1>(align),
515 seq |
views::slice(
static_cast<std::ranges::range_difference_t<seq_type>
>(offset_tmp),
516 std::ranges::distance(
seq) - soft_clipping_end));
523 auto qual_view = stream_view | detail::take_exactly_or_throw(core.l_seq)
526 return static_cast<char>(chr + 33);
528 if constexpr (!detail::decays_to_ignore_v<qual_type>)
529 read_forward_range_field(qual_view,
qual);
531 detail::consume(qual_view);
535 int32_t remaining_bytes = core.block_size - (
sizeof(alignment_record_core) - 4) -
536 core.l_read_name - core.n_cigar_op * 4 - (core.l_seq + 1) / 2 - core.l_seq;
537 assert(remaining_bytes >= 0);
538 auto tags_view = stream_view | detail::take_exactly_or_throw(remaining_bytes);
540 while (tags_view.size() > 0)
542 if constexpr (!detail::decays_to_ignore_v<tag_dict_type>)
543 read_sam_dict_field(tags_view, tag_dict);
545 detail::consume(tags_view);
550 if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
555 if (core.l_seq != 0 && offset_tmp == core.l_seq)
557 if constexpr (detail::decays_to_ignore_v<tag_dict_type> | detail::decays_to_ignore_v<seq_type>)
559 throw format_error{detail::to_string(
"The cigar string '", offset_tmp,
"S", ref_length,
560 "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
561 "stored in the optional field CG. You need to read in the field::tags and "
562 "field::seq in order to access this information.")};
566 auto it = tag_dict.find(
"CG"_tag);
568 if (it == tag_dict.end())
569 throw format_error{detail::to_string(
"The cigar string '", offset_tmp,
"S", ref_length,
570 "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
571 "stored in the optional field CG but this tag is not present in the given ",
574 auto cigar_view = std::views::all(std::get<std::string>(it->second));
575 std::tie(tmp_cigar_vector, ref_length, seq_length) = detail::parse_cigar(cigar_view);
576 offset_tmp = soft_clipping_end = 0;
577 transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
580 if constexpr (!detail::decays_to_ignore_v<align_type>)
582 assign_unaligned(get<1>(align),
583 seq |
views::slice(
static_cast<std::ranges::range_difference_t<seq_type>
>(offset_tmp),
584 std::ranges::distance(
seq) - soft_clipping_end));
591 if constexpr (!detail::decays_to_ignore_v<align_type>)
592 construct_alignment(align, tmp_cigar_vector, core.refID, ref_seqs, core.pos, ref_length);
594 if constexpr (!detail::decays_to_ignore_v<cigar_type>)
595 std::swap(cigar_vector, tmp_cigar_vector);
599template <
typename stream_type,
600 typename header_type,
603 typename ref_seq_type,
604 typename ref_id_type,
609 typename tag_dict_type>
612 [[maybe_unused]] header_type && header,
613 [[maybe_unused]] seq_type && seq,
614 [[maybe_unused]] qual_type && qual,
615 [[maybe_unused]] id_type &&
id,
616 [[maybe_unused]] int32_t
const offset,
617 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
618 [[maybe_unused]] ref_id_type && ref_id,
620 [[maybe_unused]] align_type && align,
621 [[maybe_unused]] cigar_type && cigar_vector,
622 [[maybe_unused]]
sam_flag const flag,
623 [[maybe_unused]] uint8_t
const mapq,
624 [[maybe_unused]] mate_type && mate,
625 [[maybe_unused]] tag_dict_type && tag_dict,
626 [[maybe_unused]]
double SEQAN3_DOXYGEN_ONLY(e_value),
627 [[maybe_unused]]
double SEQAN3_DOXYGEN_ONLY(bit_score))
632 static_assert((std::ranges::forward_range<seq_type> &&
634 "The seq object must be a std::ranges::forward_range over "
635 "letters that model seqan3::alphabet.");
637 static_assert((std::ranges::forward_range<id_type> &&
639 "The id object must be a std::ranges::forward_range over "
640 "letters that model seqan3::alphabet.");
642 static_assert((std::ranges::forward_range<ref_seq_type> &&
644 "The ref_seq object must be a std::ranges::forward_range "
645 "over letters that model seqan3::alphabet.");
647 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
649 static_assert((std::ranges::forward_range<ref_id_type> ||
650 std::integral<std::remove_reference_t<ref_id_type>> ||
651 detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>,
std::optional>),
652 "The ref_id object must be a std::ranges::forward_range "
653 "over letters that model seqan3::alphabet or an integral or a std::optional<integral>.");
657 "The align object must be a std::pair of two ranges whose "
658 "value_type is comparable to seqan3::gap");
660 static_assert((std::tuple_size_v<std::remove_cvref_t<align_type>> == 2 &&
661 std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<0>(align))>> &&
662 std::equality_comparable_with<
gap, std::ranges::range_reference_t<
decltype(std::get<1>(align))>>),
663 "The align object must be a std::pair of two ranges whose "
664 "value_type is comparable to seqan3::gap");
666 static_assert((std::ranges::forward_range<qual_type> &&
668 "The qual object must be a std::ranges::forward_range "
669 "over letters that model seqan3::alphabet.");
672 "The mate object must be a std::tuple of size 3 with "
673 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
674 "2) a std::integral or std::optional<std::integral>, and "
675 "3) a std::integral.");
677 static_assert(((std::ranges::forward_range<decltype(std::get<0>(
mate))> ||
680 (std::integral<std::remove_cvref_t<decltype(std::get<1>(
mate))>> ||
682 std::integral<std::remove_cvref_t<decltype(std::get<2>(
mate))>>),
683 "The mate object must be a std::tuple of size 3 with "
684 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
685 "2) a std::integral or std::optional<std::integral>, and "
686 "3) a std::integral.");
689 "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
691 if constexpr (detail::decays_to_ignore_v<header_type>)
693 throw format_error{
"BAM can only be written with a header but you did not provide enough information! "
694 "You can either construct the output file with ref_ids and ref_seqs information and "
695 "the header will be created for you, or you can access the `header` member directly."};
706 detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
711 if (!header_was_written)
715 write_header(os, options, header);
716 int32_t l_text{
static_cast<int32_t
>(os.
str().size())};
717 std::ranges::copy_n(
reinterpret_cast<char *
>(&l_text), 4, stream_it);
721 int32_t n_ref{
static_cast<int32_t
>(header.
ref_ids().size())};
722 std::ranges::copy_n(
reinterpret_cast<char *
>(&n_ref), 4, stream_it);
724 for (int32_t ridx = 0; ridx < n_ref; ++ridx)
726 int32_t l_name{
static_cast<int32_t
>(header.
ref_ids()[ridx].size()) + 1};
727 std::ranges::copy_n(
reinterpret_cast<char *
>(&l_name), 4, stream_it);
729 std::ranges::copy(header.
ref_ids()[ridx].begin(), header.
ref_ids()[ridx].end(), stream_it);
732 std::ranges::copy_n(
reinterpret_cast<char *
>(&get<0>(header.
ref_id_info[ridx])), 4, stream_it);
735 header_was_written =
true;
741 int32_t ref_length{};
745 if (!std::ranges::empty(cigar_vector))
747 int32_t dummy_seq_length{};
748 for (
auto & [
count, operation] : cigar_vector)
749 detail::update_alignment_lengths(ref_length, dummy_seq_length, operation.to_char(),
count);
751 else if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align)))
753 ref_length = std::ranges::distance(get<1>(align));
759 int32_t off_end{
static_cast<int32_t
>(std::ranges::distance(
seq)) -
offset};
761 for (
auto chr : get<1>(align))
765 off_end -= ref_length;
766 cigar_vector = detail::get_cigar_vector(align,
offset, off_end);
769 if (cigar_vector.size() >= (1 << 16))
771 tag_dict[
"CG"_tag] = detail::get_cigar_string(cigar_vector);
772 cigar_vector.resize(2);
773 cigar_vector[0] =
cigar{
static_cast<uint32_t
>(std::ranges::distance(
seq)),
'S'_cigar_operation};
774 cigar_vector[1] =
cigar{
static_cast<uint32_t
>(std::ranges::distance(get<1>(align))),
'N'_cigar_operation};
777 std::string tag_dict_binary_str = get_tag_dict_str(tag_dict);
784 uint8_t read_name_size = std::min<uint8_t>(std::ranges::distance(
id), 254) + 1;
785 read_name_size +=
static_cast<uint8_t
>(read_name_size == 1);
787 alignment_record_core core
795 static_cast<uint16_t
>(cigar_vector.size()),
797 static_cast<int32_t
>(std::ranges::distance(
seq)),
799 get<1>(
mate).value_or(-1),
803 auto check_and_assign_id_to = [&header] ([[maybe_unused]]
auto & id_source,
804 [[maybe_unused]]
auto & id_target)
808 if constexpr (!detail::decays_to_ignore_v<id_t>)
810 if constexpr (std::integral<id_t>)
812 id_target = id_source;
814 else if constexpr (detail::is_type_specialisation_of_v<id_t, std::optional>)
816 id_target = id_source.value_or(-1);
820 if (!std::ranges::empty(id_source))
824 if constexpr (std::ranges::contiguous_range<
decltype(id_source)> &&
825 std::ranges::sized_range<
decltype(id_source)> &&
826 std::ranges::borrowed_range<
decltype(id_source)>)
836 "The ref_id type is not convertible to the reference id information stored in the "
837 "reference dictionary of the header object.");
839 id_it = header.
ref_dict.find(id_source);
844 throw format_error{detail::to_string(
"Unknown reference name '", id_source,
"' could "
845 "not be found in BAM header ref_dict: ",
849 id_target = id_it->second;
856 check_and_assign_id_to(
ref_id, core.refID);
859 check_and_assign_id_to(get<0>(
mate), core.next_refID);
862 core.block_size =
sizeof(core) - 4 +
864 core.n_cigar_op * 4 +
865 (core.l_seq + 1) / 2 +
867 tag_dict_binary_str.
size();
869 std::ranges::copy_n(
reinterpret_cast<char *
>(&core),
sizeof(core), stream_it);
871 if (std::ranges::empty(
id))
874 std::ranges::copy_n(std::ranges::begin(
id), core.l_read_name - 1, stream_it);
878 for (
auto [cigar_count, op] : cigar_vector)
880 cigar_count = cigar_count << 4;
881 cigar_count |=
static_cast<int32_t
>(char_to_sam_rank[op.to_char()]);
882 std::ranges::copy_n(
reinterpret_cast<char *
>(&cigar_count), 4, stream_it);
886 using alph_t = std::ranges::range_value_t<seq_type>;
887 constexpr auto to_dna16 = detail::convert_through_char_representation<dna16sam, alph_t>;
889 auto sit = std::ranges::begin(
seq);
890 for (int32_t sidx = 0; sidx < ((core.l_seq & 1) ? core.l_seq - 1 : core.l_seq); ++sidx, ++sit)
895 stream_it =
static_cast<char>(compressed_chr);
899 stream_it =
static_cast<char>(
to_rank(to_dna16[
to_rank(*sit)]) << 4);
902 if (std::ranges::empty(
qual))
905 std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
909 if (std::ranges::distance(
qual) != core.l_seq)
910 throw format_error{detail::to_string(
"Expected quality of same length as sequence with size ",
911 core.l_seq,
". Got quality with size ",
912 std::ranges::distance(
qual),
" instead.")};
915 std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
919 stream << tag_dict_binary_str;
924template <
typename stream_view_type,
typename value_type>
926 stream_view_type && stream_view,
927 value_type
const & SEQAN3_DOXYGEN_ONLY(value))
930 read_integral_byte_field(stream_view,
count);
937 if constexpr(std::integral<value_type>)
939 read_integral_byte_field(stream_view, tmp);
941 else if constexpr(std::same_as<value_type, float>)
943 read_float_byte_field(stream_view, tmp);
947 constexpr bool always_false = std::is_same_v<value_type, void>;
948 static_assert(always_false,
"format_bam::read_sam_dict_vector: unsupported value_type");
953 variant = std::move(tmp_vector);
973template <
typename stream_view_type>
974inline void format_bam::read_sam_dict_field(stream_view_type && stream_view, sam_tag_dictionary & target)
983 uint16_t tag =
static_cast<uint16_t
>(*it) << 8;
986 tag +=
static_cast<uint16_t
>(*it);
1004 read_integral_byte_field(stream_view, tmp);
1005 target[tag] =
static_cast<int32_t
>(tmp);
1011 read_integral_byte_field(stream_view, tmp);
1012 target[tag] =
static_cast<int32_t
>(tmp);
1018 read_integral_byte_field(stream_view, tmp);
1019 target[tag] =
static_cast<int32_t
>(tmp);
1025 read_integral_byte_field(stream_view, tmp);
1026 target[tag] =
static_cast<int32_t
>(tmp);
1032 read_integral_byte_field(stream_view, tmp);
1033 target[tag] = std::move(tmp);
1039 read_integral_byte_field(stream_view, tmp);
1040 target[tag] =
static_cast<int32_t
>(tmp);
1046 read_float_byte_field(stream_view, tmp);
1052 string_buffer.
clear();
1053 while (!is_char<'\0'>(*it))
1059 target[tag] = string_buffer;
1066 while (!is_char<'\0'>(*it))
1068 string_buffer.
clear();
1073 throw format_error{
"Hexadecimal tag has an uneven number of digits!"};
1077 read_byte_field(string_buffer, value);
1081 target[tag] = byte_array;
1086 char array_value_type_id = *it;
1089 switch (array_value_type_id)
1092 read_sam_dict_vector(target[tag], stream_view, int8_t{});
1095 read_sam_dict_vector(target[tag], stream_view, uint8_t{});
1098 read_sam_dict_vector(target[tag], stream_view, int16_t{});
1101 read_sam_dict_vector(target[tag], stream_view, uint16_t{});
1104 read_sam_dict_vector(target[tag], stream_view, int32_t{});
1107 read_sam_dict_vector(target[tag], stream_view, uint32_t{});
1110 read_sam_dict_vector(target[tag], stream_view,
float{});
1113 throw format_error{detail::to_string(
"The first character in the numerical id of a SAM tag ",
1114 "must be one of [cCsSiIf] but '", array_value_type_id,
"' was given.")};
1119 throw format_error{detail::to_string(
"The second character in the numerical id of a "
1120 "SAM tag must be one of [A,i,Z,H,B,f] but '", type_id,
"' was given.")};
1138template <
typename cigar_input_type>
1139inline auto format_bam::parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op)
const
1142 char operation{
'\0'};
1144 int32_t ref_length{}, seq_length{};
1145 uint32_t operation_and_count{};
1146 constexpr char const * cigar_mapping =
"MIDNSHP=X*******";
1147 constexpr uint32_t cigar_mask = 0x0f;
1149 if (n_cigar_op == 0)
1150 return std::tuple{operations, ref_length, seq_length};
1154 while (n_cigar_op > 0)
1156 std::ranges::copy_n(std::ranges::begin(cigar_input),
1157 sizeof(operation_and_count),
1158 reinterpret_cast<char*
>(&operation_and_count));
1159 operation = cigar_mapping[operation_and_count & cigar_mask];
1160 count = operation_and_count >> 4;
1162 detail::update_alignment_lengths(ref_length, seq_length, operation,
count);
1167 return std::tuple{operations, ref_length, seq_length};
1173inline std::string format_bam::get_tag_dict_str(sam_tag_dictionary
const & tag_dict)
1177 auto stream_variant_fn = [&result] (
auto && arg)
1182 if constexpr (std::same_as<T, int32_t>)
1185 size_t const absolute_arg = std::abs(arg);
1187 bool const negative = arg < 0;
1188 n = n * n + 2 * negative;
1194 result[result.size() - 1] =
'C';
1195 result.append(
reinterpret_cast<char const *
>(&arg), 1);
1200 result[result.size() - 1] =
'S';
1201 result.append(
reinterpret_cast<char const *
>(&arg), 2);
1206 result[result.size() - 1] =
'c';
1207 int8_t tmp =
static_cast<int8_t
>(arg);
1208 result.append(
reinterpret_cast<char const *
>(&tmp), 1);
1213 result[result.size() - 1] =
's';
1214 int16_t tmp =
static_cast<int16_t
>(arg);
1215 result.append(
reinterpret_cast<char const *
>(&tmp), 2);
1220 result.append(
reinterpret_cast<char const *
>(&arg), 4);
1225 else if constexpr (std::same_as<T, std::string>)
1227 result.append(
reinterpret_cast<char const *
>(arg.data()), arg.size() + 1);
1229 else if constexpr (!std::ranges::range<T>)
1231 result.append(
reinterpret_cast<char const *
>(&arg),
sizeof(arg));
1235 int32_t sz{
static_cast<int32_t
>(arg.size())};
1236 result.append(
reinterpret_cast<char *
>(&sz), 4);
1237 result.append(
reinterpret_cast<char const *
>(arg.data()),
1238 arg.size() *
sizeof(std::ranges::range_value_t<T>));
1242 for (
auto & [tag, variant] : tag_dict)
1244 result.push_back(
static_cast<char>(tag / 256));
1245 result.push_back(
static_cast<char>(tag % 256));
1247 result.push_back(detail::sam_tag_type_char[variant.
index()]);
1249 if (!is_char<'\0'>(detail::sam_tag_type_char_extra[variant.
index()]))
1250 result.push_back(detail::sam_tag_type_char_extra[variant.
index()]);
Provides the C++20 <bit> header if it is not already available.
constexpr derived_type & assign_char(char_type const chr) noexcept
Assign from a character, implicitly converts invalid characters.
Definition: alphabet_base.hpp:165
constexpr derived_type & assign_rank(rank_type const c) noexcept
Assign from a numeric value.
Definition: alphabet_base.hpp:191
The seqan3::cigar semialphabet pairs a counter with a seqan3::cigar::operation letter.
Definition: cigar.hpp:60
exposition_only::cigar_operation operation
The (extended) cigar operation alphabet of M,D,I,H,N,P,S,X,=.
Definition: cigar.hpp:99
A 16 letter DNA alphabet, containing all IUPAC symbols minus the gap and plus an equality sign ('=')....
Definition: dna16sam.hpp:48
The alphabet of a gap character '-'.
Definition: gap.hpp:39
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:334
Provides seqan3::dna16sam.
Provides seqan3::detail::fast_ostreambuf_iterator.
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: concept.hpp:155
constexpr T bit_ceil(T x) noexcept
Calculates the smallest integral power of two that is not smaller than x.
Definition: bit:133
constexpr int countr_zero(T x) noexcept
Returns the number of consecutive 0 bits in the value of x, starting from the least significant bit (...
Definition: bit:199
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: sam_flag.hpp:74
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: traits.hpp:471
constexpr ptrdiff_t count
Count the occurrences of a type in a pack.
Definition: traits.hpp:169
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:151
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:183
constexpr auto repeat_n
A view factory that repeats a given value n times.
Definition: repeat_n.hpp:91
The generic alphabet concept that covers most data types used in ranges.
Resolves to std::ranges::implicitly_convertible_to<type1, type2>(). <dl class="no-api">This entity i...
A more refined container concept than seqan3::container.
Whether a type behaves like a tuple.
Auxiliary functions for the alignment IO.
Provides seqan3::detail::istreambuf.
The main SeqAn3 namespace.
Definition: cigar_operation_table.hpp:2
Provides seqan3::debug_stream and related types.
Adaptations of concepts from the Ranges TS.
Provides helper data structures for the seqan3::sam_file_output.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
Provides seqan3::views::slice.
The options type defines various option members that influence the behavior of all or some formats.
Definition: output_options.hpp:23
Provides seqan3::views::take_exactly and seqan3::views::take_exactly_or_throw.