SeqAn3 3.1.0-rc.2
The Modern C++ library for sequence analysis.
format_sam.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <iterator>
16#include <seqan3/std/ranges>
17#include <string>
18#include <vector>
19
38
39namespace seqan3
40{
41
114class format_sam : private detail::format_sam_base
115{
116public:
120 // construction cannot be noexcept because this class has a std::string variable as a quality string buffer.
121 format_sam() = default;
122 format_sam(format_sam const &) = default;
123 format_sam & operator=(format_sam const &) = default;
124 format_sam(format_sam &&) = default;
126 ~format_sam() = default;
127
129
132 {
133 { "sam" },
134 };
135
136protected:
137 template <typename stream_type, // constraints checked by file
138 typename seq_legal_alph_type,
139 typename seq_type, // other constraints checked inside function
140 typename id_type,
141 typename qual_type>
142 void read_sequence_record(stream_type & stream,
144 seq_type & sequence,
145 id_type & id,
146 qual_type & qualities);
147
148 template <typename stream_type, // constraints checked by file
149 typename seq_type, // other constraints checked inside function
150 typename id_type,
151 typename qual_type>
152 void write_sequence_record(stream_type & stream,
153 sequence_file_output_options const & SEQAN3_DOXYGEN_ONLY(options),
154 seq_type && sequence,
155 id_type && id,
156 qual_type && qualities);
157
158 template <typename stream_type, // constraints checked by file
159 typename seq_legal_alph_type,
160 typename ref_seqs_type,
161 typename ref_ids_type,
162 typename seq_type,
163 typename id_type,
164 typename offset_type,
165 typename ref_seq_type,
166 typename ref_id_type,
167 typename ref_offset_type,
168 typename align_type,
169 typename cigar_type,
170 typename flag_type,
171 typename mapq_type,
172 typename qual_type,
173 typename mate_type,
174 typename tag_dict_type,
175 typename e_value_type,
176 typename bit_score_type>
177 void read_alignment_record(stream_type & stream,
178 sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
179 ref_seqs_type & ref_seqs,
181 seq_type & seq,
182 qual_type & qual,
183 id_type & id,
184 offset_type & offset,
185 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
186 ref_id_type & ref_id,
187 ref_offset_type & ref_offset,
188 align_type & align,
189 cigar_type & cigar_vector,
190 flag_type & flag,
191 mapq_type & mapq,
192 mate_type & mate,
193 tag_dict_type & tag_dict,
194 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
195 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
196
197 template <typename stream_type,
198 typename header_type,
199 typename seq_type,
200 typename id_type,
201 typename ref_seq_type,
202 typename ref_id_type,
203 typename align_type,
204 typename qual_type,
205 typename mate_type,
206 typename tag_dict_type,
207 typename e_value_type,
208 typename bit_score_type>
209 void write_alignment_record(stream_type & stream,
210 sam_file_output_options const & options,
211 header_type && header,
212 seq_type && seq,
213 qual_type && qual,
214 id_type && id,
215 int32_t const offset,
216 ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
217 ref_id_type && ref_id,
218 std::optional<int32_t> ref_offset,
219 align_type && align,
220 std::vector<cigar> const & cigar_vector,
221 sam_flag const flag,
222 uint8_t const mapq,
223 mate_type && mate,
224 tag_dict_type && tag_dict,
225 e_value_type && SEQAN3_DOXYGEN_ONLY(e_value),
226 bit_score_type && SEQAN3_DOXYGEN_ONLY(bit_score));
227
228private:
230 std::string tmp_qual{};
231
233 static constexpr std::string_view dummy{};
234
236 sam_file_header<> default_header{};
237
239 bool ref_info_present_in_header{false};
240
242 std::string_view const & default_or(detail::ignore_t) const noexcept
243 {
244 return dummy;
245 }
246
248 template <typename t>
249 decltype(auto) default_or(t && v) const noexcept
250 {
251 return std::forward<t>(v);
252 }
253
254 template <typename stream_view_type, arithmetic value_type>
255 void read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
256 stream_view_type && stream_view,
257 value_type value);
258
259 template <typename stream_view_type>
260 void read_sam_byte_vector(seqan3::detail::sam_tag_variant & variant,
261 stream_view_type && stream_view);
262
263 template <typename stream_view_type>
264 void read_sam_dict_field(stream_view_type && stream_view, sam_tag_dictionary & target);
265
266 template <typename stream_it_t, std::ranges::forward_range field_type>
267 void write_range_or_asterisk(stream_it_t & stream_it, field_type && field_value);
268
269 template <typename stream_it_t>
270 void write_range_or_asterisk(stream_it_t & stream_it, char const * const field_value);
271
272 template <typename stream_it_t>
273 void write_tag_fields(stream_it_t & stream, sam_tag_dictionary const & tag_dict, char const separator);
274};
275
277template <typename stream_type, // constraints checked by file
278 typename seq_legal_alph_type,
279 typename seq_type, // other constraints checked inside function
280 typename id_type,
281 typename qual_type>
282inline void format_sam::read_sequence_record(stream_type & stream,
284 seq_type & sequence,
285 id_type & id,
286 qual_type & qualities)
287{
289
290 {
291 read_alignment_record(stream, align_options, std::ignore, default_header, sequence, qualities, id,
292 std::ignore, std::ignore, std::ignore, std::ignore, std::ignore, std::ignore,
293 std::ignore, std::ignore, std::ignore, std::ignore, std::ignore, std::ignore);
294 }
295
296 if constexpr (!detail::decays_to_ignore_v<seq_type>)
297 if (std::ranges::distance(sequence) == 0)
298 throw parse_error{"The sequence information must not be empty."};
299 if constexpr (!detail::decays_to_ignore_v<id_type>)
300 if (std::ranges::distance(id) == 0)
301 throw parse_error{"The id information must not be empty."};
302
303 if (options.truncate_ids)
304 id = id | detail::take_until_and_consume(is_space) | views::to<id_type>;
305}
306
308template <typename stream_type, // constraints checked by file
309 typename seq_type, // other constraints checked inside function
310 typename id_type,
311 typename qual_type>
312inline void format_sam::write_sequence_record(stream_type & stream,
313 sequence_file_output_options const & SEQAN3_DOXYGEN_ONLY(options),
314 seq_type && sequence,
315 id_type && id,
316 qual_type && qualities)
317{
319 using default_mate_t = std::tuple<std::string_view, std::optional<int32_t>, int32_t>;
320
321 sam_file_output_options output_options;
322
324 output_options,
325 /*header*/ std::ignore,
326 /*seq*/ default_or(sequence),
327 /*qual*/ default_or(qualities),
328 /*id*/ default_or(id),
329 /*offset*/ 0,
330 /*ref_seq*/ std::string_view{},
331 /*ref_id*/ std::string_view{},
332 /*ref_offset*/ -1,
333 /*align*/ default_align_t{},
334 /*cigar_vector*/ std::vector<cigar>{},
335 /*flag*/ sam_flag::none,
336 /*mapq*/ 0,
337 /*mate*/ default_mate_t{},
338 /*tag_dict*/ sam_tag_dictionary{},
339 /*e_value*/ 0,
340 /*bit_score*/ 0);
341}
342
344template <typename stream_type, // constraints checked by file
345 typename seq_legal_alph_type,
346 typename ref_seqs_type,
347 typename ref_ids_type,
348 typename seq_type,
349 typename id_type,
350 typename offset_type,
351 typename ref_seq_type,
352 typename ref_id_type,
353 typename ref_offset_type,
354 typename align_type,
355 typename cigar_type,
356 typename flag_type,
357 typename mapq_type,
358 typename qual_type,
359 typename mate_type,
360 typename tag_dict_type,
361 typename e_value_type,
362 typename bit_score_type>
363inline void format_sam::read_alignment_record(stream_type & stream,
364 sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
365 ref_seqs_type & ref_seqs,
367 seq_type & seq,
368 qual_type & qual,
369 id_type & id,
370 offset_type & offset,
371 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
372 ref_id_type & ref_id,
373 ref_offset_type & ref_offset,
374 align_type & align,
375 cigar_type & cigar_vector,
376 flag_type & flag,
377 mapq_type & mapq,
378 mate_type & mate,
379 tag_dict_type & tag_dict,
380 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
381 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
382{
383 static_assert(detail::decays_to_ignore_v<ref_offset_type> ||
384 detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
385 "The ref_offset must be a specialisation of std::optional.");
386
387 auto stream_view = detail::istreambuf(stream);
388 auto field_view = stream_view | detail::take_until_or_throw_and_consume(is_char<'\t'>);
389
390 // these variables need to be stored to compute the ALIGNMENT
391 int32_t ref_offset_tmp{};
392 std::ranges::range_value_t<decltype(header.ref_ids())> ref_id_tmp{};
393 [[maybe_unused]] int32_t offset_tmp{};
394 [[maybe_unused]] int32_t soft_clipping_end{};
395 [[maybe_unused]] std::vector<cigar> tmp_cigar_vector{};
396 [[maybe_unused]] int32_t ref_length{0}, seq_length{0}; // length of aligned part for ref and query
397
398 // Header
399 // -------------------------------------------------------------------------------------------------------------
400 if (is_char<'@'>(*std::ranges::begin(stream_view))) // we always read the header if present
401 {
402 read_header(stream_view, header, ref_seqs);
403
404 if (std::ranges::begin(stream_view) == std::ranges::end(stream_view)) // file has no records
405 return;
406 }
407
408 // Fields 1-5: ID FLAG REF_ID REF_OFFSET MAPQ
409 // -------------------------------------------------------------------------------------------------------------
410 if constexpr (!detail::decays_to_ignore_v<id_type>)
411 read_forward_range_field(field_view, id);
412 else
413 detail::consume(field_view);
414
415 uint16_t flag_integral{};
416 read_arithmetic_field(field_view, flag_integral);
417 flag = sam_flag{flag_integral};
418
419 read_forward_range_field(field_view, ref_id_tmp);
420 check_and_assign_ref_id(ref_id, ref_id_tmp, header, ref_seqs);
421
422 read_arithmetic_field(field_view, ref_offset_tmp);
423 --ref_offset_tmp; // SAM format is 1-based but SeqAn operates 0-based
424
425 if (ref_offset_tmp == -1)
426 ref_offset = std::nullopt; // indicates an unmapped read -> ref_offset is not set
427 else if (ref_offset_tmp > -1)
428 ref_offset = ref_offset_tmp;
429 else if (ref_offset_tmp < -1)
430 throw format_error{"No negative values are allowed for field::ref_offset."};
431
432 if constexpr (!detail::decays_to_ignore_v<mapq_type>)
433 read_arithmetic_field(field_view, mapq);
434 else
435 detail::consume(field_view);
436
437 // Field 6: CIGAR
438 // -------------------------------------------------------------------------------------------------------------
439 if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
440 {
441 if (!is_char<'*'>(*std::ranges::begin(stream_view))) // no cigar information given
442 {
443 std::tie(tmp_cigar_vector, ref_length, seq_length) = detail::parse_cigar(field_view);
444 transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
445 // the actual cigar_vector is swapped with tmp_cigar_vector at the end to avoid copying
446 }
447 else
448 {
449 std::ranges::next(std::ranges::begin(field_view)); // skip '*'
450 }
451 }
452 else
453 {
454 detail::consume(field_view);
455 }
456
457 offset = offset_tmp;
458
459 // Field 7-9: (RNEXT PNEXT TLEN) = MATE
460 // -------------------------------------------------------------------------------------------------------------
461 if constexpr (!detail::decays_to_ignore_v<mate_type>)
462 {
463 std::ranges::range_value_t<decltype(header.ref_ids())> tmp_mate_ref_id{};
464 read_forward_range_field(field_view, tmp_mate_ref_id); // RNEXT
465
466 if (tmp_mate_ref_id == "=") // indicates "same as ref id"
467 {
468 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
469 get<0>(mate) = ref_id;
470 else
471 check_and_assign_ref_id(get<0>(mate), ref_id_tmp, header, ref_seqs);
472 }
473 else
474 {
475 check_and_assign_ref_id(get<0>(mate), tmp_mate_ref_id, header, ref_seqs);
476 }
477
478 int32_t tmp_pnext{};
479 read_arithmetic_field(field_view, tmp_pnext); // PNEXT
480
481 if (tmp_pnext > 0)
482 get<1>(mate) = --tmp_pnext; // SAM format is 1-based but SeqAn operates 0-based.
483 else if (tmp_pnext < 0)
484 throw format_error{"No negative values are allowed at the mate mapping position."};
485 // tmp_pnext == 0 indicates an unmapped mate -> do not fill std::optional get<1>(mate)
486
487 read_arithmetic_field(field_view, get<2>(mate)); // TLEN
488 }
489 else
490 {
491 for (size_t i = 0; i < 3u; ++i)
492 {
493 detail::consume(field_view);
494 }
495 }
496
497 // Field 10: Sequence
498 // -------------------------------------------------------------------------------------------------------------
499 if (!is_char<'*'>(*std::ranges::begin(stream_view))) // sequence information is given
500 {
501 auto constexpr is_legal_alph = char_is_valid_for<seq_legal_alph_type>;
502 auto seq_stream = field_view | std::views::transform([is_legal_alph] (char const c) // enforce legal alphabet
503 {
504 if (!is_legal_alph(c))
505 throw parse_error{std::string{"Encountered an unexpected letter: "} +
506 "char_is_valid_for<" +
507 detail::type_name_as_string<seq_legal_alph_type> +
508 "> evaluated to false on " +
509 detail::make_printable(c)};
510 return c;
511 });
512
513 if constexpr (detail::decays_to_ignore_v<seq_type>)
514 {
515 if constexpr (!detail::decays_to_ignore_v<align_type>)
516 {
518 "If you want to read ALIGNMENT but not SEQ, the alignment"
519 " object must store a sequence container at the second (query) position.");
520
521 if (!tmp_cigar_vector.empty()) // only parse alignment if cigar information was given
522 {
523
524 auto tmp_iter = std::ranges::begin(seq_stream);
525 std::ranges::advance(tmp_iter, offset_tmp);
526
527 for (; seq_length > 0; --seq_length) // seq_length is not needed anymore
528 {
529 get<1>(align).push_back(std::ranges::range_value_t<decltype(get<1>(align))>{}.assign_char(*tmp_iter));
530 ++tmp_iter;
531 }
532
533 std::ranges::advance(tmp_iter, soft_clipping_end);
534 }
535 else
536 {
537 get<1>(align) = std::remove_reference_t<decltype(get<1>(align))>{}; // empty container
538 }
539 }
540 else
541 {
542 detail::consume(seq_stream);
543 }
544 }
545 else
546 {
547 read_forward_range_field(seq_stream, seq);
548
549 if constexpr (!detail::decays_to_ignore_v<align_type>)
550 {
551 if (!tmp_cigar_vector.empty()) // if no alignment info is given, the field::alignment should remain empty
552 {
553 assign_unaligned(get<1>(align),
554 seq | views::slice(static_cast<decltype(std::ranges::size(seq))>(offset_tmp),
555 std::ranges::size(seq) - soft_clipping_end));
556 }
557 }
558 }
559 }
560 else
561 {
562 std::ranges::next(std::ranges::begin(field_view)); // skip '*'
563 }
564
565 // Field 11: Quality
566 // -------------------------------------------------------------------------------------------------------------
567 auto const tab_or_end = is_char<'\t'> || is_char<'\r'> || is_char<'\n'>;
568 auto qual_view = stream_view | detail::take_until_or_throw(tab_or_end);
569 if constexpr (!detail::decays_to_ignore_v<qual_type>)
570 read_forward_range_field(qual_view, qual);
571 else
572 detail::consume(qual_view);
573
574 if constexpr (!detail::decays_to_ignore_v<seq_type> && !detail::decays_to_ignore_v<qual_type>)
575 {
576 if (std::ranges::distance(seq) != 0 && std::ranges::distance(qual) != 0 &&
577 std::ranges::distance(seq) != std::ranges::distance(qual))
578 {
579 throw format_error{detail::to_string("Sequence length (", std::ranges::distance(seq),
580 ") and quality length (", std::ranges::distance(qual),
581 ") must be the same.")};
582 }
583 }
584
585 // All remaining optional fields if any: SAM tags dictionary
586 // -------------------------------------------------------------------------------------------------------------
587 while (is_char<'\t'>(*std::ranges::begin(stream_view))) // read all tags if present
588 {
589 std::ranges::next(std::ranges::begin(stream_view)); // skip tab
590 auto stream_until_tab_or_end = stream_view | detail::take_until_or_throw(tab_or_end);
591 if constexpr (!detail::decays_to_ignore_v<tag_dict_type>)
592 read_sam_dict_field(stream_until_tab_or_end, tag_dict);
593 else
594 detail::consume(stream_until_tab_or_end);
595 }
596
597 detail::consume(stream_view | detail::take_until(!(is_char<'\r'> || is_char<'\n'>))); // consume new line
598
599 // DONE READING - wrap up
600 // -------------------------------------------------------------------------------------------------------------
601 // Alignment object construction
602 // Note that the query sequence in get<1>(align) has already been filled while reading Field 10.
603 if constexpr (!detail::decays_to_ignore_v<align_type>)
604 {
605 int32_t ref_idx{(ref_id_tmp.empty()/*unmapped read?*/) ? -1 : 0};
606
607 if constexpr (!detail::decays_to_ignore_v<ref_seqs_type>)
608 {
609 if (!ref_id_tmp.empty())
610 {
611 assert(header.ref_dict.count(ref_id_tmp) != 0); // taken care of in check_and_assign_ref_id()
612 ref_idx = header.ref_dict[ref_id_tmp]; // get index for reference sequence
613 }
614 }
615
616 construct_alignment(align, tmp_cigar_vector, ref_idx, ref_seqs, ref_offset_tmp, ref_length);
617 }
618
619 if constexpr (!detail::decays_to_ignore_v<cigar_type>)
620 std::swap(cigar_vector, tmp_cigar_vector);
621}
622
624template <typename stream_type,
625 typename header_type,
626 typename seq_type,
627 typename id_type,
628 typename ref_seq_type,
629 typename ref_id_type,
630 typename align_type,
631 typename qual_type,
632 typename mate_type,
633 typename tag_dict_type,
634 typename e_value_type,
635 typename bit_score_type>
636inline void format_sam::write_alignment_record(stream_type & stream,
637 sam_file_output_options const & options,
638 header_type && header,
639 seq_type && seq,
640 qual_type && qual,
641 id_type && id,
642 int32_t const offset,
643 ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
644 ref_id_type && ref_id,
645 std::optional<int32_t> ref_offset,
646 align_type && align,
647 std::vector<cigar> const & cigar_vector,
648 sam_flag const flag,
649 uint8_t const mapq,
650 mate_type && mate,
651 tag_dict_type && tag_dict,
652 e_value_type && SEQAN3_DOXYGEN_ONLY(e_value),
653 bit_score_type && SEQAN3_DOXYGEN_ONLY(bit_score))
654{
655 /* Note the following general things:
656 *
657 * - Given the SAM specifications, all fields may be empty
658 *
659 * - arithmetic values default to 0 while all others default to '*'
660 *
661 * - Because of the former, arithmetic values can be directly streamed
662 * into 'stream' as operator<< is defined for all arithmetic types
663 * and the default value (0) is also the SAM default.
664 *
665 * - All other non-arithmetic values need to be checked for emptiness
666 */
667
668 // ---------------------------------------------------------------------
669 // Type Requirements (as static asserts for user friendliness)
670 // ---------------------------------------------------------------------
671 static_assert((std::ranges::forward_range<seq_type> &&
673 "The seq object must be a std::ranges::forward_range over "
674 "letters that model seqan3::alphabet.");
675
676 static_assert((std::ranges::forward_range<id_type> &&
678 "The id object must be a std::ranges::forward_range over "
679 "letters that model seqan3::alphabet.");
680
681 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
682 {
683 static_assert((std::ranges::forward_range<ref_id_type> ||
684 std::integral<std::remove_reference_t<ref_id_type>> ||
685 detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>),
686 "The ref_id object must be a std::ranges::forward_range "
687 "over letters that model seqan3::alphabet.");
688
689 if constexpr (std::integral<std::remove_cvref_t<ref_id_type>> ||
690 detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>)
691 static_assert(!detail::decays_to_ignore_v<header_type>,
692 "If you give indices as reference id information the header must also be present.");
693 }
694
696 "The align object must be a std::pair of two ranges whose "
697 "value_type is comparable to seqan3::gap");
698
699 static_assert((std::tuple_size_v<std::remove_cvref_t<align_type>> == 2 &&
700 std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<0>(align))>> &&
701 std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<1>(align))>>),
702 "The align object must be a std::pair of two ranges whose "
703 "value_type is comparable to seqan3::gap");
704
705 static_assert((std::ranges::forward_range<qual_type> &&
707 "The qual object must be a std::ranges::forward_range "
708 "over letters that model seqan3::alphabet.");
709
711 "The mate object must be a std::tuple of size 3 with "
712 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
713 "2) a std::integral or std::optional<std::integral>, and "
714 "3) a std::integral.");
715
716 static_assert(((std::ranges::forward_range<decltype(std::get<0>(mate))> ||
717 std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>> ||
718 detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<0>(mate))>, std::optional>) &&
719 (std::integral<std::remove_cvref_t<decltype(std::get<1>(mate))>> ||
720 detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<1>(mate))>, std::optional>) &&
721 std::integral<std::remove_cvref_t<decltype(std::get<2>(mate))>>),
722 "The mate object must be a std::tuple of size 3 with "
723 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
724 "2) a std::integral or std::optional<std::integral>, and "
725 "3) a std::integral.");
726
727 if constexpr (std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>> ||
728 detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<0>(mate))>, std::optional>)
729 static_assert(!detail::decays_to_ignore_v<header_type>,
730 "If you give indices as mate reference id information the header must also be present.");
731
732 static_assert(std::same_as<std::remove_cvref_t<tag_dict_type>, sam_tag_dictionary>,
733 "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
734
735 // ---------------------------------------------------------------------
736 // logical Requirements
737 // ---------------------------------------------------------------------
738 if constexpr (!detail::decays_to_ignore_v<header_type> &&
739 !detail::decays_to_ignore_v<ref_id_type> &&
740 !std::integral<std::remove_reference_t<ref_id_type>> &&
741 !detail::is_type_specialisation_of_v<std::remove_reference_t<ref_id_type>, std::optional>)
742 {
743
744 if (options.sam_require_header && !std::ranges::empty(ref_id))
745 {
746 auto id_it = header.ref_dict.end();
747
748 if constexpr (std::ranges::contiguous_range<decltype(ref_id)> &&
749 std::ranges::sized_range<decltype(ref_id)> &&
750 std::ranges::borrowed_range<decltype(ref_id)>)
751 {
752 id_it = header.ref_dict.find(std::span{std::ranges::data(ref_id), std::ranges::size(ref_id)});
753 }
754 else
755 {
756 using header_ref_id_type = std::remove_reference_t<decltype(header.ref_ids()[0])>;
757
759 "The ref_id type is not convertible to the reference id information stored in the "
760 "reference dictionary of the header object.");
761
762 id_it = header.ref_dict.find(ref_id);
763 }
764
765 if (id_it == header.ref_dict.end()) // no reference id matched
766 throw format_error{detail::to_string("The ref_id '", ref_id, "' was not in the list of references:",
767 header.ref_ids())};
768 }
769 }
770
771 if (ref_offset.has_value() && (ref_offset.value() + 1) < 0)
772 throw format_error{"The ref_offset object must be an std::integral >= 0."};
773
774 // ---------------------------------------------------------------------
775 // Writing the Header on first call
776 // ---------------------------------------------------------------------
777 if constexpr (!detail::decays_to_ignore_v<header_type>)
778 {
779 if (options.sam_require_header && !header_was_written)
780 {
781 write_header(stream, options, header);
782 header_was_written = true;
783 }
784 }
785
786 // ---------------------------------------------------------------------
787 // Writing the Record
788 // ---------------------------------------------------------------------
789
790 detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
791 constexpr char separator{'\t'};
792
793 write_range_or_asterisk(stream_it, id);
794 *stream_it = separator;
795
796 stream_it.write_number(static_cast<uint16_t>(flag));
797 *stream_it = separator;
798
799 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
800 {
801 if constexpr (std::integral<std::remove_reference_t<ref_id_type>>)
802 {
803 write_range_or_asterisk(stream_it, (header.ref_ids())[ref_id]);
804 }
805 else if constexpr (detail::is_type_specialisation_of_v<std::remove_reference_t<ref_id_type>, std::optional>)
806 {
807 if (ref_id.has_value())
808 write_range_or_asterisk(stream_it, (header.ref_ids())[ref_id.value()]);
809 else
810 *stream_it = '*';
811 }
812 else
813 {
814 write_range_or_asterisk(stream_it, ref_id);
815 }
816 }
817 else
818 {
819 *stream_it = '*';
820 }
821
822 *stream_it = separator;
823
824 // SAM is 1 based, 0 indicates unmapped read if optional is not set
825 stream_it.write_number(ref_offset.value_or(-1) + 1);
826 *stream_it = separator;
827
828 stream_it.write_number(static_cast<unsigned>(mapq));
829 *stream_it = separator;
830
831 if (!std::ranges::empty(cigar_vector))
832 {
833 for (auto & c : cigar_vector) //TODO THIS IS PROBABLY TERRIBLE PERFORMANCE_WISE
834 stream_it.write_range(c.to_string());
835 }
836 else if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align)))
837 {
838 // compute possible distance from alignment end to sequence end
839 // which indicates soft clipping at the end.
840 // This should be replace by a free count_gaps function for
841 // aligned sequences which is more efficient if possible.
842 size_t off_end{std::ranges::size(seq) - offset};
843 for (auto chr : get<1>(align))
844 if (chr == gap{})
845 ++off_end;
846
847 // Might happen if get<1>(align) doesn't correspond to the reference.
848 assert(off_end >= std::ranges::size(get<1>(align)));
849 off_end -= std::ranges::size(get<1>(align));
850
851 write_range_or_asterisk(stream_it, detail::get_cigar_string(align, offset, off_end));
852 }
853 else
854 {
855 *stream_it = '*';
856 }
857
858 *stream_it = separator;
859
860 if constexpr (std::integral<std::remove_reference_t<decltype(get<0>(mate))>>)
861 {
862 write_range_or_asterisk(stream_it, (header.ref_ids())[get<0>(mate)]);
863 }
864 else if constexpr (detail::is_type_specialisation_of_v<std::remove_reference_t<decltype(get<0>(mate))>, std::optional>)
865 {
866 if (get<0>(mate).has_value())
867 // value_or(0) instead of value() (which is equivalent here) as a
868 // workaround for a ubsan false-positive in GCC8: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90058
869 write_range_or_asterisk(stream_it, header.ref_ids()[get<0>(mate).value_or(0)]);
870 else
871 *stream_it = '*';
872 }
873 else
874 {
875 write_range_or_asterisk(stream_it, get<0>(mate));
876 }
877
878 *stream_it = separator;
879
880 if constexpr (detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(get<1>(mate))>, std::optional>)
881 {
882 // SAM is 1 based, 0 indicates unmapped read if optional is not set
883 stream_it.write_number(get<1>(mate).value_or(-1) + 1);
884 *stream_it = separator;
885 }
886 else
887 {
888 stream_it.write_number(get<1>(mate));
889 *stream_it = separator;
890 }
891
892 stream_it.write_number(get<2>(mate));
893 *stream_it = separator;
894
895 write_range_or_asterisk(stream_it, seq);
896 *stream_it = separator;
897
898 write_range_or_asterisk(stream_it, qual);
899
900 write_tag_fields(stream_it, tag_dict, separator);
901
902 stream_it.write_end_of_line(options.add_carriage_return);
903}
904
905
923template <typename stream_view_type, arithmetic value_type>
924inline void format_sam::read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
925 stream_view_type && stream_view,
926 value_type value)
927{
928 std::vector<value_type> tmp_vector;
929 while (std::ranges::begin(stream_view) != ranges::end(stream_view)) // not fully consumed yet
930 {
931 read_arithmetic_field(stream_view | detail::take_until(is_char<','>), value);
932 tmp_vector.push_back(value);
933
934 if (is_char<','>(*std::ranges::begin(stream_view)))
935 std::ranges::next(std::ranges::begin(stream_view)); // skip ','
936 }
937 variant = std::move(tmp_vector);
938}
939
953template <typename stream_view_type>
954inline void format_sam::read_sam_byte_vector(seqan3::detail::sam_tag_variant & variant,
955 stream_view_type && stream_view)
956{
957 std::vector<std::byte> tmp_vector;
958 std::byte value;
959
960 while (std::ranges::begin(stream_view) != ranges::end(stream_view)) // not fully consumed yet
961 {
962 try
963 {
964 read_byte_field(stream_view | detail::take_exactly_or_throw(2), value);
965 }
966 catch (std::exception const & e)
967 {
968 throw format_error{"Hexadecimal tag has an uneven number of digits!"};
969 }
970
971 tmp_vector.push_back(value);
972 }
973
974 variant = std::move(tmp_vector);
975}
976
994template <typename stream_view_type>
995inline void format_sam::read_sam_dict_field(stream_view_type && stream_view, sam_tag_dictionary & target)
996{
997 /* Every SAM tag has the format "[TAG]:[TYPE_ID]:[VALUE]", where TAG is a two letter
998 name tag which is converted to a unique integer identifier and TYPE_ID is one character in [A,i,Z,H,B,f]
999 describing the type for the upcoming VALUES. If TYPE_ID=='B' it signals an array of comma separated
1000 VALUE's and the inner value type is identified by the character following ':', one of [cCsSiIf].
1001 */
1002 uint16_t tag = static_cast<uint16_t>(*std::ranges::begin(stream_view)) << 8;
1003 std::ranges::next(std::ranges::begin(stream_view)); // skip char read before
1004 tag += static_cast<uint16_t>(*std::ranges::begin(stream_view));
1005 std::ranges::next(std::ranges::begin(stream_view)); // skip char read before
1006 std::ranges::next(std::ranges::begin(stream_view)); // skip ':'
1007 char type_id = *std::ranges::begin(stream_view);
1008 std::ranges::next(std::ranges::begin(stream_view)); // skip char read before
1009 std::ranges::next(std::ranges::begin(stream_view)); // skip ':'
1010
1011 switch (type_id)
1012 {
1013 case 'A' : // char
1014 {
1015 target[tag] = static_cast<char>(*std::ranges::begin(stream_view));
1016 std::ranges::next(std::ranges::begin(stream_view)); // skip char that has been read
1017 break;
1018 }
1019 case 'i' : // int32_t
1020 {
1021 int32_t tmp;
1022 read_arithmetic_field(stream_view, tmp);
1023 target[tag] = tmp;
1024 break;
1025 }
1026 case 'f' : // float
1027 {
1028 float tmp;
1029 read_arithmetic_field(stream_view, tmp);
1030 target[tag] = tmp;
1031 break;
1032 }
1033 case 'Z' : // string
1034 {
1035 target[tag] = stream_view | views::to<std::string>;
1036 break;
1037 }
1038 case 'H' :
1039 {
1040 read_sam_byte_vector(target[tag], stream_view);
1041 break;
1042 }
1043 case 'B' : // Array. Value type depends on second char [cCsSiIf]
1044 {
1045 char array_value_type_id = *std::ranges::begin(stream_view);
1046 std::ranges::next(std::ranges::begin(stream_view)); // skip char read before
1047 std::ranges::next(std::ranges::begin(stream_view)); // skip first ','
1048
1049 switch (array_value_type_id)
1050 {
1051 case 'c' : // int8_t
1052 read_sam_dict_vector(target[tag], stream_view, int8_t{});
1053 break;
1054 case 'C' : // uint8_t
1055 read_sam_dict_vector(target[tag], stream_view, uint8_t{});
1056 break;
1057 case 's' : // int16_t
1058 read_sam_dict_vector(target[tag], stream_view, int16_t{});
1059 break;
1060 case 'S' : // uint16_t
1061 read_sam_dict_vector(target[tag], stream_view, uint16_t{});
1062 break;
1063 case 'i' : // int32_t
1064 read_sam_dict_vector(target[tag], stream_view, int32_t{});
1065 break;
1066 case 'I' : // uint32_t
1067 read_sam_dict_vector(target[tag], stream_view, uint32_t{});
1068 break;
1069 case 'f' : // float
1070 read_sam_dict_vector(target[tag], stream_view, float{});
1071 break;
1072 default:
1073 throw format_error{std::string("The first character in the numerical ") +
1074 "id of a SAM tag must be one of [cCsSiIf] but '" + array_value_type_id +
1075 "' was given."};
1076 }
1077 break;
1078 }
1079 default:
1080 throw format_error{std::string("The second character in the numerical id of a "
1081 "SAM tag must be one of [A,i,Z,H,B,f] but '") + type_id + "' was given."};
1082 }
1083}
1084
1092template <typename stream_it_t, std::ranges::forward_range field_type>
1093inline void format_sam::write_range_or_asterisk(stream_it_t & stream_it, field_type && field_value)
1094{
1095 if (std::ranges::empty(field_value))
1096 {
1097 *stream_it = '*';
1098 }
1099 else
1100 {
1101 if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<field_type>>, char>)
1102 stream_it.write_range(field_value);
1103 else // convert from alphabets to their character representation
1104 stream_it.write_range(field_value | views::to_char);
1105 }
1106}
1107
1114template <typename stream_it_t>
1115inline void format_sam::write_range_or_asterisk(stream_it_t & stream_it, char const * const field_value)
1116{
1117 write_range_or_asterisk(stream_it, std::string_view{field_value});
1118}
1119
1127template <typename stream_it_t>
1128inline void format_sam::write_tag_fields(stream_it_t & stream_it, sam_tag_dictionary const & tag_dict, char const separator)
1129{
1130 auto const stream_variant_fn = [&stream_it] (auto && arg) // helper to print an std::variant
1131 {
1132 using T = std::remove_cvref_t<decltype(arg)>;
1133
1134 if constexpr (std::ranges::input_range<T>)
1135 {
1136 if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<T>>, char>)
1137 {
1138 stream_it.write_range(arg);
1139 }
1140 else if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<T>>, std::byte>)
1141 {
1142 if (!std::ranges::empty(arg))
1143 {
1144 stream_it.write_number(std::to_integer<uint8_t>(*std::ranges::begin(arg)));
1145
1146 for (auto && elem : arg | std::views::drop(1))
1147 {
1148 *stream_it = ',';
1149 stream_it.write_number(std::to_integer<uint8_t>(elem));
1150 }
1151 }
1152 }
1153 else
1154 {
1155 if (!std::ranges::empty(arg))
1156 {
1157 stream_it.write_number(*std::ranges::begin(arg));
1158
1159 for (auto && elem : arg | std::views::drop(1))
1160 {
1161 *stream_it = ',';
1162 stream_it.write_number(elem);
1163 }
1164 }
1165 }
1166 }
1167 else if constexpr (std::same_as<std::remove_cvref_t<T>, char>)
1168 {
1169 *stream_it = arg;
1170 }
1171 else // number
1172 {
1173 stream_it.write_number(arg);
1174 }
1175 };
1176
1177 for (auto & [tag, variant] : tag_dict)
1178 {
1179 *stream_it = separator;
1180
1181 char const char0 = tag / 256;
1182 char const char1 = tag % 256;
1183
1184 *stream_it = char0;
1185 *stream_it = char1;
1186 *stream_it = ':';
1187 *stream_it = detail::sam_tag_type_char[variant.index()];
1188 *stream_it = ':';
1189
1190 if (detail::sam_tag_type_char_extra[variant.index()] != '\0')
1191 {
1192 *stream_it = detail::sam_tag_type_char_extra[variant.index()];
1193 *stream_it = ',';
1194 }
1195
1196 std::visit(stream_variant_fn, variant);
1197 }
1198}
1199
1200} // namespace seqan3
Core alphabet concept and free function/type trait wrappers.
T begin(T... args)
The SAM format (tag).
Definition: format_sam.hpp:115
~format_sam()=default
Defaulted.
format_sam & operator=(format_sam const &)=default
Defaulted.
void read_alignment_record(stream_type &stream, sam_file_input_options< seq_legal_alph_type > const &options, ref_seqs_type &ref_seqs, sam_file_header< ref_ids_type > &header, seq_type &seq, qual_type &qual, id_type &id, offset_type &offset, ref_seq_type &ref_seq, ref_id_type &ref_id, ref_offset_type &ref_offset, align_type &align, cigar_type &cigar_vector, flag_type &flag, mapq_type &mapq, mate_type &mate, tag_dict_type &tag_dict, e_value_type &e_value, bit_score_type &bit_score)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_sam.hpp:363
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_sam.hpp:132
void write_sequence_record(stream_type &stream, sequence_file_output_options const &options, seq_type &&sequence, id_type &&id, qual_type &&qualities)
Write the given fields to the specified stream.
Definition: format_sam.hpp:312
format_sam(format_sam &&)=default
Defaulted.
format_sam & operator=(format_sam &&)=default
Defaulted.
void read_sequence_record(stream_type &stream, sequence_file_input_options< seq_legal_alph_type > const &options, seq_type &sequence, id_type &id, qual_type &qualities)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_sam.hpp:282
void write_alignment_record(stream_type &stream, sam_file_output_options const &options, header_type &&header, seq_type &&seq, qual_type &&qual, id_type &&id, int32_t const offset, ref_seq_type &&ref_seq, ref_id_type &&ref_id, std::optional< int32_t > ref_offset, align_type &&align, std::vector< cigar > const &cigar_vector, sam_flag const flag, uint8_t const mapq, mate_type &&mate, tag_dict_type &&tag_dict, e_value_type &&e_value, bit_score_type &&bit_score)
Write the given fields to the specified stream.
Definition: format_sam.hpp:636
format_sam()=default
Defaulted.
format_sam(format_sam const &)=default
Defaulted.
The alphabet of a gap character '-'.
Definition: gap.hpp:39
Stores the header information of alignment files.
Definition: header.hpp:32
std::unordered_map< key_type, int32_t, std::hash< key_type >, detail::view_equality_fn > ref_dict
The mapping of reference id to position in the ref_ids() range and the ref_id_info range.
Definition: header.hpp:157
ref_ids_type & ref_ids()
The range of reference ids.
Definition: header.hpp:118
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:334
Provides seqan3::detail::fast_ostreambuf_iterator.
Provides the seqan3::format_sam_base that can be inherited from.
auto const to_char
A view that calls seqan3::to_char() on each element in the input range.
Definition: to_char.hpp:63
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: sam_flag.hpp:74
@ none
None of the flags below are set.
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
constexpr auto is_space
Checks whether c is a space character.
Definition: predicate.hpp:128
typename decltype(detail::split_after< i >(list_t{}))::second_type drop
Return a seqan3::type_list of the types in the input type list, except the first n.
Definition: traits.hpp:388
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: traits.hpp:471
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:151
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:183
Provides the seqan3::sam_file_header class.
T index(T... args)
The generic alphabet concept that covers most data types used in ranges.
Resolves to std::ranges::implicitly_convertible_to<type1, type2>(). <dl class="no-api">This entity i...
A more refined container concept than seqan3::container.
The generic concept for a (biological) sequence.
Whether a type behaves like a tuple.
Auxiliary functions for the alignment IO.
Provides seqan3::detail::istreambuf.
The main SeqAn3 namespace.
Definition: cigar_operation_table.hpp:2
T push_back(T... args)
Adaptations of concepts from the Ranges TS.
Provides seqan3::sam_file_input_format and auxiliary classes.
Provides seqan3::sam_file_output_options.
Provides helper data structures for the seqan3::sam_file_output.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
Provides seqan3::sequence_file_input_format and auxiliary classes.
Provides seqan3::sequence_file_output_options.
Provides seqan3::views::slice.
Thrown if information given to output format didn't match expectations.
Definition: exception.hpp:91
Thrown if there is a parse error, such as reading an unexpected character from an input stream.
Definition: exception.hpp:48
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:24
The options type defines various option members that influence the behavior of all or some formats.
Definition: output_options.hpp:23
bool add_carriage_return
The default plain text line-ending is "\n", but on Windows an additional carriage return is recommend...
Definition: output_options.hpp:27
bool sam_require_header
Whether to require a header for SAM files.
Definition: output_options.hpp:41
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:25
bool truncate_ids
Read the ID string only up until the first whitespace character.
Definition: input_options.hpp:27
The options type defines various option members that influence the behaviour of all or some formats.
Definition: output_options.hpp:23
T swap(T... args)
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
T tie(T... args)
Provides seqan3::views::to.
Provides seqan3::views::to_char.
Provides traits to inspect some information of a type, for example its name.
Provides seqan3::tuple_like.
T visit(T... args)