@@ -532,13 +532,20 @@ class DecoderImpl : public TranscoderBase {
532
532
const char *ProcessLine (char *input_line) override ;
533
533
534
534
private:
535
- void add_missing_fields (const char *entry, int field_count , string &ans);
535
+ void add_missing_fields (const char *entry, int n_alt , string &ans);
536
536
537
+ // temp buffers used in ProcessLine (to reduce allocations)
537
538
vector<string> dense_entries_;
538
539
OStringStream buffer_;
539
540
540
541
bool with_missing_fields_;
541
- int format_field_count_ = 0 ;
542
+ string format_;
543
+ vector<string> format_split_;
544
+ int iAD_ = -1 , iPL_ = -1 ;
545
+ // temp buffers used in add_missing_fields (to reduce allocations)
546
+ OStringStream format_buffer_;
547
+ string entry_copy_;
548
+ vector<char *> entry_fields_;
542
549
};
543
550
544
551
const char *DecoderImpl::ProcessLine (char *input_line) {
@@ -565,6 +572,23 @@ const char *DecoderImpl::ProcessLine(char *input_line) {
565
572
fail (" Invalid project VCF: fewer than 10 columns" );
566
573
}
567
574
575
+ int n_alt = -1 ;
576
+ if (with_missing_fields_) {
577
+ // count n_alt for use in missing fields with Number={A,G,R}
578
+ n_alt = 1 ;
579
+ for (char *alt = tokens[4 ]; *alt; alt++) {
580
+ if (*alt == ' ,' ) {
581
+ n_alt++;
582
+ }
583
+ }
584
+ if (n_alt != 1 ) {
585
+ // FIXME: handling multiallelic sites will require add_missing_fields() even on sparse
586
+ // entries
587
+ fail (" --with-missing-fields only works with biallelic pVCF for now"
588
+ " ; try piping output through bcftools instead" );
589
+ }
590
+ }
591
+
568
592
// Figure out number of dense columns, the number of columns on the first line
569
593
uint64_t N = dense_entries_.empty () ? (tokens.size () - 9 ) : dense_entries_.size ();
570
594
if (dense_entries_.empty ()) {
@@ -574,7 +598,6 @@ const char *DecoderImpl::ProcessLine(char *input_line) {
574
598
assert (dense_entries_.size () == N);
575
599
576
600
// Pass through first nine columns
577
- int format_field_count = 1 ;
578
601
buffer_.Clear ();
579
602
buffer_ << tokens[0 ];
580
603
for (int i = 1 ; i < 9 ; i++) {
@@ -592,15 +615,23 @@ const char *DecoderImpl::ProcessLine(char *input_line) {
592
615
continue ;
593
616
}
594
617
} else if (i == 8 && with_missing_fields_) {
595
- // Count FORMAT fields for --with-missing-fields
596
- for (char *FORMAT = tokens[8 ]; *FORMAT != 0 ; FORMAT++) {
597
- if (*FORMAT == ' :' ) {
598
- format_field_count++;
618
+ if (format_.empty ()) {
619
+ // one-time initialization
620
+ format_ = tokens[8 ];
621
+ string format_copy = format_;
622
+ vector<char *> format_split;
623
+ split (format_copy, ' :' , back_inserter (format_split));
624
+ for (int j = 0 ; j < format_split.size (); j++) {
625
+ char *s = format_split[j];
626
+ if (!strcmp (s, " AD" )) {
627
+ iAD_ = j;
628
+ } else if (!strcmp (s, " PL" )) {
629
+ iPL_ = j;
630
+ }
631
+ format_split_.push_back (s);
599
632
}
600
633
}
601
- if (format_field_count_ <= 0 ) {
602
- format_field_count_ = format_field_count;
603
- } else if (format_field_count != format_field_count_) {
634
+ if (format_ != tokens[8 ]) {
604
635
fail (
605
636
" --with-missing-fields is unsuitable when pVCF lines have varying field FORMATs"
606
637
" ; try piping output through bcftools instead" );
@@ -617,14 +648,12 @@ const char *DecoderImpl::ProcessLine(char *input_line) {
617
648
fail (" empty cell" );
618
649
} else if (*t != ' "' ) {
619
650
// Dense entry - remember it and copy it to the output
620
- // TODO: Perhaps fill QC fields with missing values (.) if they were squeezed out.
621
- // The VCF spec does however say "Trailing fields can be dropped"
622
651
if (dense_cursor >= N) {
623
652
fail (" Greater-than-expected number of columns implied by sparse encoding" );
624
653
}
625
654
string &dense_entry = dense_entries_[dense_cursor++];
626
655
if (with_missing_fields_) {
627
- add_missing_fields (t, format_field_count , dense_entry);
656
+ add_missing_fields (t, n_alt , dense_entry);
628
657
} else {
629
658
dense_entry = t;
630
659
}
@@ -678,19 +707,33 @@ const char *DecoderImpl::ProcessLine(char *input_line) {
678
707
return buffer_.Get ();
679
708
}
680
709
681
- // Add trailing missing fields to entry (--with-missing-fields); return tmpstr.c_str() or entry
682
- // itself (if already complete).
683
- void DecoderImpl::add_missing_fields (const char *entry, int field_count, string &ans) {
684
- int entry_field_count = 1 ;
685
- for (char *cursor = const_cast <char *>(entry); *cursor != 0 ; cursor++) {
686
- if (*cursor == ' :' ) {
687
- entry_field_count++;
710
+ // Add trailing missing fields to entry (--with-missing-fields)
711
+ // Most missing fields are just represented by . except for AD and PL, which we pad with . to the
712
+ // correct vector length. (In principle we should do that for any Number={A,G,R} field, but this
713
+ // suffices for our practical need for this feature.)
714
+ void DecoderImpl::add_missing_fields (const char *entry, int n_alt, string &ans) {
715
+ format_buffer_.Clear ();
716
+ entry_copy_ = entry;
717
+ entry_fields_.clear ();
718
+ split (entry_copy_, ' :' , back_inserter (entry_fields_));
719
+ for (int i = 0 ; i < format_split_.size (); i++) {
720
+ bool present = i < entry_fields_.size ();
721
+ const char *field = present ? entry_fields_[i] : nullptr ;
722
+ if (i > 0 ) {
723
+ format_buffer_ << ' :' ;
724
+ }
725
+ if (i == iAD_ && (!present || !strcmp (field, " ." ))) {
726
+ // FIXME: handle multiallelic. Meaning we need to run this on sparse entries too.
727
+ // const int nAD = n_alt + 1;
728
+ format_buffer_ << " .,." ;
729
+ } else if (i == iPL_ && (!present || !strcmp (field, " ." ))) {
730
+ // const int nPL = (n_alt + 1) * (n_alt + 2) / 2;
731
+ format_buffer_ << " .,.,." ;
732
+ } else {
733
+ format_buffer_ << (present ? field : " ." );
688
734
}
689
735
}
690
- ans = entry;
691
- while (entry_field_count++ < field_count) {
692
- ans += " :." ;
693
- }
736
+ ans = format_buffer_.Get ();
694
737
}
695
738
696
739
unique_ptr<Transcoder> NewDecoder (bool with_missing_fields) {
0 commit comments