Skip to content

Commit b25626d

Browse files
authored
--with-missing-fields: pad AD & PL to expected vector length (#29)
For now all sites must be biallelic. (Can be generalized in the future.)
1 parent 53ecc90 commit b25626d

File tree

1 file changed

+67
-24
lines changed

1 file changed

+67
-24
lines changed

src/spVCF.cc

+67-24
Original file line numberDiff line numberDiff line change
@@ -532,13 +532,20 @@ class DecoderImpl : public TranscoderBase {
532532
const char *ProcessLine(char *input_line) override;
533533

534534
private:
535-
void add_missing_fields(const char *entry, int field_count, string &ans);
535+
void add_missing_fields(const char *entry, int n_alt, string &ans);
536536

537+
// temp buffers used in ProcessLine (to reduce allocations)
537538
vector<string> dense_entries_;
538539
OStringStream buffer_;
539540

540541
bool with_missing_fields_;
541-
int format_field_count_ = 0;
542+
string format_;
543+
vector<string> format_split_;
544+
int iAD_ = -1, iPL_ = -1;
545+
// temp buffers used in add_missing_fields (to reduce allocations)
546+
OStringStream format_buffer_;
547+
string entry_copy_;
548+
vector<char *> entry_fields_;
542549
};
543550

544551
const char *DecoderImpl::ProcessLine(char *input_line) {
@@ -565,6 +572,23 @@ const char *DecoderImpl::ProcessLine(char *input_line) {
565572
fail("Invalid project VCF: fewer than 10 columns");
566573
}
567574

575+
int n_alt = -1;
576+
if (with_missing_fields_) {
577+
// count n_alt for use in missing fields with Number={A,G,R}
578+
n_alt = 1;
579+
for (char *alt = tokens[4]; *alt; alt++) {
580+
if (*alt == ',') {
581+
n_alt++;
582+
}
583+
}
584+
if (n_alt != 1) {
585+
// FIXME: handling multiallelic sites will require add_missing_fields() even on sparse
586+
// entries
587+
fail("--with-missing-fields only works with biallelic pVCF for now"
588+
"; try piping output through bcftools instead");
589+
}
590+
}
591+
568592
// Figure out number of dense columns, the number of columns on the first line
569593
uint64_t N = dense_entries_.empty() ? (tokens.size() - 9) : dense_entries_.size();
570594
if (dense_entries_.empty()) {
@@ -574,7 +598,6 @@ const char *DecoderImpl::ProcessLine(char *input_line) {
574598
assert(dense_entries_.size() == N);
575599

576600
// Pass through first nine columns
577-
int format_field_count = 1;
578601
buffer_.Clear();
579602
buffer_ << tokens[0];
580603
for (int i = 1; i < 9; i++) {
@@ -592,15 +615,23 @@ const char *DecoderImpl::ProcessLine(char *input_line) {
592615
continue;
593616
}
594617
} else if (i == 8 && with_missing_fields_) {
595-
// Count FORMAT fields for --with-missing-fields
596-
for (char *FORMAT = tokens[8]; *FORMAT != 0; FORMAT++) {
597-
if (*FORMAT == ':') {
598-
format_field_count++;
618+
if (format_.empty()) {
619+
// one-time initialization
620+
format_ = tokens[8];
621+
string format_copy = format_;
622+
vector<char *> format_split;
623+
split(format_copy, ':', back_inserter(format_split));
624+
for (int j = 0; j < format_split.size(); j++) {
625+
char *s = format_split[j];
626+
if (!strcmp(s, "AD")) {
627+
iAD_ = j;
628+
} else if (!strcmp(s, "PL")) {
629+
iPL_ = j;
630+
}
631+
format_split_.push_back(s);
599632
}
600633
}
601-
if (format_field_count_ <= 0) {
602-
format_field_count_ = format_field_count;
603-
} else if (format_field_count != format_field_count_) {
634+
if (format_ != tokens[8]) {
604635
fail(
605636
"--with-missing-fields is unsuitable when pVCF lines have varying field FORMATs"
606637
"; try piping output through bcftools instead");
@@ -617,14 +648,12 @@ const char *DecoderImpl::ProcessLine(char *input_line) {
617648
fail("empty cell");
618649
} else if (*t != '"') {
619650
// Dense entry - remember it and copy it to the output
620-
// TODO: Perhaps fill QC fields with missing values (.) if they were squeezed out.
621-
// The VCF spec does however say "Trailing fields can be dropped"
622651
if (dense_cursor >= N) {
623652
fail("Greater-than-expected number of columns implied by sparse encoding");
624653
}
625654
string &dense_entry = dense_entries_[dense_cursor++];
626655
if (with_missing_fields_) {
627-
add_missing_fields(t, format_field_count, dense_entry);
656+
add_missing_fields(t, n_alt, dense_entry);
628657
} else {
629658
dense_entry = t;
630659
}
@@ -678,19 +707,33 @@ const char *DecoderImpl::ProcessLine(char *input_line) {
678707
return buffer_.Get();
679708
}
680709

681-
// Add trailing missing fields to entry (--with-missing-fields); return tmpstr.c_str() or entry
682-
// itself (if already complete).
683-
void DecoderImpl::add_missing_fields(const char *entry, int field_count, string &ans) {
684-
int entry_field_count = 1;
685-
for (char *cursor = const_cast<char *>(entry); *cursor != 0; cursor++) {
686-
if (*cursor == ':') {
687-
entry_field_count++;
710+
// Add trailing missing fields to entry (--with-missing-fields)
711+
// Most missing fields are just represented by . except for AD and PL, which we pad with . to the
712+
// correct vector length. (In principle we should do that for any Number={A,G,R} field, but this
713+
// suffices for our practical need for this feature.)
714+
void DecoderImpl::add_missing_fields(const char *entry, int n_alt, string &ans) {
715+
format_buffer_.Clear();
716+
entry_copy_ = entry;
717+
entry_fields_.clear();
718+
split(entry_copy_, ':', back_inserter(entry_fields_));
719+
for (int i = 0; i < format_split_.size(); i++) {
720+
bool present = i < entry_fields_.size();
721+
const char *field = present ? entry_fields_[i] : nullptr;
722+
if (i > 0) {
723+
format_buffer_ << ':';
724+
}
725+
if (i == iAD_ && (!present || !strcmp(field, "."))) {
726+
// FIXME: handle multiallelic. Meaning we need to run this on sparse entries too.
727+
// const int nAD = n_alt + 1;
728+
format_buffer_ << ".,.";
729+
} else if (i == iPL_ && (!present || !strcmp(field, "."))) {
730+
// const int nPL = (n_alt + 1) * (n_alt + 2) / 2;
731+
format_buffer_ << ".,.,.";
732+
} else {
733+
format_buffer_ << (present ? field : ".");
688734
}
689735
}
690-
ans = entry;
691-
while (entry_field_count++ < field_count) {
692-
ans += ":.";
693-
}
736+
ans = format_buffer_.Get();
694737
}
695738

696739
unique_ptr<Transcoder> NewDecoder(bool with_missing_fields) {

0 commit comments

Comments
 (0)