Skip to content

Commit 2d5d8c4

Browse files
patched genbank parser and builder to handle BASE COUNT. (#386)
1 parent f2310db commit 2d5d8c4

File tree

2 files changed

+33
-1
lines changed

2 files changed

+33
-1
lines changed

data/sample.gbk

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ FEATURES Location/Qualifiers
7777
KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR
7878
RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK
7979
LISGDDKILNGVYSQYEEGESIFGSLF"
80+
BASE COUNT 67070277 a 48055043 c 48111528 g 67244164 t 18475410 n
8081
ORIGIN
8182
1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg
8283
61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct

io/genbank/genbank.go

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ type Meta struct {
5757
Origin string `json:"origin"`
5858
Locus Locus `json:"locus"`
5959
References []Reference `json:"references"`
60+
BaseCount []BaseCount `json:"base_count"`
6061
Other map[string]string `json:"other"`
6162
Name string `json:"name"`
6263
SequenceHash string `json:"sequence_hash"`
@@ -109,6 +110,12 @@ type Location struct {
109110
SubLocations []Location `json:"sub_locations"`
110111
}
111112

113+
// BaseCount holds one entry of a GenBank "BASE COUNT" line: a single base
// symbol together with the number reported for it on that line.
// A line such as "BASE COUNT 67070277 a 48055043 c" yields one BaseCount per
// count/base pair, appended in order to Meta.BaseCount.
114+
type BaseCount struct {
115+
Base string // base symbol as written after its count, e.g. "a", "c", "g", "t" or "n"
116+
Count int // integer count that precedes the base symbol on the line
117+
}
118+
112119
// Precompiled regular expressions:
113120
var (
114121
basePairRegex = regexp.MustCompile(` \d* \w{2} `)
@@ -315,6 +322,13 @@ func BuildMulti(sequences []Genbank) ([]byte, error) {
315322
gbkString.WriteString(BuildFeatureString(feature))
316323
}
317324

325+
if len(sequence.Meta.BaseCount) > 0 {
326+
gbkString.WriteString("BASE COUNT ")
327+
for _, baseCount := range sequence.Meta.BaseCount {
328+
gbkString.WriteString(strconv.Itoa(baseCount.Count) + " " + baseCount.Base + " ")
329+
}
330+
gbkString.WriteString("\n")
331+
}
318332
// start writing sequence section.
319333
gbkString.WriteString("ORIGIN\n")
320334

@@ -378,7 +392,7 @@ type parseLoopParameters struct {
378392
emptyAttribute bool
379393
sequenceBuilder strings.Builder
380394
parseStep string
381-
genbank Genbank // since we are scanning lines we need a Genbank struct to store the data outside the loop.// since we are scanning lines we need a Genbank struct to store the data outside the loop.
395+
genbank Genbank // since we are scanning lines we need a Genbank struct to store the data outside the loop.
382396
feature Feature
383397
features []Feature
384398
metadataTag string
@@ -484,6 +498,23 @@ func ParseMultiNth(r io.Reader, count int) ([]Genbank, error) {
484498
}
485499
case "features":
486500

501+
baseCountFlag := strings.Contains(line, "BASE COUNT") // example string for BASE COUNT: "BASE COUNT 67070277 a 48055043 c 48111528 g 67244164 t 18475410 n"
502+
if baseCountFlag {
503+
fields := strings.Fields(line)
504+
for countIndex := 2; countIndex < len(fields)-1; countIndex += 2 { // starts at two because we don't want to include "BASE COUNT" in our fields
505+
count, err := strconv.Atoi(fields[countIndex])
506+
if err != nil {
507+
return []Genbank{}, err
508+
}
509+
510+
baseCount := BaseCount{
511+
Base: fields[countIndex+1],
512+
Count: count,
513+
}
514+
parameters.genbank.Meta.BaseCount = append(parameters.genbank.Meta.BaseCount, baseCount)
515+
}
516+
break
517+
}
487518
// Switch to sequence parsing
488519
originFlag := strings.Contains(line, "ORIGIN") // we detect the beginning of the sequence with "ORIGIN"
489520
if originFlag {

0 commit comments

Comments
 (0)